In [1]:
import pandas as pd  # Load and manipulate data, provides data structure in form of data
import numpy as np   # 
import scipy as sp

In [2]:
data = pd.read_csv('BX-Book-Ratings.csv', sep = ';',header=0,names=["user","isbn","rating"],error_bad_lines=False)
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
data.dtypes

user       int64
isbn      object
rating     int64
dtype: object

In [4]:
data.shape

(1149780, 3)

In [5]:
books = pd.read_csv('BX-Books.csv', sep = ';',usecols=[0,1,2],index_col=0,names=["isbn","title","author"],error_bad_lines=False)
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
ISBN,Book-Title,Book-Author
0195153448,Classical Mythology,Mark P. O. Morford
0002005018,Clara Callan,Richard Bruce Wright
0060973129,Decision in Normandy,Carlo D'Este
0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata


In [6]:
books.dtypes

title     object
author    object
dtype: object

In [7]:
books.isnull().sum()

title     0
author    1
dtype: int64

In [8]:
# function which will print the title and author for any given ISBN
def bookMeta(isbn):
    title = books.at[isbn,"title"]
    author = books.at[isbn,"author"]
    return title, author
print(bookMeta("0195153448"))

('Classical Mythology', 'Mark P. O. Morford')


In [9]:
# function which print top n fev books
def favBook(user,N):
    userRatings = data[data["user"]==user] #pic the rows which are relivent to the userdata base i.e Filter data relivent to the user
    sortedRatings = pd.DataFrame.sort_values(userRatings,['rating'],ascending=[0])[:N]#now sort the raings in ascending order on the basis of ratings
    sortedRatings["title"] = sortedRatings["isbn"].apply(bookMeta) #apply metadata function to the entire ISBN column
    return sortedRatings

In [10]:
data = data[data["isbn"].isin(books.index)] #file may inconsistent hence subset data on basis of index which are present

In [11]:
favBook(204622,5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


# Construct the Rating matrix

In [12]:
data.shape #1149780,3

(1031175, 3)

In [13]:
userPerISBN = data.isbn.value_counts()#distinct isbn count 
userPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [14]:
userPerISBN.shape #340556

(270170,)

In [15]:
ISBNPerUser = data.user.value_counts()#unique user
ISBNPerUser.shape#63912

(92107,)

In [16]:
data = data[data["isbn"].isin(userPerISBN[userPerISBN>10].index)]#subset the data books read by more than 10 users

In [17]:
data = data[data["user"].isin(ISBNPerUser[ISBNPerUser>10].index)]

In [18]:
userItemRatingMatrix = pd.pivot_table(data, values='rating',index=['user'],columns=['isbn'])#.fillna(0)
#userItemRatingMatrix.isbn  = userItemRatingMatrix.isbn.astype(np.int32)
#userItemRatingMatrix.rating = userItemRatingMatrix.rating.astype(np.int32)
userItemRatingMatrix.head(10)

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
383,,,,,,,,,,,...,,,,,,,,,,
388,,,,,,,,,,,...,,,,,,,,,,
408,,,,,,,,,,,...,,,,,,,,,,
424,,,,,,,,,,,...,,,,,,,,,,
446,,,,,,,,,,,...,,,,,,,,,,


In [19]:
userItemRatingMatrix = pd.pivot_table(data, values='rating',index=['user'],columns=['isbn'])#.fillna(0)
userItemRatingMatrix.head(10)

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
383,,,,,,,,,,,...,,,,,,,,,,
388,,,,,,,,,,,...,,,,,,,,,,
408,,,,,,,,,,,...,,,,,,,,,,
424,,,,,,,,,,,...,,,,,,,,,,
446,,,,,,,,,,,...,,,,,,,,,,


## Find K Nearest Neighbors

1. Set up a function to compute distance between two users

In [28]:
user1= 204622
user2 = 255489
userRat = userItemRatingMatrix.transpose()[user1]#row becomecolum and column become row
userRat.head()

isbn
0002005018   NaN
0002251760   NaN
0002259834   NaN
0002558122   NaN
0006480764   NaN
Name: 204622, dtype: float64

In [29]:
user2Rat = userItemRatingMatrix.transpose()[user2]#row become colum and column become row
user2Rat.head()

isbn
0002005018   NaN
0002251760   NaN
0002259834   NaN
0002558122   NaN
0006480764   NaN
Name: 255489, dtype: float64

In [30]:
from scipy.spatial.distance import hamming

In [31]:
hamming(userRat,user2Rat)

0.9999352792699502

Let's put all these steps in a function 

In [32]:
def distance(user1, user2):
    try:
        user1Ratings = userItemRatingMatrix.transpose()[user1]
        user2Ratings = userItemRatingMatrix.transpose()[user2]
        distance = hamming(user1Ratings, user2Ratings)
    except:
        distance = np.NaN #NaN comes from numpy 
    return distance

function call

In [33]:
distance(204622,255489)

0.9999352792699502

### Find distance from Active user

In [34]:
user = 204622
allUser = pd.DataFrame(userItemRatingMatrix.index)
allUser = allUser[allUser.user!=user]
allUser.head()

Unnamed: 0,user
0,8
1,99
2,242
3,243
4,254


In [35]:
allUser['distance'] = allUser["user"].apply(lambda x: distance(user,x))#add new colum which is distance of active user from other

In [36]:
allUser.head()

Unnamed: 0,user,distance
0,8,1.0
1,99,1.0
2,242,0.999935
3,243,0.999935
4,254,1.0


In [37]:
k =10
knearestUser = allUser.sort_values(['distance'],ascending="True")["user"][:k]

In [38]:
knearestUser

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

Let's Write a function to bunch all together

In [39]:
def nearesrNeighbors(user, k=10):
    allUser = pd.DataFrame(userItemRatingMatrix.index)
    allUser = allUser[allUser.user!=user]
    allUser["distance"] = allUser["user"].apply(lambda x : distance(user,x))
    KnearestUser = allUser.sort_values(["distance"],ascending=True)["user"][:k]
    return KnearestUser

In [40]:
KnearestUser= nearesrNeighbors(user)

In [41]:
knearestUser

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

1. Average the ratings of nearest neighbors for unread books
2. Sort in descending order
3. Pick the top N

Average the rating using nearest neighbor

In [42]:
NRatinngs = userItemRatingMatrix[userItemRatingMatrix.index.isin(knearestUser)]

In [43]:
NRatinngs

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7346,,,,,,,,,,,...,,,,,,,,,,
16795,,,,,,,,,,,...,,,,,,,,,,
48046,,,,,,,,,,,...,,,,,,,,,,
68555,,,,,,,,,,,...,,,,,,,,,,
82893,,,,,,,,,,,...,,,,,,,,,,
87555,,,,,,,,,,,...,,,,,,,,,,
140036,,,,,,,,,,,...,,,,,,,,,,
198711,,,,,,,,,,,...,,,,,,,,,,
232131,,,,,,,,,,,...,,,,,,,,,,
251422,,,,,,,,,,,...,,,,,,,,,,


In [44]:
avgRating = NRatinngs.apply(np.nanmean).dropna()
avgRating.head()

  result = libreduction.compute_reduction(


isbn
0007154615    1.5
0020125305    0.0
0020125607    0.0
0020198817    0.0
0020198906    8.0
dtype: float64

In [45]:
booksAlreadyRead = userItemRatingMatrix.transpose()[user].dropna().index

In [46]:
booksAlreadyRead

Index(['006016848X', '0060935464', '0140042598', '0140178724', '0142004278',
       '0380732238', '0385504209', '0425109720', '0425152898', '0440136482',
       '0440241162', '0451191145', '0451197127', '0553096060', '0671027360',
       '0671027387', '0671666258', '0688174574', '0743225708', '076790592X',
       '0785264280', '0786868716', '0802131867', '0802132952', '0971880107',
       '1853260045', '1853260126', '1853260207', '185326041X', '1878424114'],
      dtype='object', name='isbn')

In [47]:
avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]

In [48]:
N=3
topNisbn = avgRating.sort_values(ascending = False).index[:N]

In [49]:
pd.Series(topNisbn).apply(bookMeta)

0              (Love, Greg &amp; Lauren, Greg Manning)
1    (The Two Towers (The Lord of the Rings, Part 2...
2    (Harry Potter and the Sorcerer's Stone (Book 1...
Name: isbn, dtype: object