In [1]:
import pandas as pd  # Load and manipulate data, provides data structure in form of data
import numpy as np   
import scipy as sp

## Reading CSV Files

###  1. Books Rating

In [2]:
data = pd.read_csv('BX-Book-Ratings.csv', sep = ';',header=0,names=["user","isbn","rating"],error_bad_lines=False)
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
#view data types
data.dtypes

user       int64
isbn      object
rating     int64
dtype: object

### 2. Books Information

In [4]:
books = pd.read_csv('BX-Books.csv', sep = ';',header=0,usecols=[0,1,2],index_col=0,names=["isbn","title","author"],error_bad_lines=False)
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


### 1. Function to get title and author  using ISBN No

In [5]:
'''function which will print the title and author for any given ISBN''' 
def bookMeta(isbn):
    title = books.at[isbn,"title"]
    author = books.at[isbn,"author"]
    return title, author
bookMeta("0002005018")

('Clara Callan', 'Richard Bruce Wright')

### 2. Function to get Favourite book using ratings

In [6]:
'''function which print top n fev books'''
def favBook(user,N):
    #pick the rows which are relivent to the user database i.e Filter data relivent to the user
    userRatings = data[data["user"]==user]

    #now sort the raings in descending order on the basis of ratings
    sortedRatings = pd.DataFrame.sort_values(userRatings,['rating'],ascending=[0])[:N]
    
    #apply metadata function to the entire ISBN column
    sortedRatings["title"] = sortedRatings["isbn"].apply(bookMeta) 
    
    return sortedRatings

In [7]:
'''file may contain inconsistency hence subset the data on basis of index which are present'''
data = data[data["isbn"].isin(books.index)] 

In [8]:
#function call
favBook(204622,5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


In [9]:
data.shape 

(1031175, 3)

# Constructing the Rating Matrix

In [10]:
userPerISBN = data.isbn.value_counts()#distinct isbn count 
userPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [11]:
userPerISBN.shape 

(270170,)

In [12]:
ISBNPerUser = data.user.value_counts() #unique user
ISBNPerUser.shape

(92107,)

In [13]:
'''Subeset the data for books read by more than 10 users'''
data = data[data["isbn"].isin(userPerISBN[userPerISBN>10].index)]

In [14]:
'''Now let's subset on the basis of users isbn '''
data = data[data["user"].isin(ISBNPerUser[ISBNPerUser>10].index)]

In [15]:
userItemRatingMatrix = pd.pivot_table(data, values='isbn',index=['user'],columns=['isbn'])

userRatingMatrix = userRatingMatrix.astype(np.int32)

KeyError: 'isbn'

In [None]:
userItemRatingMatrix.head(10)

In [18]:
''' Rating matrix generation'''
userItemRatingMatrix = data.pivot(index='user',columns = 'isbn',values = 'isbn')
user =userItemRatingMatrix.index
isnb = userItemRatingMatrix.columns

#userItemRatingMatrix = userItemRatingMatrix.astype(np.int32)
print(userItemRatingMatrix.shape)
userItemRatingMatrix.head()

(10706, 15451)


isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,2005018.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


# Find K-nearest Neighbour

1. Set up a function to compute distance between two users

In [19]:
''' Step by step process --> Grab matrix for user1'''
user1= 204622
user2 = 255489
userRat = userItemRatingMatrix.transpose()[user1]#row become column and column become row
userRat.head()#see the head data

isbn
0002005018    NaN
0002251760    NaN
0002259834    NaN
0002558122    NaN
0006480764    NaN
Name: 204622, dtype: object

In [20]:
'''Now repeate the same  for user2'''
user2Rat = userItemRatingMatrix.transpose()[user2]#row become colum and column become row
user2Rat.head() #See the head data

isbn
0002005018    NaN
0002251760    NaN
0002259834    NaN
0002558122    NaN
0006480764    NaN
Name: 255489, dtype: object

In [31]:
from scipy.spatial.distance import hamming

In [32]:
'''find the nearest distance using hamming distance'''
hamming(userRat,user2Rat)

0.9999352792699502

In [33]:
'''Lets grab it into a function''' 
def distance(user1, user2):
    try:
        user1Ratings = userItemRatingMatrix.transpose()[user1]
        user2Ratings = userItemRatingMatrix.transpose()[user2]
        distance = hamming(user1Ratings, user2Ratings)
    except:
        distance = np.NaN #NaN comes from numpy 
    return distance

In [34]:
# Functionn call
distance(204622,255489)

0.9999352792699502

# Find distance from active user

In [35]:
'''Remove the active user from users list'''
user = 204622
allUser = pd.DataFrame(userItemRatingMatrix.index) #user id's from all users
allUser = allUser[allUser.user!=user] #remove userID of active user
allUser.head()

Unnamed: 0,user
0,8
1,99
2,242
3,243
4,254


In [38]:
'''add new colum which is distance of active user from other'''
allUser['distance'] = allUser["user"].apply(lambda x: distance(user,x))

In [37]:
allUser.head()

Unnamed: 0,user,distance
0,8,1.0
1,99,1.0
2,242,0.999935
3,243,0.999935
4,254,1.0


## Average the rating using nearest neighbor

In [39]:
k =10
knearestUser = allUser.sort_values(['distance'],ascending="True")["user"][:k]

In [41]:
knearestUser

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

In [42]:
'''Lets write a function to wrap it together'''
def nearesrNeighbors(user, k=10):
    allUser = pd.DataFrame(userItemRatingMatrix.index) #get only user's ID and store it into allUser
    allUser = allUser[allUser.user!=user] #Remove active user from users list
    
    allUser["distance"] = allUser["user"].apply(lambda x : distance(user,x)) #apply lambda function to add DISTANCE column 
    #which is the distance of active user from other users
    
    KnearestUser = allUser.sort_values(["distance"],ascending=True)["user"][:k] #sortthe values in descending ord
    
    return KnearestUser

In [44]:
knearestUser = nearesrNeighbors(user)

In [45]:
knearestUser

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

1. Average the ratings of nearest neighbors for unread books
2. Sort in descending order
3. Pick the top N

Average the rating using nearest neighbor

In [43]:
NRatinngs = userItemRatingMatrix[userItemRatingMatrix.index.isin(knearestUser)]

In [44]:
NRatinngs

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7346,,,,,,,,,,,...,,,,,,,,,,
16795,,,,,,,,,,,...,,,,,,,,,,
48046,,,,,,,,,,,...,,,,,,,,,,
68555,,,,,,,,,,,...,,,,,,,,,,
82893,,,,,,,,,,,...,,,,,,,,,,
87555,,,,,,,,,,,...,,,,,,,,,,
140036,,,,,,,,,,,...,,,,,,,,,,
198711,,,,,,,,,,,...,,,,,,,,,,
232131,,,,,,,,,,,...,,,,,,,,,,
251422,,,,,,,,,,,...,,,,,,,,,,


In [48]:
avgRating = NRatinngs.apply(np.nanmean).dropna()
avgRating.head()

isbn
0007154615    1.5
0020125305    0.0
0020125607    0.0
0020198817    0.0
0020198906    8.0
dtype: float64

In [54]:
booksAlreadyRead = userItemRatingMatrix.transpose()[user].dropna().index

In [55]:
booksAlreadyRead

Index(['006016848X', '0060935464', '0140042598', '0140178724', '0142004278',
       '0380732238', '0385504209', '0425109720', '0425152898', '0440136482',
       '0440241162', '0451191145', '0451197127', '0553096060', '0671027360',
       '0671027387', '0671666258', '0688174574', '0743225708', '076790592X',
       '0785264280', '0786868716', '0802131867', '0802132952', '0971880107',
       '1853260045', '1853260126', '1853260207', '185326041X', '1878424114'],
      dtype='object', name='isbn')

In [57]:
avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]

In [58]:
N=3
topNisbn = avgRating.sort_values(ascending = False).index[:N]

In [59]:
pd.Series(topNisbn).apply(bookMeta)

0              (Love, Greg &amp; Lauren, Greg Manning)
1    (The Two Towers (The Lord of the Rings, Part 2...
2    (Harry Potter and the Sorcerer's Stone (Book 1...
Name: isbn, dtype: object

In [72]:
def topN(user,N=3):
    KnearestUser = nearesrNeighbors(user)
    NRatinngs = userItemRatingMatrix[userItemRatingMatrix.index.isin(knearestUser)]#get nearest neb for user
    avgRating = NRatinngs.apply(np.nanmean).dropna()
    
    booksAlreadyRead = userItemRatingMatrix.transpose()[user].dropna().index
    avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]
    
    topNisbn = avgRating.sort_values(ascending = False).index[:N]#sort desc ord
    return pd.Series(topNisbn).apply(bookMeta)

In [73]:
favBook(204813,10)

Unnamed: 0,user,isbn,rating,title
845417,204813,399149848,10,"(Birthright, Nora Roberts)"
845407,204813,385504209,10,"(The Da Vinci Code, Dan Brown)"
845382,204813,373218036,10,"(Truly, Madly Manhattan, Nora Roberts)"
845359,204813,142001805,10,"(The Eyre Affair: A Novel, Jasper Fforde)"
845431,204813,446527793,10,"(The Guardian, Nicholas Sparks)"
845416,204813,399149392,10,"(Chesapeake Blue (Quinn Brothers (Hardcover)),..."
845432,204813,446531332,9,"(Nights in Rodanthe, Nicholas Sparks)"
845434,204813,446606243,9,"(The Tenth Justice, Brad Meltzer)"
845451,204813,671027360,9,"(Angels &amp; Demons, Dan Brown)"
845433,204813,446532452,9,"(The Wedding, Nicholas Sparks)"


In [74]:
topN(204813,10)

  result = libreduction.compute_reduction(


0    (Dandelion Wine (Grand Master Editions), RAY B...
1                    (Range of Motion, Elizabeth Berg)
2                     (Brave New World, Aldous Huxley)
3    (The Dive From Clausen's Pier (Alex Awards), A...
4        (A Gracious Plenty : A Novel, SHERI REYNOLDS)
5     (All Things Bright and Beautiful, James Herriot)
6                        (Tending Roses, Lisa Wingate)
7    (Small Sacrifices: A True Story of Passion and...
8    (Wild at Heart: Discovering the Secret of a Ma...
9                    (Drums of Autumn, DIANA GABALDON)
Name: isbn, dtype: object