In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors


## Setul de date Book-Crossing

In [86]:
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding='latin-1')

users = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding='latin-1')

books = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding='latin-1')
print(books.shape)

#Remove duplicate rows for Book-Title
books = books.drop_duplicates(['Book-Title'], keep='last')
print(books.shape)

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


(271360, 8)
(242135, 8)


In [87]:
ratings_book_title = pd.merge(ratings, books, on='ISBN')
columns = ['Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M' , 'Image-URL-L']
ratings_book_title = ratings_book_title.drop(columns, axis=1)
ratings_book_title.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel


In [88]:
ratings_book_title = ratings_book_title.dropna(axis=0, subset=['Book-Title'])

In [99]:
ratings_book_title.groupby('ISBN')['Book-Rating'].count().reset_index().sort_values('Book-Rating', ascending=False)[:600]

Unnamed: 0,ISBN,Book-Rating
190062,0971880107,2502
33998,0316666343,1295
28695,0312195516,723
81365,0446672211,585
33927,0316601950,568
89370,0452282152,526
6578,0060930535,494
114245,0671021001,468
113762,0671003755,446
7320,0060976845,434


In [97]:
ratings_book_title.groupby('User-ID')['Book-Rating'].count().reset_index().sort_values('Book-Rating', ascending=False)[:600]

Unnamed: 0,User-ID,Book-Rating
2662,11676,8011
50550,198711,5371
25022,98391,4991
39207,153662,4582
8902,35859,4342
54195,212898,3515
71321,278418,3495
28187,110973,2353
59963,235105,2257
58743,230522,2133


Filtram setul de date astfel incat sa ramanem cu useri activi si carti evaluate intr-un numar cat mai mare, deoarece multe carti si multi useri nu au overit mai mult de 200 de rating-uri.

In [91]:
popularity_users_threshold = 200
filter_users = ratings_book_title['User-ID'].value_counts() > popularity_users_threshold
filter_users = filter_users[filter_users].index.tolist()

popularity_books_threshold = 200
filter_books = ratings_book_title['Book-Title'].value_counts() > popularity_books_threshold
filter_books = filter_books[filter_books].index.tolist()

df = ratings_book_title[ratings_book_title['User-ID'].isin(filter_users)]
df = ratings_book_title[ratings_book_title['Book-Title'].isin(filter_books)]
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
2588,276925,316666343,0,The Lovely Bones: A Novel
2589,277195,316666343,0,The Lovely Bones: A Novel
2590,277413,316666343,0,The Lovely Bones: A Novel
2591,277427,316666343,0,The Lovely Bones: A Novel
2592,277439,316666343,7,The Lovely Bones: A Novel


# kNN(k Nearest Neighbors)

kNN is a machine learning algorithm to find clusters of similar users based on common book ratings, and make predictions using the average rating of top-k nearest neighbors.

In [92]:
df_pivot = df.pivot(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0)
df_pivot.head()

User-ID,14,165,193,242,243,244,254,332,388,424,...,278535,278552,278582,278633,278645,278692,278755,278798,278832,278843
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Heartbreaking Work of Staggering Genius,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Is for Alibi (Kinsey Millhone Mysteries (Paperback)),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Thousand Acres (Ballantine Reader's Circle),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
A Walk in the Woods: Rediscovering America on the Appalachian Trail (Official Guides to the Appalachian Trail),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
American Gods,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
df_matrix = csr_matrix(df_pivot.values)

In [94]:
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [95]:
query_index = np.random.choice(df_pivot.shape[0])
distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

for i in range (0, len(distances.flatten())):
    if i == 0:
        print('Recomandari pentru \'{0}\':\n'.format(df_pivot.index[query_index]))
    else:
        print('{0}: \'{1}\''.format(i, df_pivot.index[indices.flatten()[i]]))

Recomandari pentru 'The Nanny Diaries: A Novel':

1: 'The Hours: A Novel'
2: 'Confessions of a Shopaholic (Summer Display Opportunity)'
3: 'We Were the Mulvaneys'
4: 'Fall On Your Knees (Oprah #45)'
5: 'Where the Heart Is (Oprah's Book Club (Paperback))'
