### 1. IMPORT LIBERARY ###

In [1]:

import pandas as pd
import seaborn as sns
from surprise import SVD

ModuleNotFoundError: No module named 'surprise'

### 2. LOAD DATA SETS USING PANDAS ###

In [2]:
ratings = pd.read_csv('booksRating.csv')
ratings

FileNotFoundError: [Errno 2] No such file or directory: 'booksRating.csv'

### 3. CLEAN DATA ###

## 3a. Drop not used columns ##
Book Ratings has multi dimennsional data as it has review summary, review text, category, publisher, author etc. 

User interest varies from person to person. 

People can like the books from perticular author or from a perticular category or can very depending on the review description.

Trying to recormmend books considering all these part makes the system complex and plotting them with respect to reach other result in high dimention.

So to keep the recommendation simple we are just using the rating of a book, thus we are dropping columns other then title, user and rating.

In [3]:
ratings.drop(columns=["Id","Price","profileName","review/helpfulness","review/time","review/summary","review/text"],inplace=True)
ratings.rename(columns= {'review/score' : 'rating', 'User_id':'user','Title':'title'}, inplace=True)
ratings

Unnamed: 0,title,user,rating
0,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,4.0
1,Dr. Seuss: American Icon,A30TK6U7DNS82R,5.0
2,Dr. Seuss: American Icon,A3UH4UZ4RSVO82,5.0
3,Dr. Seuss: American Icon,A2MVUWT453QH61,4.0
4,Dr. Seuss: American Icon,A22X4XUPKF66MR,4.0
...,...,...,...
2999995,The Idea of History,,4.0
2999996,The Idea of History,A1SMUB9ASL5L9Y,4.0
2999997,The Idea of History,A2AQMEKZKK5EE4,4.0
2999998,The Idea of History,A18SQGYBKS852K,5.0


### 3b. FILTER USERS WITH MIN 150 RATINGS ###
Amazon book review dataset has 3M ratings, and apply SVD on such large dataset led to Memory issue, so decided to use only those users that have given minimum 150 ratings.

In [4]:
users_Grouped_By_Rating = ratings.groupby('user')['rating']
books_groupby_users_Ratings = pd.DataFrame(users_Grouped_By_Rating.count())

user_list_min_ratings = books_groupby_users_Ratings[books_groupby_users_Ratings['rating'] >= 150].index
ratings =  ratings[ratings['user'].isin(user_list_min_ratings)]
ratings

Unnamed: 0,title,user,rating
1,Dr. Seuss: American Icon,A30TK6U7DNS82R,5.0
6,Dr. Seuss: American Icon,A14OJS0VWMOSWO,5.0
59,Rising Sons and Daughters: Life Among Japan's ...,A3NIQK6ZLYEP1L,4.0
61,Muslim Women's Choices: Religious Belief and S...,ATDE9JYCPI0L1,2.0
140,Eyewitness Travel Guide to Europe,A281NPSIMI1C2R,5.0
...,...,...,...
2999949,Very Bad Deaths: Library Edition,AFVQZQ8PW0L,5.0
2999960,My Life,A9Q28YTLYREO7,5.0
2999971,My Life,A22LX6529JJ166,5.0
2999979,"Old Quebec,: The fortress of New France,",A32ZKBXJJ45BRY,3.0


### 3c. Remove the rating that have no titles###

In [5]:
display(ratings.isna().sum().sort_values())

print('we can observe that there are **"2 Missing Title"** in the data as shown below, Lets remove that row')

print(ratings[ratings['title'].isna()])

user      0
rating    0
title     2
dtype: int64

we can observe that there are **"2 Missing Title"** in the data as shown below, Lets remove that row
        title            user  rating
1278792   NaN  A21NVBFIEQWDSG     4.0
2272114   NaN  A2FNM5JVHZP0IW     5.0


In [6]:
ratings = ratings.dropna()
display(ratings.isna().sum().sort_values())
print(ratings[ratings['title'].isna()])

print('we can now observe there are **"NO Missing Title"** in the data')

title     0
user      0
rating    0
dtype: int64

Empty DataFrame
Columns: [title, user, rating]
Index: []
we can now observe there are **"NO Missing Title"** in the data


### 4. VISUALIZATION ###

In [7]:
display(ratings[['rating']].describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rating,169474.0,4.351493,0.893993,1.0,4.0,5.0,5.0,5.0


In [8]:
print('Total unique users in the dataset', ratings['user'].nunique())
print('Total unique products in the dataset', ratings['title'].nunique())

Total unique users in the dataset 537
Total unique products in the dataset 52973


In [9]:
from surprise.model_selection import RandomizedSearchCV
from surprise.dataset import Dataset
from surprise.reader import Reader
reader = Reader()

params= {
    "n_epochs": [5, 10, 15, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}

ratings_Surprise_Data = Dataset.load_from_df(ratings, reader)

clf = RandomizedSearchCV(SVD, params, n_jobs=-1, measures=['rmse'])
clf.fit(ratings_Surprise_Data)     
print(clf.best_score)
print(clf.best_params)
print(clf.best_estimator)  

{'rmse': 0.7767523328561098}
{'rmse': {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}}
{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x00000116A3B9AC70>}


In [10]:
svd = clf.best_estimator['rmse']
svd

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x116a3b9ac70>

In [11]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(ratings_Surprise_Data, test_size=.3, random_state=10)
print('**Training and Testing Set Distribution**')
testset


**Training and Testing Set Distribution**


[('Flying Colours', 'AHHUNTFQGUGX8', 5.0),
 ('Lilies On The Moon', 'A14OJS0VWMOSWO', 5.0),
 ('Wildcats Version 3.0: Full Disclosure', 'A1X8VZWTOG8IS6', 4.0),
 ('The House of Scorta', 'A14OJS0VWMOSWO', 5.0),
 ('Anything for Billy', 'A2TAPSNKK9AFSQ', 5.0),
 ('Emma', 'A1D2C0WDCSHUWZ', 5.0),
 ('Space Cadet', 'A2BZCHIDR5FTYA', 4.0),
 ('Lord of the Flies', 'AU6DIIDZK2OQM', 5.0),
 ('Night And Day (Classic Books on Cassettes Collection) [UNABRIDGED]',
  'A2SI6BNK5SWSMD',
  3.0),
 ('Cosmic consciousness;: A study in the evolution of the human mind',
  'A1RJD10TTI568L',
  4.0),
 ('Just One Look', 'A2SI6BNK5SWSMD', 3.0),
 ('Arcadia: A play', 'A126KX6FVI4T66', 5.0),
 ('Directors Take Two', 'A26JGAM6GZMM4V', 5.0),
 ('Motion to Suppress', 'AHUT55E980RDR', 3.0),
 ('Dune', 'A4FX5YCJA630V', 5.0),
 ('The Hobbit', 'AAIL33CYCT47J', 5.0),
 ('The Rise and Fall of the Third Reich', 'ALR35EFI69S5R', 5.0),
 ('Twenty Thousand Leagues Under the Sea', 'AXVXIWJ6IU6KA', 4.0),
 ('The Windows of Heaven', 'A3RP1I0SL70

In [12]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


In [13]:
from surprise import accuracy

print('**Fitting the train data...**')
svd.fit(trainset)       

print('**Predicting the test data...**')
pred_test = svd.test(testset)

rmse = round(accuracy.rmse(pred_test), 3)
print('**RMSE for the predicted result is ' + str(rmse) + '**')   

top_n = get_top_n(pred_test)
recommenddf = pd.DataFrame(columns=['title', 'user', 'rating'])
for item in top_n:
    subdf = pd.DataFrame(top_n[item], columns=['user', 'rating'])
    subdf['title'] = item
    cols = subdf.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    subdf = subdf[cols]
    recommenddf = pd.concat([recommenddf, subdf], axis = 0)


**Fitting the train data...**
**Predicting the test data...**
RMSE: 0.7794
**RMSE for the predicted result is 0.779**


In [14]:
recommenddf

Unnamed: 0,title,user,rating
0,Flying Colours,A3AYGXMH34R18Y,4.699601
1,Flying Colours,AHHUNTFQGUGX8,4.648454
2,Flying Colours,A1K1JW1C5CUSUZ,4.588041
3,Flying Colours,A1LMBM1N4EXS5W,4.263654
0,Lilies On The Moon,A14OJS0VWMOSWO,4.800427
...,...,...,...
0,"Mind, Matter and Quantum Mechanics (The Fronti...",A2OJW07GQRNJUT,4.747840
0,The Koran Interpreted: A Translation,A2Z4KA3EFQWZOX,4.215260
0,American Jezebel : The Uncommon Life of Anne H...,A2Z4KA3EFQWZOX,4.262879
0,"Buffy contre les vampires, tome 26 : La Tueuse...",A2NJO6YE954DBH,4.458985


In [15]:
from surprise.model_selection.validation import cross_validate
print('**Cross Validating the data...**')
cv_result = cross_validate(svd, ratings_Surprise_Data, n_jobs=-1)
cv_result = round(cv_result['test_rmse'].mean(),3)
print('**Mean CV RMSE is ' + str(cv_result)  + '**')

**Cross Validating the data...**
**Mean CV RMSE is 0.777**


In [16]:
def recommend(user, n=5):
    print('**Recommending top ' + str(n)+ ' products for userid : ' + user + ' ...**')
    df = recommenddf[recommenddf['user'] == user].head(n)
    return df

In [19]:

result_svd_user1 = recommend(user='A3AYGXMH34R18Y', n=5)
result_svd_user1

**Recommending top 5 products for userid : A3AYGXMH34R18Y ...**


Unnamed: 0,title,user,rating
0,Flying Colours,A3AYGXMH34R18Y,4.699601
0,"Travels With Charley, In Search of America",A3AYGXMH34R18Y,4.598884
6,The Plague,A3AYGXMH34R18Y,4.725475
3,Of Mice and Men (Penguin Audiobooks),A3AYGXMH34R18Y,4.76479
4,Of Mice and Men Hb (New Windmill),A3AYGXMH34R18Y,4.787625


In [21]:

result_svd_user2 = recommend(user='A1LMBM1N4EXS5W', n=5)
result_svd_user2

**Recommending top 5 products for userid : A1LMBM1N4EXS5W ...**


Unnamed: 0,title,user,rating
3,Flying Colours,A1LMBM1N4EXS5W,4.263654
3,The Golden Compass,A1LMBM1N4EXS5W,4.339286
5,Kim (Modern Library Classics),A1LMBM1N4EXS5W,4.068062
6,Kim (Modern Library Classics),A1LMBM1N4EXS5W,4.068062
4,Rosemary's Baby,A1LMBM1N4EXS5W,3.676699


In [22]:

result_svd_user3 = recommend(user='A2YXRT2XIJIO57', n=5)
result_svd_user3

**Recommending top 5 products for userid : A2YXRT2XIJIO57 ...**


Unnamed: 0,title,user,rating
0,A People's History of the Vietnam War,A2YXRT2XIJIO57,4.530493
3,Heart of Darkness. (Heritage Club Series),A2YXRT2XIJIO57,4.418928
1,THE SUN ALSO RISES,A2YXRT2XIJIO57,4.44593
2,Kim,A2YXRT2XIJIO57,4.227001
9,A Portrait of the Artist as a Young Man,A2YXRT2XIJIO57,4.476279
