In [24]:
# dataframes and read data
import pandas as pd

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Surprise algorithm and other modules
from surprise import Reader,KNNBasic,Dataset,accuracy,SVD,dump
from surprise.model_selection import cross_validate

import pickle # saving model

In [25]:
data = pd.read_csv("./ratings.csv") # read dataset

In [26]:
data.head(5)

Unnamed: 0,userid,pid,rating,Name
0,1,ACCEJWHTY2HVMG7Z5,5,Redmi 12
1,1,JEAE9PP49AH6PQCW,5,Wrangler Blue Jeans
2,2,ACCEJWHTY2HVMG7Z5,5,Redmi 12
3,2,ACCEJGYYMTQR4JXY,4,Back cover Redmi 12
4,2,ACCEF2R7ZWEDG674,4,Tempered Glass Redmi 12


# Model

In [27]:
# we will take productId, Author, reviwRating Columns for Surpirse model

# suprise takes input columns as - AuthorId, ProductId, reviewRating , for which we will add Author ID later

df = data[['userid','pid','rating']]
df

Unnamed: 0,userid,pid,rating
0,1,ACCEJWHTY2HVMG7Z5,5
1,1,JEAE9PP49AH6PQCW,5
2,2,ACCEJWHTY2HVMG7Z5,5
3,2,ACCEJGYYMTQR4JXY,4
4,2,ACCEF2R7ZWEDG674,4
5,3,ACCEJWHTY2HVMG7Z5,5
6,3,ACCEJGYYMTQR4JXY,4
7,3,ACCEA9K5ZCDNRGXN,4
8,3,ACCEF2R7ZWEDG674,4
9,4,ACCEJWHTY2HVMG7Z5,5


In [28]:
df.nunique()  # unique value count in the Data

userid    10
pid       12
rating     3
dtype: int64

In [29]:
df.isnull().sum()  # null value count in the Data

userid    0
pid       0
rating    0
dtype: int64

In [30]:
# Reader is part of surprise which takes argument "rating_scale" for reading the Dataset
# Dataset.load_from_df is used to read data from the DataFrame

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df, reader)

In [31]:
# here peason is the similarity option - which will tell to calculate the pearson similarity for predictions
# user_based - False means the similarity is calculated between Items and True means similarity is calculated between users


# user - Based
similarity_options = {'name': 'pearson','user_based': True}  # Set the similarity measure and user-based option
algouser = KNNBasic(k=20, min_k=1, sim_options=similarity_options)  # Create a KNNBasic algorithm with specified parameters

In [32]:
trainset = dataset.build_full_trainset() # build the trainset on the whole data

In [33]:
algouser.fit(trainset) # fit the model on trainset

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x17feecd3850>

In [34]:
# here peason is the similarity option - which will tell to calculate the pearson similarity for predictions
# user_based - False means the similarity is calculated between Items and True means similarity is calculated between users


# item Based
similarity_options = {'name': 'pearson','user_based': False}  # Set the similarity measure and user-based option
algoitem = KNNBasic(k=20, min_k=1, sim_options=similarity_options)  # Create a KNNBasic algorithm with specified parameters

In [35]:
algoitem.fit(trainset) # fit the model on trainset

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x17feecd15d0>

In [36]:
# create a pivot table with row as productIds and Columns as authorId and the cells display the coressponding Ratings

prod_auth_pt = df.pivot_table(index='pid',columns='userid',values='rating')
prod_auth_pt

userid,1,2,3,4,5,6,7,8,9,10
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACCDWYMTHZYE2JPA,,,,3.0,4.0,,,,,
ACCEA9K5ZCDNRGXN,,,4.0,,,,,,,
ACCEF2R7ZWEDG674,,4.0,4.0,5.0,5.0,,,,,
ACCEJGYYMTQR4JXY,,4.0,4.0,,,,,,,
ACCEJGYYYCGETMGA,,,,,3.0,,,,,
ACCEJWHTY2HVMG7Z5,5.0,5.0,5.0,5.0,5.0,,,,,
JEAE9PP49AH6PQCW,5.0,,,,,5.0,5.0,3.0,5.0,5.0
SWSE55BVG4GYJDSU,,,,,,,5.0,,3.0,
SWSE9CCKH7JFGGZH,,,,,,,,,4.0,4.0
SWSEA8FHKGKZ8YFT,,,,,,,5.0,,,


In [37]:
# Count the number of missing values in the pivot table
missing_values = prod_auth_pt.isnull().sum().sum()

# Calculate the total number of cells in the pivot table
total_cells = prod_auth_pt.size

# Calculate the percentage of sparse data
sparse_percentage = (missing_values / total_cells) * 100

# Print the result
print(f"The percentage of sparse data in the pivot table is: {sparse_percentage:.2f}%")

The percentage of sparse data in the pivot table is: 74.17%


In [38]:
# recommend the items to the user Using KNN algorithm

# pass the algo parameter to set the algorithm, i.e. userbased or itembased
def recommend(algo,user):
    
    
    # get the items data for the particular User
    items = prod_auth_pt[user]
    
    # make predictions for all the Items
    predictions = [algo.predict(user,item_id,rating) for item_id,rating in items.items()]
    
    return predictions

In [39]:
recommend(algouser,1) # get recommendations for the user - 26 -- user based recommendations

[Prediction(uid=1, iid='ACCDWYMTHZYE2JPA', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEA9K5ZCDNRGXN', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEF2R7ZWEDG674', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEJGYYMTQR4JXY', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEJGYYYCGETMGA', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEJWHTY2HVMG7Z5', r_ui=5.0, est=5, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=1, iid='JEAE9PP49AH6PQCW', r_ui=5.0, est=5, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=1, iid='SWSE55BVG4GYJDSU', r_ui

In [40]:
recommend(algoitem,1) # get recommendations for the user - 26 -- item based recommendations

[Prediction(uid=1, iid='ACCDWYMTHZYE2JPA', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEA9K5ZCDNRGXN', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEF2R7ZWEDG674', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEJGYYMTQR4JXY', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEJGYYYCGETMGA', r_ui=nan, est=4.419354838709677, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=1, iid='ACCEJWHTY2HVMG7Z5', r_ui=5.0, est=5, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=1, iid='JEAE9PP49AH6PQCW', r_ui=5.0, est=5, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=1, iid='SWSE55BVG4GYJDSU', r_ui

In [41]:
# read the data from the Datframe to get data in the Surprise format

svd_data = Dataset.load_from_df(df, reader)

In [42]:
# initialize the algorithm and perform cross validation for the kfolds

svd = SVD(n_factors=10)
cross_validate(svd, svd_data, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.92196022, 0.62848135, 0.80036935, 0.61165307, 1.08151884]),
 'test_mae': array([0.78678094, 0.60356991, 0.68080681, 0.58897847, 0.90577353]),
 'fit_time': (0.005667448043823242, 0.0, 0.0, 0.0, 0.0),
 'test_time': (0.0008864402770996094, 0.0, 0.0, 0.0, 0.0)}

In [43]:
# Get recommendations for the users based on SVD (Singular Value Decomposition)

def prediction(userId):
    
    # function to add the Predicted rating in with other data
    def addrating(x):
        for row in top_predictions:
            if x == row[1]:
                return row[3]
    
    # the number of recommendation to give for particular user
    items_to_recommend = 30
    
    # prediction for each product for the particular user
    predictions = [svd.predict(userId,itemid,rating) for itemid,rating in prod_auth_pt[userId].items()]
    
    
    # Sort the predictions based on the estimated rating (`est`) in descending order and selects the top `items_to_recommend` items as the recommendations.
    top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:items_to_recommend]
    

    # get only the ProductId and Predicted Rating fro each of the top Item
    topitemsId = [row[1] for row in top_predictions]
    
    # here we add the predicted rating column in to dataframe

    recommended_items = data[data['pid'].isin(topitemsId)][['pid','Name']].drop_duplicates(subset='pid')
    recommended_items['predictedRating'] = recommended_items['pid'].apply(addrating)
    
    
    return recommended_items

In [44]:
recommended_items = prediction(1) # get the top 15 items recommended to the particular User

In [45]:
# these are the recommended items for the user sorted in decreasing order

recommended_items.sort_values(by='predictedRating',ascending=False)

Unnamed: 0,pid,Name,predictedRating
0,ACCEJWHTY2HVMG7Z5,Redmi 12,4.796541
1,JEAE9PP49AH6PQCW,Wrangler Blue Jeans,4.756909
23,SWSEBHCPTESRGGGR,Levi's Cream Shirt,4.710384
4,ACCEF2R7ZWEDG674,Tempered Glass Redmi 12,4.656208
15,ACCEJGYYYCGETMGA,Back cover Redmi 12i,4.627297
20,SWSEA8FHKGKZ8YFT,Wrangler White Casual Shirt,4.627297
3,ACCEJGYYMTQR4JXY,Back cover Redmi 12,4.595119
7,ACCEA9K5ZCDNRGXN,Realme Charger 15W Type C,4.57874
27,SWSE9CCKH7JFGGZH,Mufti Demin Jeans,4.575409
17,SWSEBS2UXYZBSY2Y,Sweat Shirt,4.552888


In [46]:
# save all the models in pickle file
dump.dump('svd.pkl',algo=svd)

# save the pivot table to get user-item rating data

pickle.dump(prod_auth_pt,open('pivot_table.pkl','wb'))