In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing,
# Split
from sklearn.model_selection import train_test_split

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [3]:
df = pd.read_csv("/Users/ratan/Desktop/Documents/recommendation system/ratings_amazon_electronics_products.csv", names=['userId', 'productId','rating','timestamp'])

In [5]:
df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     object 
 1   productId  object 
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB


In [9]:
print("Total Reviews:",df.shape[0])
print("Total Columns:",df.shape[1])

Total Reviews: 7824482
Total Columns: 4


In [11]:
print("Total number of ratings :",df.rating.nunique())
print("Total number of users   :", df.userId.nunique())
print("Total number of products  :", df.productId.nunique())

Total number of ratings : 5
Total number of users   : 4201696
Total number of products  : 476002


In [13]:
# Check missing value
df.isnull().sum()

userId       0
productId    0
rating       0
timestamp    0
dtype: int64

In [15]:
# Check Duplicate data
df[df.duplicated()].any()

userId       False
productId    False
rating       False
timestamp    False
dtype: bool

In [17]:
# rating describe summary 
df.describe()['rating']

count    7.824482e+06
mean     4.012337e+00
std      1.380910e+00
min      1.000000e+00
25%      3.000000e+00
50%      5.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [19]:
print("Unique value of Rating:",df.rating.unique())

Unique value of Rating: [5. 1. 3. 2. 4.]


In [21]:
# Find the minimum and maximum ratings
print('Minimum rating is: %d' %(df.rating.min()))
print('Maximum rating is: %d' %(df.rating.max()))

Minimum rating is: 1
Maximum rating is: 5


In [23]:
# Average rating of products
ratings = pd.DataFrame(df.groupby('productId')['rating'].mean())
ratings['ratings_count'] = pd.DataFrame(df.groupby('productId')['rating'].count())
ratings['ratings_average'] = pd.DataFrame(df.groupby('productId')['rating'].mean())
ratings.head(10)

Unnamed: 0_level_0,rating,ratings_count,ratings_average
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0132793040,5.0,1,5.0
0321732944,5.0,1,5.0
0439886341,1.666667,3,1.666667
0511189877,4.5,6,4.5
0528881469,2.851852,27,2.851852
0558835155,3.0,1,3.0
059400232X,5.0,3,5.0
0594012015,2.0,8,2.0
0594017343,1.0,1,1.0
0594017580,3.0,1,3.0


Types of recommender systems:
Content-Based Filtering
Collaborative Filtering
Hybrid Recommender Systems
Popular-Based Recommender Systems
Knowledge-Based Recommender Systems
Reinforcement Learning-Based Recommenders


In [37]:
#popular Based method
vote_counts = ratings[ratings['ratings_count'].notnull()]['ratings_count'].astype('int')
vote_averages = ratings[ratings['ratings_average'].notnull()]['ratings_average'].astype('int')
C = vote_averages.mean()
print("Average rating of product across the whole dataset is",C)

Average rating of product across the whole dataset is 3.6646757786731987


In [39]:
m = vote_counts.quantile(0.95)
print("Minimum votes required to be listed in the chart is",m)

Minimum votes required to be listed in the chart is 55.0


In [41]:
ratings.head()

Unnamed: 0_level_0,rating,ratings_count,ratings_average
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
132793040,5.0,1,5.0
321732944,5.0,1,5.0
439886341,1.666667,3,1.666667
511189877,4.5,6,4.5
528881469,2.851852,27,2.851852


In [43]:
qualified = ratings[(ratings['ratings_count'] >= m) & (ratings['ratings_count'].notnull()) & (ratings['ratings_average'].notnull())][['ratings_count', 'ratings_average']]

In [45]:
qualified['ratings_count'] = qualified['ratings_count'].astype('int')
qualified['ratings_average'] = qualified['ratings_average'].astype('int')
qualified.head().sort_values(by='ratings_count', ascending=False)

Unnamed: 0_level_0,ratings_count,ratings_average
productId,Unnamed: 1_level_1,Unnamed: 2_level_1
972683275,1051,4
1400501466,250,3
1400532620,171,3
1400501776,139,3
1400501520,82,4


In [47]:
qualified.shape

(24014, 2)

In [49]:
def weighted_rating(x):
    v = x['ratings_count']
    R = x['ratings_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [51]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)


In [53]:
qualified = qualified.sort_values('wr', ascending=False).head(20)


In [55]:
qualified.head(10)


Unnamed: 0_level_0,ratings_count,ratings_average,wr
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B0074BW614,18244,4,3.998992
B007WTAJTO,14172,4,3.998704
B0019EHU8G,12285,4,3.998505
B006GWO5WK,12226,4,3.998498
B003ELYQGG,11617,4,3.99842
B003ES5ZUU,10276,4,3.998215
B007R5YDYA,9907,4,3.998149
B00622AG6S,9823,4,3.998133
B0002L5R78,9487,4,3.998067
B008OHNZI0,8966,4,3.997956


In [57]:
# Add color
from matplotlib import cm
color = cm.inferno_r(np.linspace(.4, .8, 30))

rating_plot_count = qualified['ratings_count'].plot.bar(figsize=(12, 4),color=color)
rating_plot_count.set_title("Rating Count Bar-Plot")
rating_plot_count.set_xlabel("productId")
rating_plot_count.set_ylabel("Count")

Text(95.97222222222221, 0.5, 'Count')

In [59]:
rating_plot_avg = qualified['ratings_average'].plot.bar(figsize=(12, 4),color=color)
rating_plot_avg.set_title("Rating Average Bar-Plot")
rating_plot_avg.set_xlabel("productId")
rating_plot_avg.set_ylabel("rating")

Text(95.97222222222221, 0.5, 'rating')

In [61]:
wr_plot = qualified['wr'].plot.bar(figsize=(12, 4),color=color)
wr_plot.set_title("Weight Rating Bar-Plot")
wr_plot.set_xlabel("productId")
wr_plot.set_ylabel("rating")


Text(95.97222222222221, 0.5, 'rating')

In [63]:
#content based
reader = Reader()

In [65]:
df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [67]:
data = Dataset.load_from_df(df[['userId', 'productId', 'rating']], reader)

In [69]:
# Use the famous SVD algorithm
svd = SVD()

# Run 5-fold cross-validation and then print results
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2926  1.2955  1.2942  1.2934  1.2956  1.2943  0.0012  
MAE (testset)     1.0176  1.0197  1.0184  1.0181  1.0200  1.0188  0.0009  
Fit time          130.92  137.54  133.89  135.86  135.41  134.72  2.23    
Test time         19.01   18.56   18.37   18.67   18.64   18.65   0.21    


{'test_rmse': array([1.29262094, 1.29552842, 1.29420912, 1.29340251, 1.29558284]),
 'test_mae': array([1.01759422, 1.01970984, 1.01836544, 1.0181191 , 1.0199824 ]),
 'fit_time': (130.91605591773987,
  137.5368423461914,
  133.8867211341858,
  135.8565490245819,
  135.41300201416016),
 'test_time': (19.011194229125977,
  18.55985116958618,
  18.374936819076538,
  18.673495054244995,
  18.636547803878784)}

In [71]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a948b1d0>

In [73]:
df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [75]:
df['userId'].value_counts()

userId
A5JLAU2ARJ0BO     520
ADLVFFE4VBT8      501
A3OXHLG6DIBRW8    498
A6FIAB28IS79      431
A680RUE1FDO8B     406
                 ... 
A1IUWX30VMVJGP      1
A1WBP7XSZI6AUL      1
A2K7UNJHE9ZR0G      1
A1A6SIW6EWF6FP      1
A10M2KEFPEQDHN      1
Name: count, Length: 4201696, dtype: int64

In [77]:
# Check specific userId review
df[df['userId'] == 'A3LDPF5FMB782Z']

Unnamed: 0,userId,productId,rating,timestamp
1310,A3LDPF5FMB782Z,1400501466,5.0,1336003200
1588,A3LDPF5FMB782Z,1400501520,5.0,1362873600
1696,A3LDPF5FMB782Z,1400501776,5.0,1334102400
2502,A3LDPF5FMB782Z,140053271X,4.0,1310515200
3235,A3LDPF5FMB782Z,1400698987,4.0,1392854400
...,...,...,...,...
6544365,A3LDPF5FMB782Z,B00995IGBM,3.0,1393977600
7127653,A3LDPF5FMB782Z,B00BLCLUQO,4.0,1389312000
7296614,A3LDPF5FMB782Z,B00COYOAYW,4.0,1380067200
7376630,A3LDPF5FMB782Z,B00DB3SH2G,4.0,1374710400


In [79]:
# predict based on this data
svd.predict('A3LDPF5FMB782Z', '140053271X', 5.0)

Prediction(uid='A3LDPF5FMB782Z', iid='140053271X', r_ui=5.0, est=3.4754682662117315, details={'was_impossible': False})

In [81]:
# predict based on this data
svd.predict('A3LDPF5FMB782Z', 'B00995IGBM', 3.0)

Prediction(uid='A3LDPF5FMB782Z', iid='B00995IGBM', r_ui=3.0, est=3.759665565175136, details={'was_impossible': False})

In [None]:
# predict based on this data
svd.predict('A3LDPF5FMB782Z', 'B00995IGBM', 3.0)