In [218]:
import pandas as pd
import numpy as np

In [219]:
product = pd.read_csv("ratings_Electronics.csv", names = ['userid', 'productid', 'ratings', 'timestamp'])

In [220]:
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
userid       object
productid    object
ratings      float64
timestamp    int64
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB


In [221]:
product.head()

Unnamed: 0,userid,productid,ratings,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [222]:
product.drop(columns = 'timestamp', inplace = True)

In [223]:
product.head()

Unnamed: 0,userid,productid,ratings
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


In [224]:
product.isna().sum()

userid       0
productid    0
ratings      0
dtype: int64

As we see above, no null/NaN data. Now we shall reduce the data based on the condition that the user has met a minimum ratings threshold, say a user has given at least 50 ratings.

In [225]:
product.userid = product.userid.astype(str)
product.productid = product.productid.astype(str)

In [226]:
# Get the count of each value
value_counts = product['userid'].value_counts()

# Select the values where the count is less than 3 (or 5 if you like)
to_remove = value_counts[value_counts <= 50].index

In [227]:
# Keep rows where the userid column is not in to_remove
product_processed = product[~product.userid.isin(to_remove)]

In [228]:
product_processed.info()
#Clean and less sparse CSV For future use
product_processed.to_csv("processed_product.csv")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122171 entries, 118 to 7824444
Data columns (total 3 columns):
userid       122171 non-null object
productid    122171 non-null object
ratings      122171 non-null float64
dtypes: float64(1), object(2)
memory usage: 3.7+ MB


In [229]:
#Now that we have a substantially more manageable data set, we can proceed.
product_processed.head()

Unnamed: 0,userid,productid,ratings
118,AT09WGFUM934H,594481813,3.0
177,A32HSNCNPRUMTR,970407998,1.0
178,A17HMM1M7T9PJ1,970407998,4.0
492,A3CLWR1UUZT6TG,972683275,5.0
631,A3TAS1AG6FMBQW,972683275,5.0


In [230]:
product_processed.shape

(122171, 3)

In [231]:
# Total unique users 
print("total unique users - ",len(product_processed["userid"].unique()))

total unique users -  1466


In [232]:
# Users providing the maximum ratings
product_processed["userid"].value_counts().head()

A5JLAU2ARJ0BO     520
ADLVFFE4VBT8      501
A3OXHLG6DIBRW8    498
A6FIAB28IS79      431
A680RUE1FDO8B     406
Name: userid, dtype: int64

### Visualization

In [233]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

data = product_processed['ratings'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / product_processed.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} product-ratings'.format(product_processed.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

We can see from above that over half of the ratings for products are 5 starred, therefore it is likely that the top N recommendations will have an average rating around 5 stars.

### Please note that a graph has been generated on local connection which cannot be exported to git. Hence attaching a snapshot of the same separately.

### Collaborative Filtering

### POC: For demo purposes only, prior to actual assignment requirements. We shall create a utiltity matrix with products as columns and users as rows with the rating being the the value for the user item combination i.e. for user u and item i, the rating is r (u,i) - intersection.

In [234]:
ratings_crosstab = product_processed.pivot_table(values='ratings', index='userid', columns='productid', fill_value = 0)

In [235]:
ratings_crosstab.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1466 entries, A100UD67AHFODS to AZOK5STV85FBJ
Columns: 47155 entries, 0594481813 to B00LKG1MC8
dtypes: int64(47155)
memory usage: 527.4+ MB


In [236]:
#Verifying sample known value for accuracy
ratings_crosstab['0594481813']['AT09WGFUM934H']

3

In [237]:
ratings_crosstab.shape

(1466, 47155)

In [238]:
#Transposing the matrix
X = ratings_crosstab.T
X.shape

(47155, 1466)

### As we observe the size of the matrix, and corresponding computation requirements, it would be worthwhile to decompose the matrix by using TruncatedSVD. A brief description of the same is provided below.

This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Contrary to PCA, this estimator does not center the data before computing the singular value decomposition. This means it can work with scipy.sparse matrices efficiently.
Basically, it identifies latent features from the data matrix provided, for more efficient future computation.

In [239]:
from sklearn.decomposition import TruncatedSVD
SVD = TruncatedSVD(n_components=12, random_state=17)

In [240]:
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape

(47155, 12)

### Generating a Correlation Matrix

In [241]:
###corr_mat = np.corrcoef(resultant_matrix)
###corr_mat.shape

In [242]:
#As given dataset seems computationally inefficient (expensive) for correlation generation, let us reduce the size 
#of the data by taking a random products sample of 20% and then attempt to generate the correlation matrix

In [243]:
sample_ratings_crosstab = ratings_crosstab.sample(frac = 0.2, random_state = 42, axis = 1)

In [244]:
sample_ratings_crosstab.head()

productid,B002ONCCZC,B00894YWD0,B008RCF5XK,B00BC80UX6,B0034U3KXM,B009QV72VY,B003BEDTBY,B00FX6ST8G,B0057XBZCM,B000300Y9O,...,B002GHBW4S,B00CH5U24A,B002HU27UW,B00004U47J,B0001LXQIG,B007IHEPW2,B000WQ21SQ,B00000J9Z7,B0075GDS60,B002W7U3E2
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100UD67AHFODS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A100WO06OQR8BQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A105S56ODHGJEK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A105TOJ6LTVMBG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A10AFVU66A79Y1,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [245]:
sample_ratings_crosstab.shape

(1466, 9431)

In [246]:
X_sample = sample_ratings_crosstab.T
X_sample.shape

(9431, 1466)

In [247]:
SVD_sample = TruncatedSVD(n_components=12, random_state=17)

In [248]:
resultant_matrix_sample = SVD_sample.fit_transform(X_sample)
resultant_matrix_sample.shape

(9431, 12)

In [249]:
corr_mat = np.corrcoef(resultant_matrix_sample)
corr_mat.shape

(9431, 9431)

### Isolating a Certain Product From the Correlation Matrix (say B0034U3KXM)

In [250]:
product_names = sample_ratings_crosstab.columns
product_list = list(product_names)

In [251]:
sample_prod = product_list.index('B0034U3KXM')

In [252]:
print("Index: ", sample_prod)

Index:  4


In [253]:
corr_sample_prod = corr_mat[sample_prod]
corr_sample_prod.shape

(9431,)

In [254]:
list(product_names[(corr_sample_prod<1.0) & (corr_sample_prod > 0.9)])

['B0034U3KXM',
 'B003V13RVE',
 'B003LM2K3Y',
 'B002U4QIBY',
 'B002M78L0U',
 'B00BWA48AA',
 'B005A2QDWC',
 'B0078IWQEU',
 'B00006BB9Q',
 'B0002JUHS4',
 'B002UV4NQO',
 'B000I64NTA',
 'B0042ORPK8',
 'B00065W74G',
 'B001YMCNOE',
 'B00008VSL3',
 'B0038L495U',
 'B000IZC0LY',
 'B00AQNTPBG',
 'B00007KQMC',
 'B00079AGU8',
 'B001LDRNY6',
 'B0046F22ZQ',
 'B00B7RJNPS',
 'B001GC9C9Q',
 'B009YDO05U',
 'B004WIUNZI',
 'B001FB6S0E',
 'B00G69FC48',
 'B00005A8XW',
 'B000NW12D4',
 'B00126PHV8',
 'B0036DDXUC',
 'B001MQ4NJU',
 'B00552PAEO',
 'B003LLPZFK',
 'B004S7ZVQO',
 'B001QTW2BE',
 'B00CHZSV2A',
 'B009JGTVOM',
 'B0007OWNBS',
 'B00A0IICY8',
 'B009GUX85E',
 'B000MQ6W5E',
 'B00612XNSW',
 'B001NPEB7E',
 'B005BLO21Q',
 'B00JLMRXCQ',
 'B000IZC19A',
 'B009QVE6T0',
 'B0042FPA90',
 'B002OJTJIY',
 'B004XXQI1U',
 'B001F8Q0DM',
 'B003YT8YTS',
 'B0007CO8B8',
 'B003EEMFUC',
 'B004PKC9XM',
 'B0060R9P44',
 'B00006JPDE',
 'B001OJI7V0',
 'B000OY7XTS',
 'B002D4AHT0',
 'B0038W0K1G',
 'B004LT1TO2',
 'B000067RC2',
 'B0090COZ

### Recommending a Highly Correlated Product

In [255]:
list(product_names[(corr_sample_prod<1.0) & (corr_sample_prod > 0.98)])

['B0034U3KXM',
 'B002U4QIBY',
 'B00BWA48AA',
 'B005A2QDWC',
 'B0078IWQEU',
 'B00006BB9Q',
 'B002UV4NQO',
 'B000I64NTA',
 'B00065W74G',
 'B001YMCNOE',
 'B00AQNTPBG',
 'B00007KQMC',
 'B00B7RJNPS',
 'B004WIUNZI',
 'B00G69FC48',
 'B00126PHV8',
 'B001QTW2BE',
 'B009JGTVOM',
 'B00612XNSW',
 'B001NPEB7E',
 'B005BLO21Q',
 'B00JLMRXCQ',
 'B0042FPA90',
 'B002OJTJIY',
 'B0007CO8B8',
 'B004PKC9XM',
 'B0060R9P44',
 'B00006JPDE',
 'B001OJI7V0',
 'B000067RC2',
 'B0090COZCS',
 'B007JV5OWM',
 'B00006MJF9',
 'B000BUNKME',
 'B001HOSYO2',
 'B005BCL66Y',
 'B001Q3LU22',
 'B000I21Z7W',
 'B007IIT8KU',
 'B000MMHOY6',
 'B009U4WCY4',
 'B001P9XHHS',
 'B002LZ0FYI',
 'B00069QSK6',
 'B00D4CW0RW',
 'B000TD5G84',
 'B005BGK9JA',
 'B002VCHJC2',
 'B00CE58ZYC',
 'B001C37QUQ',
 'B001CJPNDC',
 'B00139VKV0',
 'B00FCU6NWM',
 'B007R663ME',
 'B009GPY97A',
 'B002ZVAX40',
 'B004M8SU1W',
 'B0024WRS6G',
 'B000YWESMU',
 'B005LS6NNO',
 'B003493ZT2',
 'B000NAQV5K',
 'B007Y0EQE0',
 'B003IEZBLS',
 'B000GBS4KG',
 'B002BH4PP6',
 'B001VHMR

### Above was a rather manual or under-the-hoods method used in some recommendation systems, let us now proceed with our surprise package KNN and SVD based standard procedure. We shall first build a popularity recommender model, before we proceed for neighbourhood based collaborative filtering.

### Popularity Based Filtering

In [256]:
## Recommending based on counts. Products with the most ratings are considered most popular.

In [257]:
rating_count = pd.DataFrame(product_processed.groupby('productid')['ratings'].count())

In [258]:
#Let us now find the most rated products
rating_count.sort_values('ratings', ascending=False).head()

Unnamed: 0_level_0,ratings
productid,Unnamed: 1_level_1
B0088CJT4U,204
B003ES5ZUU,177
B000N99BBC,163
B007WTAJTO,156
B00829TIEK,146


Therefore, the products above seem to have the maximum ratings, indicating popularity. However, popularity recommender models are not effective in practice, primarily due to a lower rate of recall for a diverse user base. Further, popularity recommender models generally produce obvious results, leading to a less effective recommender system.

### Collaborative Filtering

In [259]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))

In [260]:
data = Dataset.load_from_df(product_processed[['userid', 'productid', 'ratings']], reader)

In [261]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.30, random_state = 42)

### Training the model

In [262]:
from surprise import KNNWithMeans
from surprise import accuracy

In [263]:
#From all similarity measures, I choose to use cosine against pearson, spearman rank, jaccard due to its robust and
#fair nature. We shall use item based collaborative filtering, given the use case with K as the default 40.
model_one = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False})
model_one.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x11990f588>

In [264]:
#Which product is most similar to say 'B0034U3KXM'

In [265]:
inner_id = trainset.to_inner_iid("B0034U3KXM")
inner_id

31405

In [266]:
sim_product_inner_ids = model_one.get_neighbors(inner_id, k = 5)
sim_product_inner_ids

[103, 400, 497, 801, 814]

In [267]:
print("Similar Products:")
for iid in sim_product_inner_ids:
    print(trainset.to_raw_iid(iid))

Similar Products:
B005Y8BYOE
B007QXLIWI
B002V8C3W2
B003ES5ZUU
B00DR0PDNE


### Evaluating Model Performance

In [268]:
len(testset)

36652

In [269]:
# Evaluate on the test set
test_pred = model_one.test(testset)

# Compute the RMSE
accuracy.rmse(test_pred)

RMSE: 1.1260


1.125968313593513

In [270]:
# View a prediction
test_pred[12]

Prediction(uid='A2LTYEYGKBYXRR', iid='B000CKVOOY', r_ui=5.0, est=4.83452380952381, details={'actual_k': 6, 'was_impossible': False})

### Generating Top 5 Recommendations

In [271]:
testset_new = trainset.build_anti_testset()

In [272]:
len(testset_new)

54716493

In [273]:
predictions = model_one.test(testset_new[0:10000])

In [274]:
predictions_df = pd.DataFrame([[x.iid,x.est] for x in predictions])

In [275]:
predictions_df.columns = ["productid","estimated_rating"]
predictions_df.sort_values(by = ["productid", "estimated_rating"],ascending=False,inplace=True)

In [276]:
predictions_df.head()

Unnamed: 0,productid,estimated_rating
4136,B00LGQ6HL8,5.0
5252,B00L8I6SFY,5.0
1564,B00L3YHF6O,5.0
7748,B00L2442H0,5.0
8707,B00L21HC7A,4.842776


In [277]:
top_5_recos = predictions_df.groupby("productid").head(5)

In [278]:
top_5_recos.head(5)

Unnamed: 0,productid,estimated_rating
4136,B00LGQ6HL8,5.0
5252,B00L8I6SFY,5.0
1564,B00L3YHF6O,5.0
7748,B00L2442H0,5.0
8707,B00L21HC7A,4.842776


## SVD Based Recommendation

In [279]:
from surprise import SVD
from surprise import accuracy

In [280]:
trainset_svd, testset_svd = train_test_split(data, test_size=.25, random_state=123)

In [281]:
svd_model = SVD(n_factors = 5, biased=False, random_state = 123)
svd_model.fit(trainset_svd)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1bd5429be0>

In [282]:
test_pred_svd = svd_model.test(testset_svd)

In [283]:
# compute RMSE to measure accuracy
accuracy.rmse(test_pred_svd)

RMSE: 1.5830


1.583034009588521

In [284]:
#I found that reducing the factors decreased the error rate. We shall evaluate further using GridSearchCV

In [285]:
from surprise.model_selection import GridSearchCV
param_grid = {'n_factors' : [5,10,15], "reg_all":[0.01,0.02]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3,refit = True)

In [286]:
gs.fit(data)

In [287]:
# get all parameter combinations
gs.param_combinations

[{'n_factors': 5, 'reg_all': 0.01},
 {'n_factors': 5, 'reg_all': 0.02},
 {'n_factors': 10, 'reg_all': 0.01},
 {'n_factors': 10, 'reg_all': 0.02},
 {'n_factors': 15, 'reg_all': 0.01},
 {'n_factors': 15, 'reg_all': 0.02}]

In [288]:
# get best parameters
gs.best_params

{'rmse': {'n_factors': 10, 'reg_all': 0.02}}

In [289]:
# Use the "best model" for prediction
pred_gs = gs.test(testset)

In [290]:
#Check Accuracy with best parameters
gs.best_score

{'rmse': 0.9782170989191622}

The SVD model after finding the best parameters using Grid Search CV definitely performed better registering a lower RMSE error score of 0.977 as against 1.125 with KNN.

In [291]:
#Now, lets evaluate the top 5 recommendations using the best parameters for the SVD model.
testset_svd = trainset_svd.build_anti_testset()

In [292]:
len(testset_svd)

57132216

In [293]:
predictions_svd = gs.test(testset_new[0:10000])

In [294]:
predictions_svd_df = pd.DataFrame([[x.iid,x.est] for x in predictions_svd])

In [295]:
predictions_svd_df.columns = ["productid","estimated_rating"]
predictions_svd_df.sort_values(by = ["productid", "estimated_rating"],ascending=False,inplace=True)

In [296]:
predictions_svd_df.head(5)

Unnamed: 0,productid,estimated_rating
4136,B00LGQ6HL8,4.575451
5252,B00L8I6SFY,4.563632
1564,B00L3YHF6O,4.859051
7748,B00L2442H0,4.725796
8707,B00L21HC7A,4.562033


In [297]:
top_5_recos = predictions_svd_df.groupby("productid").head(5)

In [298]:
print(top_5_recos.head(5))

       productid  estimated_rating
4136  B00LGQ6HL8          4.575451
5252  B00L8I6SFY          4.563632
1564  B00L3YHF6O          4.859051
7748  B00L2442H0          4.725796
8707  B00L21HC7A          4.562033


### The results seem to be very positive since both SVD (finetuned by GridSearchCV) and KNN provide the same top 5 product recommendations, namely:
    1. B00LGQ6HL8
    2. B00L8I6SFY
    3. B00L3YHF6O
    4. B00L2442H0
    5. B00L21HC7A

For evaluation purposes only, let us calculate the precision and recall for our best performing SVD model.

In [299]:
from collections import defaultdict
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each product.'''

    # First map the predictions to each product.
    prod_est_true = defaultdict(list)
    for iid, _, true_r, est, _ in predictions:
        prod_est_true[iid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for iid, product_ratings in prod_est_true.items():

        # Sort product ratings by estimated value
        product_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in product_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in product_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in product_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[iid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[iid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [300]:
precisions, recalls = precision_recall_at_k(pred_gs)

In [301]:
# Precision and recall can then be averaged over all products
print("Precision Score: ", sum(prec for prec in precisions.values()) / len(precisions))
print("Recall Score: ", sum(rec for rec in recalls.values()) / len(recalls))

Precision Score:  0.962308083761019
Recall Score:  0.5488613504592932


Therefore, we have a healthy precision score of around 96% which is the proportion of top results that are relevant and a stable recall score of 54.9% which is the proportion of all relevant results included in the top recommendations