In [1]:
import pandas as pd
import numpy as np
import math
from scipy.sparse.linalg import svds
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import NMF
from numpy import linalg as LA
from sklearn.metrics import mean_squared_error
from math import sqrt

**Import Article file which contains all the information about the article the has interacted with**

In [3]:
articles_df = pd.read_csv('article.csv')
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
#removing unwanted columns
articles_df = articles_df.drop(articles_df.columns[[0, 1, 3,4,5,6,7,8,11,12]], axis=1)
articles_df.head()

Unnamed: 0,contentId,url,title
1,-4.110354e+18,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact..."
2,-7.292285e+18,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...
3,-6.151852e+18,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour
4,2.448027e+18,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc..."
5,-2.826566e+18,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...


In [4]:
#check for null values
articles_df.isnull().values.any()

False

**Import the user interaction file which will give us the information about how eah user interacted with each articles**

In [5]:
df = pd.read_csv('User_Interaction.csv')

In [6]:
df = df.drop(df.columns[[0,4,5,6,7]], axis=1)
df.head()

Unnamed: 0,eventType,contentId,personId
0,VIEW,-3.499919e+18,-8845298781299428018
1,VIEW,8.890721e+18,-1032019229384696495
2,VIEW,3.105155e+17,-1130272294246983140
3,FOLLOW,3.105155e+17,344280948527967603
4,VIEW,-7.820641e+18,-445337111692715325


In [7]:
# there are about 72312 entries in this dataset
df.shape

(72312, 3)

In [8]:
#1895 unique user interactions
len(df.personId.unique())

1895

In [9]:
# users interacted with 3171 unique artciles
len(df.contentId.unique())

3171

In [10]:
#The different interations the user had with the artciles
df['eventType'].unique()

array(['VIEW', 'FOLLOW', 'BOOKMARK', 'LIKE', 'COMMENT CREATED'],
      dtype=object)

In [11]:
df['eventType']. value_counts(). idxmax()

'VIEW'

In [12]:
df['eventType']. value_counts(). idxmin()

'FOLLOW'

In [13]:
df['eventType']. value_counts()

VIEW               61086
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: eventType, dtype: int64

**Providing weight to each interaction based on its priority(higher the weight higher the priority)**

In [14]:
eventType_weight= {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

df['eventWeight'] = df['eventType'].apply(lambda x: eventType_weight[x])

In [15]:
df.head(10)

Unnamed: 0,eventType,contentId,personId,eventWeight
0,VIEW,-3.499919e+18,-8845298781299428018,1.0
1,VIEW,8.890721e+18,-1032019229384696495,1.0
2,VIEW,3.105155e+17,-1130272294246983140,1.0
3,FOLLOW,3.105155e+17,344280948527967603,3.0
4,VIEW,-7.820641e+18,-445337111692715325,1.0
5,VIEW,3.105155e+17,-8763398617720485024,1.0
6,VIEW,-8.864073e+18,3609194402293569455,1.0
7,VIEW,-1.492913e+18,4254153380739593270,1.0
8,VIEW,3.105155e+17,344280948527967603,1.0
9,VIEW,3.06437e+18,3609194402293569455,1.0


In [16]:
count_df = df.groupby(['personId', 'contentId']).size().groupby('contentId').size()
count_df

contentId
-9.222795e+18     5
-9.216927e+18    10
-9.194573e+18    16
-9.192549e+18    45
-9.190738e+18     4
                 ..
 9.215261e+18    12
 9.217155e+18     6
 9.220446e+18    26
 9.220446e+18     1
 9.222265e+18     5
Length: 3171, dtype: int64

In [17]:
#count of the each article interation by the user
count_df_content = df.groupby(['personId', 'contentId']).size()
count_df_content

personId              contentId    
-9223121837663643404  -8.949114e+18    1
                      -8.377626e+18    1
                      -8.208801e+18    1
                      -8.187221e+18    1
                      -7.423191e+18    8
                                      ..
 9210530975708218054   8.477804e+18    4
                       8.526043e+18    1
                       8.856169e+18    1
                       8.869348e+18    1
                       9.209886e+18    1
Length: 40823, dtype: int64

In [18]:
#count of how many artciles did the user interact with
count_df = df.groupby(['personId', 'contentId']).size().groupby('personId').size()
count_df.head(10)

personId
-9223121837663643404    43
-9212075797126931087     5
-9207251133131336884     7
-9199575329909162940    11
-9196668942822132778     7
-9188188261933657343     9
-9172914609055320039    24
-9156344805277471150    14
-9150583489352258206     1
-9120685872592674274    24
dtype: int64

In [18]:
#we only want users with interaction above ceratin threshold to avoid cold start problem

In [19]:
#user id with interactions greater than the threshold
interaction_threshold_users=count_df[count_df >= 5].reset_index()[['personId']]
interaction_threshold_users.head(10)

Unnamed: 0,personId
0,-9223121837663643404
1,-9212075797126931087
2,-9207251133131336884
3,-9199575329909162940
4,-9196668942822132778
5,-9188188261933657343
6,-9172914609055320039
7,-9156344805277471150
8,-9120685872592674274
9,-9109785559521267180


In [20]:
#total number of users who have interacted with the articles more than 5 time
len(interaction_threshold_users)

1142

In [21]:
#merging the two dataframe, because we use only the datset with users who have interacted more than 5 times
df_new=df.merge(interaction_threshold_users)

In [22]:
df_new.head(10)

Unnamed: 0,eventType,contentId,personId,eventWeight
0,VIEW,-3.499919e+18,-8845298781299428018,1.0
1,VIEW,-3.499919e+18,-8845298781299428018,1.0
2,VIEW,6.437568e+18,-8845298781299428018,1.0
3,VIEW,-4.76064e+18,-8845298781299428018,1.0
4,VIEW,-6.142463e+18,-8845298781299428018,1.0
5,VIEW,-1.672167e+18,-8845298781299428018,1.0
6,VIEW,-1.672167e+18,-8845298781299428018,1.0
7,VIEW,2.601622e+18,-8845298781299428018,1.0
8,VIEW,2.601622e+18,-8845298781299428018,1.0
9,LIKE,8.890721e+18,-8845298781299428018,2.0


In [23]:
df_new.shape

(69883, 4)

In [24]:
# Using Log transformation, we can fix the skewness of the data

In [24]:
def smooth_user_preference(x):
    return math.log(1+x, 2)
    
df_log = df_new.groupby(['personId', 'contentId'])['eventWeight'].sum().apply(smooth_user_preference).reset_index()

In [25]:
df_log.head(10)

Unnamed: 0,personId,contentId,eventWeight
0,-9223121837663643404,-8.949114e+18,1.0
1,-9223121837663643404,-8.377626e+18,1.0
2,-9223121837663643404,-8.208801e+18,1.0
3,-9223121837663643404,-8.187221e+18,1.0
4,-9223121837663643404,-7.423191e+18,3.169925
5,-9223121837663643404,-7.331394e+18,1.0
6,-9223121837663643404,-6.872547e+18,1.0
7,-9223121837663643404,-6.728844e+18,1.0
8,-9223121837663643404,-6.59082e+18,1.0
9,-9223121837663643404,-6.558712e+18,1.584963


In [26]:
df_log.shape

(39224, 3)

In [27]:
#spliting the data for training and validation
df_train,df_test=train_test_split(df_log,stratify=df_log['personId'],test_size=0.2)
print("Training data",len(df_train))
print("Testing data",len(df_test))

Training data 31379
Testing data 7845


In [28]:
#pivot the dataset for better representation of large data 
df_pivot=df_train.pivot(index='personId',columns="contentId",values='eventWeight').fillna(0)

In [29]:
df_pivot.head(20)

contentId,-9.222795e+18,-9.216927e+18,-9.194573e+18,-9.192549e+18,-9.190738e+18,-9.189659e+18,-9.189659e+18,-9.176144e+18,-9.171475e+18,-9.166779e+18,...,9.191014e+18,9.207287e+18,9.208127e+18,9.209629e+18,9.209886e+18,9.213261e+18,9.215261e+18,9.217155e+18,9.220446e+18,9.222265e+18
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9212075797126931087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9196668942822132778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9188188261933657343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9172914609055320039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9156344805277471150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9120685872592674274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9109785559521267180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
pivot_matrix=df_pivot.values

In [31]:
pivot_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
df_list=list(df_pivot.index)

In [33]:
#create a sparse matrix
sparse_matrix_df=csr_matrix(pivot_matrix)

In [34]:
sparse_matrix_df

<1142x3087 sparse matrix of type '<class 'numpy.float64'>'
	with 31379 stored elements in Compressed Sparse Row format>

In [35]:
sparse_matrix_df.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Matrix Factorization

**Three Factorization approaches have been used for this dataset**
1) Sing Value Decomposition(SVD)

2) Nonnegative Matrix Factorization(NMF)

3) Stochastic Gradient Decent(SGD)

# SVD

In [36]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

np.random.seed(0)
X = csr_matrix(sparse_matrix_df)
svd = TruncatedSVD(n_components=3, n_iter=2, random_state=42)
svd.fit(X)

TruncatedSVD(n_components=3, n_iter=2, random_state=42)

In [37]:
result = svd.transform(X)
print(result)

[[ 1.22159073 -0.4747876  -0.62755091]
 [ 0.06989899 -0.03343466  0.08391489]
 [ 0.29899948 -0.29134444 -0.13380892]
 ...
 [ 0.40080516 -0.34966463 -0.30097234]
 [ 0.45708657 -0.38835364  0.0143537 ]
 [ 1.30176413 -1.76300222  0.28327439]]


In [43]:
result_reconstructed = svd.inverse_transform(result)
print(result_reconstructed)
result_reconstructed.shape

[[ 1.79071434e-02  2.69372761e-03  3.22205622e-02 ...  1.59039000e-02
   2.66877503e-02  5.83929906e-03]
 [-3.52650701e-04  1.83217855e-04  2.49660389e-03 ...  1.40358926e-03
   3.73922861e-03  3.94854865e-04]
 [-4.08397240e-03  1.08260837e-03  1.30076393e-02 ... -2.14399350e-03
   1.10484862e-02  1.76036245e-03]
 ...
 [-2.44313175e-03  1.33873569e-03  1.57799246e-02 ... -2.16185362e-03
   1.17423330e-02  2.23444777e-03]
 [-5.51853632e-03  1.54353250e-03  1.91059596e-02 ...  3.07220119e-05
   1.92988005e-02  2.67575718e-03]
 [-4.89954950e-02  6.00347353e-03  7.42875647e-02 ... -2.15563330e-02
   7.47991120e-02  8.93160068e-03]]


(1142, 3087)

In [45]:
#normalization 
predicted_df_norm = (result_reconstructed - result_reconstructed.min()) / (result_reconstructed.max() - result_reconstructed.min())

In [48]:
predicted_df_norm.shape

(1142, 3087)

In [49]:
#matrix to datframe
pred_df=pd.DataFrame(predicted_df_norm,columns=df_pivot.columns,index=df_pivot.index).transpose()
pred_df

personId,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9196668942822132778,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,...,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9165571805999894845,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9.222795e+18,0.186097,0.183252,0.182670,0.185867,0.182364,0.183213,0.185918,0.186089,0.179215,0.187902,...,0.185400,0.177079,0.183142,0.183530,0.174837,0.182060,0.183332,0.182926,0.182446,0.175671
-9.216927e+18,0.183726,0.183335,0.183475,0.183302,0.183412,0.183351,0.183350,0.183371,0.183889,0.183519,...,0.183408,0.184442,0.183361,0.183396,0.184505,0.183412,0.183424,0.183515,0.183547,0.184242
-9.194573e+18,0.188328,0.183696,0.185334,0.183157,0.184584,0.183853,0.183841,0.184383,0.190510,0.185798,...,0.184562,0.197244,0.183965,0.184388,0.198180,0.184624,0.184704,0.185766,0.186284,0.194883
-9.192549e+18,0.187966,0.184288,0.184769,0.182592,0.184276,0.183962,0.184778,0.189486,0.190960,0.185825,...,0.185506,0.197083,0.183865,0.184404,0.199895,0.184647,0.184214,0.184501,0.186768,0.195365
-9.190738e+18,0.183455,0.183296,0.183342,0.183371,0.183317,0.183311,0.183343,0.183243,0.183331,0.183439,...,0.183344,0.183416,0.183317,0.183328,0.183336,0.183303,0.183343,0.183373,0.183325,0.183331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9.213261e+18,0.183826,0.183322,0.183488,0.183369,0.183407,0.183349,0.183386,0.183314,0.183837,0.183631,...,0.183437,0.184403,0.183364,0.183407,0.184379,0.183394,0.183445,0.183553,0.183536,0.184143
9.215261e+18,0.187611,0.183661,0.184547,0.183753,0.183961,0.183697,0.184478,0.185511,0.187611,0.186269,...,0.184877,0.191907,0.183732,0.184153,0.192301,0.183962,0.184262,0.184826,0.185334,0.189926
9.217155e+18,0.185785,0.183525,0.182972,0.184826,0.182746,0.183358,0.185467,0.187071,0.181713,0.186778,...,0.185221,0.181017,0.183255,0.183617,0.180269,0.182658,0.183356,0.182970,0.183311,0.179947
9.220446e+18,0.187465,0.183889,0.185028,0.182672,0.184471,0.183859,0.183851,0.185874,0.190541,0.185086,...,0.184573,0.196732,0.183892,0.184279,0.198538,0.184664,0.184404,0.185136,0.186314,0.194963


In [37]:
#The no. of latent factors have been identified using trail and error

No_OF_FACTORS = 45

#SVD decomposes a single matrix to 3 smaller matrices.
U,sigma,Vt = svds(sparse_matrix_df,k=No_OF_FACTORS)

In [38]:
U.shape

(1142, 45)

In [39]:
Vt.shape

(45, 3087)

In [40]:
sigma=np.diag(sigma)
sigma.shape

(45, 45)

In [44]:
#reconstructing the matrix will provide a matrix which is the predicted matrix
predicted_df=np.dot(np.dot(U, sigma),Vt)
predicted_df.shape
print(predicted_df)

NameError: name 'U' is not defined

In [42]:
#normalization 
predicted_df_norm = (predicted_df - predicted_df.min()) / (predicted_df.max() - predicted_df.min())

In [43]:
predicted_df_norm

array([[0.15224894, 0.15153881, 0.14592642, ..., 0.15914075, 0.15204398,
        0.1506596 ],
       [0.15259751, 0.15216355, 0.1511322 , ..., 0.15353096, 0.15191105,
        0.15172064],
       [0.1496238 , 0.15395889, 0.14600975, ..., 0.15316396, 0.1520338 ,
        0.15247668],
       ...,
       [0.14613877, 0.15241711, 0.14947797, ..., 0.14172267, 0.15181403,
        0.15365278],
       [0.15214126, 0.15265674, 0.15228564, ..., 0.15081199, 0.15218411,
        0.1520007 ],
       [0.15598871, 0.1498799 , 0.16999901, ..., 0.14046943, 0.15212212,
        0.15134785]])

In [44]:
#matrix to datframe
pred_df=pd.DataFrame(predicted_df_norm,columns=df_pivot.columns,index=df_pivot.index).transpose()
pred_df

personId,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9196668942822132778,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,...,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9165571805999894845,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9.222795e+18,0.152249,0.152598,0.149624,0.153713,0.150722,0.151510,0.147206,0.154915,0.157095,0.151800,...,0.152265,0.151523,0.150684,0.153425,0.154970,0.152034,0.148804,0.146139,0.152141,0.155989
-9.216927e+18,0.151539,0.152164,0.153959,0.151489,0.151907,0.152173,0.151975,0.152353,0.152522,0.150998,...,0.153097,0.153484,0.151503,0.152343,0.159958,0.152102,0.152056,0.152417,0.152657,0.149880
-9.194573e+18,0.145926,0.151132,0.146010,0.152454,0.155751,0.151604,0.158468,0.152683,0.144435,0.158225,...,0.149867,0.171032,0.151572,0.153044,0.195277,0.156411,0.155215,0.149478,0.152286,0.169999
-9.192549e+18,0.153230,0.152664,0.149388,0.150208,0.149269,0.153274,0.154250,0.153991,0.158074,0.150605,...,0.152003,0.196363,0.152043,0.151348,0.161356,0.153334,0.151292,0.156315,0.147610,0.170860
-9.190738e+18,0.152479,0.151756,0.155393,0.152980,0.151844,0.152346,0.152983,0.151548,0.152943,0.151081,...,0.152646,0.149644,0.152416,0.151686,0.151004,0.152087,0.149507,0.154380,0.152243,0.151960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9.215261e+18,0.150674,0.151523,0.147646,0.153314,0.151747,0.154438,0.152132,0.147712,0.157619,0.161098,...,0.150189,0.152532,0.151882,0.149796,0.144543,0.151942,0.149363,0.151652,0.149802,0.143728
9.217155e+18,0.150903,0.152777,0.151926,0.151791,0.151873,0.151886,0.150961,0.153848,0.150706,0.151811,...,0.150451,0.145635,0.152007,0.151256,0.153895,0.152083,0.153006,0.149701,0.153763,0.149599
9.220446e+18,0.159141,0.153531,0.153164,0.153304,0.151023,0.154084,0.153476,0.153160,0.143702,0.146224,...,0.157856,0.126428,0.150730,0.153122,0.179446,0.152639,0.153141,0.141723,0.150812,0.140469
9.220446e+18,0.152044,0.151911,0.152034,0.151845,0.151922,0.151925,0.151880,0.151894,0.151470,0.152082,...,0.151959,0.152030,0.151909,0.152001,0.151575,0.151890,0.151986,0.151814,0.152184,0.152122


In [61]:
pred_df.columns

Int64Index([-9223121837663643404, -9212075797126931087, -9207251133131336884,
            -9199575329909162940, -9196668942822132778, -9188188261933657343,
            -9172914609055320039, -9156344805277471150, -9120685872592674274,
            -9109785559521267180,
            ...
             9105269044962898535,  9109075639526981934,  9135582630122950040,
             9137372837662939523,  9148269800512008413,  9165571805999894845,
             9187866633451383747,  9191849144618614467,  9199170757466086545,
             9210530975708218054],
           dtype='int64', name='personId', length=1142)

In [52]:
from keras.models import Sequential
from keras.layers import Dense,Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.losses import MeanSquaredLogarithmicError

In [68]:
#Deep model
model=Sequential()
model.add(Dense(64,input_dim=1142,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='linear'))

In [73]:
msle = MeanSquaredLogarithmicError()
model.compile(optimizer=Adam(learning_rate=0.1),loss=msle,metrics=[msle])

In [74]:
model.fit(pred_df,epochs=50,batch_size=10,verbose=0)

ValueError: in user code:

    File "c:\Users\sriku\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\sriku\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\sriku\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\sriku\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1025, in train_step
        self._validate_target_and_loss(y, loss)
    File "c:\Users\sriku\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 979, in _validate_target_and_loss
        raise ValueError(

    ValueError: Target data is missing. Your model was compiled with loss=<keras.losses.MeanSquaredLogarithmicError object at 0x0000025242B2E430>, and therefore expects target data to be provided in `fit()`.


# Recommendation Model for SVD

In [45]:
class CFRecommender:
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=None):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'weight'})

        recommendations_df = sorted_user_predictions[~sorted_user_predictions['contentId'].isin(items_to_ignore)].sort_values('weight', ascending = False).head(topn)

        recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                        left_on = 'contentId', 
                                                        right_on = 'contentId')[['weight', 'contentId', 'title','url']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(pred_df, articles_df)


In [46]:
#Recommend articles for the provided user using the recommendation system 
cf_recommender_model.recommend_items(-9016528795238256703, topn=10)

Unnamed: 0,weight,contentId,title,url
0,0.647504,8.657409e+18,Pull request first - Practical Blend,https://medium.com/practical-blend/pull-reques...
1,0.636974,-4.228415e+18,Up your app's sharing game with DirectShare - ...,https://medium.com/google-developers/up-your-a...
2,0.598907,7.534917e+18,The Importance of Code Reviews,https://www.sitepoint.com/the-importance-of-co...
3,0.59437,-4.029705e+18,,
4,0.561652,5.424502e+18,Sam Newman - Backends For Frontends,http://samnewman.io/patterns/architectural/bff/
5,0.535005,5.008914e+18,Post: Just Say mNo to Hungarian Notation,http://jakewharton.com/just-say-no-to-hungaria...
6,0.53334,-8.511291e+18,Dark Scrum,http://ronjeffries.com/articles/016-09ff/defense/
7,0.505031,4.117175e+18,Revolutionize business communication with Kast,http://blog.usekast.com/posts/kast-business-co...
8,0.470269,-6.156752e+18,The Broken Window Theory,https://blog.codinghorror.com/the-broken-windo...
9,0.470017,-4.084395e+18,Android Studio 2.2,http://android-developers.blogspot.com.br/2016...


In [78]:
from surprise.model_selection import cross_validate

In [86]:
from sklearn.metrics import mean_squared_error

rms = mean_squared_error(df_pivot.transpose(), pred_df, squared=False)
rms

0.19316828867423005

In [88]:
import math
rmse = math.sqrt(rms)
rmse

0.43950914515426187

# NMF

In [47]:
nmf_model = NMF(n_components=45)

In [48]:
nmf_model.fit(sparse_matrix_df)



NMF(n_components=45)

In [49]:
Theta = nmf_model.transform(sparse_matrix_df)       
M = nmf_model.components_.T          

# Making the predictions
NMF_pred = M.dot(Theta.T)              
NMF_pred = NMF_pred.T                    

In [50]:
NMF_pred

array([[2.02989317e-02, 2.44574867e-04, 6.98176316e-02, ...,
        3.75857333e-02, 8.60808474e-04, 1.26113280e-03],
       [3.04667249e-03, 1.08990288e-05, 8.23308137e-04, ...,
        1.25293287e-03, 2.86667461e-05, 1.26822865e-04],
       [0.00000000e+00, 2.39539508e-02, 1.73614149e-03, ...,
        4.24156231e-03, 1.31364820e-03, 2.60495859e-03],
       ...,
       [7.32799592e-04, 1.75942870e-02, 0.00000000e+00, ...,
        1.36825828e-02, 8.04850066e-04, 2.03785549e-03],
       [1.22350415e-02, 1.44471199e-02, 7.22562921e-03, ...,
        1.34572236e-02, 1.00435206e-03, 1.74029344e-03],
       [1.38730464e-02, 3.11529891e-04, 2.67740604e-01, ...,
        1.22240218e-02, 8.43690132e-05, 1.66727585e-04]])

In [51]:
NMF_pred_norm=(NMF_pred - NMF_pred.min()) / (NMF_pred.max() - NMF_pred.min())
NMF_pred_norm

array([[2.45849130e-03, 2.96215186e-05, 8.45591495e-03, ...,
        4.55217052e-03, 1.04256233e-04, 1.52741241e-04],
       [3.68995665e-04, 1.32002846e-06, 9.97144048e-05, ...,
        1.51748112e-04, 3.47195345e-06, 1.53600650e-05],
       [0.00000000e+00, 2.90116645e-03, 2.10271596e-04, ...,
        5.13713933e-04, 1.59101608e-04, 3.15497787e-04],
       ...,
       [8.87525240e-05, 2.13092010e-03, 0.00000000e+00, ...,
        1.65715670e-03, 9.74788681e-05, 2.46813482e-04],
       [1.48183872e-03, 1.74975309e-03, 8.75127166e-04, ...,
        1.62986248e-03, 1.21641416e-04, 2.10774456e-04],
       [1.68022458e-03, 3.77307308e-05, 3.24272211e-02, ...,
        1.48050408e-03, 1.02182956e-05, 2.01930980e-05]])

In [52]:
nmf_pred_df=pd.DataFrame(NMF_pred_norm,columns=df_pivot.columns,index=df_pivot.index).transpose()
nmf_pred_df

personId,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9196668942822132778,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,...,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9165571805999894845,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9.222795e+18,0.002458,0.000369,0.000000,0.001340,0.000000e+00,0.000020,0.000256,1.530109e-03,0.000992,0.002656,...,0.001799,0.000584,0.000299,6.592389e-04,9.238739e-07,1.622176e-08,0.000163,0.000089,0.001482,0.001680
-9.216927e+18,0.000030,0.000001,0.002901,0.000085,2.738086e-06,0.000225,0.000134,1.327083e-04,0.000096,0.001214,...,0.001468,0.001386,0.000056,5.337144e-05,1.104746e-03,2.089765e-06,0.000574,0.002131,0.001750,0.000038
-9.194573e+18,0.008456,0.000100,0.000210,0.000023,8.328046e-03,0.000134,0.002664,9.115093e-06,0.000031,0.000322,...,0.000153,0.008277,0.000002,8.749196e-04,9.203585e-03,6.699600e-03,0.000412,0.000000,0.000875,0.032427
-9.192549e+18,0.003083,0.000848,0.000243,0.000018,9.114610e-05,0.001349,0.002815,2.967718e-03,0.001110,0.003476,...,0.002095,0.052480,0.000084,6.363233e-04,2.916803e-03,8.898847e-04,0.002621,0.000399,0.001848,0.018665
-9.190738e+18,0.001127,0.000010,0.001061,0.000630,7.161104e-06,0.000093,0.000664,5.587615e-05,0.000052,0.001763,...,0.000258,0.000471,0.000030,1.912360e-05,0.000000e+00,7.345153e-06,0.000035,0.000848,0.000621,0.001631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9.215261e+18,0.001255,0.000009,0.000006,0.000542,6.569646e-04,0.001526,0.000045,2.565380e-07,0.002781,0.006194,...,0.000003,0.000815,0.000394,1.398750e-07,3.310363e-03,5.991944e-04,0.000415,0.001385,0.000144,0.002215
9.217155e+18,0.000943,0.000247,0.000000,0.000222,0.000000e+00,0.000236,0.000072,1.338440e-03,0.000051,0.000540,...,0.000033,0.000037,0.000357,2.286442e-06,6.753231e-04,0.000000e+00,0.000160,0.000005,0.000696,0.000002
9.220446e+18,0.004552,0.000152,0.000514,0.001415,1.066739e-05,0.001275,0.001199,7.556088e-04,0.003476,0.004793,...,0.001599,0.002338,0.000132,1.545923e-04,7.758749e-03,2.180781e-04,0.000961,0.001657,0.001630,0.001481
9.220446e+18,0.000104,0.000003,0.000159,0.000007,1.867115e-08,0.000009,0.000009,6.138827e-06,0.000022,0.000108,...,0.000013,0.000138,0.000003,3.888915e-06,1.422865e-04,4.410011e-06,0.000009,0.000097,0.000122,0.000010


In [53]:
class NMFRecommender:
    
    def __init__(self, nmf_predictions_df, items_df=None):
        self.nmf_predictions_df = nmf_predictions_df
        self.items_df = items_df
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=None):
        # Get and sort the user's predictions
        sorted_user_predictions = self.nmf_predictions_df[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'weight'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['contentId'].isin(items_to_ignore)].sort_values('weight', ascending = False).head(topn)

        recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                        left_on = 'contentId', 
                                                        right_on = 'contentId')[['weight', 'contentId', 'title','url']]


        return recommendations_df
    
nmf_recommender_model = NMFRecommender(nmf_pred_df, articles_df)


In [54]:
nmf_recommender_model.recommend_items(-9016528795238256703, topn=10)

Unnamed: 0,weight,contentId,title,url
0,0.552759,-4.228415e+18,Up your app's sharing game with DirectShare - ...,https://medium.com/google-developers/up-your-a...
1,0.526635,8.657409e+18,Pull request first - Practical Blend,https://medium.com/practical-blend/pull-reques...
2,0.517305,7.534917e+18,The Importance of Code Reviews,https://www.sitepoint.com/the-importance-of-co...
3,0.512872,5.424502e+18,Sam Newman - Backends For Frontends,http://samnewman.io/patterns/architectural/bff/
4,0.468531,-8.511291e+18,Dark Scrum,http://ronjeffries.com/articles/016-09ff/defense/
5,0.464776,5.008914e+18,Post: Just Say mNo to Hungarian Notation,http://jakewharton.com/just-say-no-to-hungaria...
6,0.460577,4.117175e+18,Revolutionize business communication with Kast,http://blog.usekast.com/posts/kast-business-co...
7,0.387523,-4.084395e+18,Android Studio 2.2,http://android-developers.blogspot.com.br/2016...
8,0.378881,-8.728075e+18,Soylent's new drink will replace your breakfas...,http://www.theverge.com/2016/8/9/12414566/soyl...
9,0.374094,-7.555896e+18,ConstraintLayout - Part 1,https://blog.stylingandroid.com/constraintlayo...


In [89]:
rms = mean_squared_error(df_pivot.transpose(), nmf_pred_df, squared=False)
rmse = math.sqrt(rms)
rmse

0.3420667157632695

# SGD

In [55]:
#check how sparse our data is 
sparsity = float(len(sparse_matrix_df.nonzero()[0]))
sparsity /= (sparse_matrix_df.shape[0] * sparse_matrix_df.shape[1])
sparsity *= 100
print('{:.2f}%'.format(sparsity))

0.89%


In [56]:
def train_test_split(eventWeight):
    train=eventWeight.copy()
    
    
    return train


In [57]:
train = train_test_split(sparse_matrix_df)

In [58]:
pred_df

personId,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9196668942822132778,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,...,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9165571805999894845,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9.222795e+18,0.152249,0.152598,0.149624,0.153713,0.150722,0.151510,0.147206,0.154915,0.157095,0.151800,...,0.152265,0.151523,0.150684,0.153425,0.154970,0.152034,0.148804,0.146139,0.152141,0.155989
-9.216927e+18,0.151539,0.152164,0.153959,0.151489,0.151907,0.152173,0.151975,0.152353,0.152522,0.150998,...,0.153097,0.153484,0.151503,0.152343,0.159958,0.152102,0.152056,0.152417,0.152657,0.149880
-9.194573e+18,0.145926,0.151132,0.146010,0.152454,0.155751,0.151604,0.158468,0.152683,0.144435,0.158225,...,0.149867,0.171032,0.151572,0.153044,0.195277,0.156411,0.155215,0.149478,0.152286,0.169999
-9.192549e+18,0.153230,0.152664,0.149388,0.150208,0.149269,0.153274,0.154250,0.153991,0.158074,0.150605,...,0.152003,0.196363,0.152043,0.151348,0.161356,0.153334,0.151292,0.156315,0.147610,0.170860
-9.190738e+18,0.152479,0.151756,0.155393,0.152980,0.151844,0.152346,0.152983,0.151548,0.152943,0.151081,...,0.152646,0.149644,0.152416,0.151686,0.151004,0.152087,0.149507,0.154380,0.152243,0.151960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9.215261e+18,0.150674,0.151523,0.147646,0.153314,0.151747,0.154438,0.152132,0.147712,0.157619,0.161098,...,0.150189,0.152532,0.151882,0.149796,0.144543,0.151942,0.149363,0.151652,0.149802,0.143728
9.217155e+18,0.150903,0.152777,0.151926,0.151791,0.151873,0.151886,0.150961,0.153848,0.150706,0.151811,...,0.150451,0.145635,0.152007,0.151256,0.153895,0.152083,0.153006,0.149701,0.153763,0.149599
9.220446e+18,0.159141,0.153531,0.153164,0.153304,0.151023,0.154084,0.153476,0.153160,0.143702,0.146224,...,0.157856,0.126428,0.150730,0.153122,0.179446,0.152639,0.153141,0.141723,0.150812,0.140469
9.220446e+18,0.152044,0.151911,0.152034,0.151845,0.151922,0.151925,0.151880,0.151894,0.151470,0.152082,...,0.151959,0.152030,0.151909,0.152001,0.151575,0.151890,0.151986,0.151814,0.152184,0.152122


In [59]:
def predictions(P,Q):
    return np.dot(P.T,Q)

In [60]:
lmbda = 0.4 # Regularization parameter
k = 4 #Factor parameter
m, n = train.shape  # Number of users and items

n_epochs = 100  # Number of epochs
alpha=0.01  # Learning rate

P = 3 * np.random.rand(k,m) # Latent user feature matrix
Q = 3 * np.random.rand(k,n) # Latent movie feature matrix

In [61]:
train.shape

(1142, 3087)

In [62]:


#Function to find root-mean-square-error
def rmse(predictions, ground_truth):
    predictions = predictions[ground_truth.nonzero()].flatten()   #Predicted values
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()  #Original values
    return sqrt(mean_squared_error(predictions, ground_truth))


In [63]:
users, items = train.nonzero()
for u, i in zip(users, items):
    error = train[u, i] - predictions(P[:,u],Q[:,i])
    P[:, u] += alpha * (error * Q[:, i] - lmbda * P[:, u])
    Q[:, i] += alpha * (error * P[:, u] - lmbda * Q[:, i])
        


In [64]:
SGD_prediction=predictions(P,Q) 
SGD_prediction

array([[-1.42634802,  0.2187594 ,  1.87414639, ...,  0.67552259,
         2.27041456,  2.62544732],
       [ 3.6976778 ,  2.95260781,  0.57646901, ...,  3.87677402,
         3.52001275,  4.74848944],
       [ 2.26622129,  2.62121356,  0.66057719, ...,  2.87004323,
         3.56442806,  4.15900035],
       ...,
       [ 3.03423752,  3.26200663,  2.82742092, ...,  5.34367758,
         6.04565058,  8.21793623],
       [ 3.46092183,  6.64136618,  0.29718354, ...,  9.72747406,
         4.5372478 ,  9.00974335],
       [ 0.40085037,  1.74560376, -0.36210893, ...,  1.73061142,
         1.07921922,  1.54662712]])

In [65]:
SGD_pred_norm=(SGD_prediction - SGD_prediction.min()) / (SGD_prediction.max() - SGD_prediction.min())

In [66]:
sgd_pred_df=pd.DataFrame(SGD_pred_norm,columns=pred_df.index,index=pred_df.columns).transpose()

In [67]:
sgd_pred_df

personId,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9196668942822132778,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,...,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9165571805999894845,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9.222795e+18,0.076205,0.258820,0.207805,0.110497,0.215036,0.202763,0.137377,0.196760,0.253708,0.084369,...,0.313076,0.255294,0.241201,0.120808,0.217366,0.250012,0.146500,0.235176,0.250383,0.141325
-9.216927e+18,0.134835,0.232267,0.220456,0.300327,0.348716,0.191657,0.120951,0.125004,0.179883,0.162403,...,0.146843,0.165654,0.175655,0.245669,0.172082,0.397518,0.217807,0.243294,0.363731,0.189250
-9.194573e+18,0.193832,0.147584,0.150581,0.106036,0.214317,0.139729,0.201358,0.190184,0.179232,0.178557,...,0.152202,0.198638,0.232749,0.220857,0.151925,0.139067,0.194581,0.227805,0.137630,0.114134
-9.192549e+18,0.159379,0.145810,0.148251,0.141596,0.190516,0.134861,0.158708,0.148014,0.153770,0.153582,...,0.133375,0.159432,0.171721,0.185890,0.137569,0.163682,0.166323,0.182679,0.155336,0.129367
-9.190738e+18,0.065381,0.306196,0.250442,0.210689,0.309195,0.229487,0.114877,0.174403,0.265293,0.090518,...,0.310119,0.253554,0.233784,0.162629,0.230118,0.389211,0.176735,0.268522,0.369502,0.178035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9.215261e+18,0.178628,0.216605,0.212758,0.228793,0.334878,0.168120,0.176110,0.157418,0.206533,0.177884,...,0.159552,0.207425,0.221806,0.278277,0.165543,0.318987,0.229645,0.277459,0.283967,0.160876
9.217155e+18,0.248557,0.191084,0.185565,0.268526,0.456717,0.259907,0.218349,0.278489,0.168261,0.307132,...,0.118735,0.202789,0.372606,0.340704,0.241071,0.358522,0.344168,0.342542,0.398156,0.157269
9.220446e+18,0.151114,0.265203,0.229324,0.304275,0.442260,0.280686,0.148130,0.232733,0.204291,0.220486,...,0.202691,0.212826,0.315481,0.265035,0.257987,0.454516,0.288553,0.317482,0.473717,0.188716
9.220446e+18,0.207954,0.252489,0.254072,0.239090,0.393919,0.148624,0.212401,0.149305,0.259465,0.178081,...,0.179660,0.258182,0.240770,0.351516,0.155496,0.361892,0.253862,0.342500,0.288742,0.165501


In [68]:
class SGDRecommender:
    
    def __init__(self, sgd_predictions_df, items_df=None):
        self.sgd_predictions_df = sgd_predictions_df
        self.items_df = items_df
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=None):
        # Get and sort the user's predictions
        sorted_user_predictions = self.sgd_predictions_df[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'weight'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['contentId'].isin(items_to_ignore)].sort_values('weight', ascending = False).head(topn)

        recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                        left_on = 'contentId', 
                                                        right_on = 'contentId')[['weight', 'contentId', 'title','url']]


        return recommendations_df
    
sgd_recommender_model = SGDRecommender(sgd_pred_df, articles_df)


In [69]:
sgd_recommender_model.recommend_items(-9016528795238256703, topn=10)

Unnamed: 0,weight,contentId,title,url
0,0.196536,5.135372e+18,Dries Buytaert,http://buytaert.net/acquia-retrospective-2016
1,0.195837,6.800966e+18,Sponsored: Five reasons why retailers should o...,https://www.retail-week.com/analysis/retail-vo...
2,0.195637,7.727369e+18,Demystifying Docker overlay networking,http://blog.nigelpoulton.com/demystifying-dock...
3,0.193428,5.701874e+18,,
4,0.192882,-8.676747e+16,Automação de Marketing: o seu negócio está pro...,https://www.linkedin.com/pulse/automa%C3%A7%C3...
5,0.192176,7.790568e+18,US Federal Judge Rules Bitcoin is Money in Coi...,http://www.coindesk.com/us-judge-bitcoin-money...
6,0.191772,1.729893e+18,5 motivos para não escrever 5 motivos,https://medium.com/@vilucas/5-motivos-para-n%C...
7,0.191424,-6.19296e+18,Google's official Project Wing logo revealed,http://phandroid.com/2016/04/13/googles-projec...
8,0.191421,-4.866987e+18,Endered e Small World fazeme parceria para rem...,http://convergecom.com.br/tiinside/webinside/e...
9,0.191343,1.6533e+18,A estratégia de vendas da Amazon,https://endeavor.org.br/estrategia-de-vendas-d...


In [91]:
rms = mean_squared_error(df_pivot.transpose(), sgd_pred_df, squared=False)
rmse = math.sqrt(rms)
rmse

0.5531337814463596

# Conclusion

On comparing all the factorization approaches, It has been observed that SVD provides a better recommendation

In [70]:
df[df['personId']==6680762707204337829]

Unnamed: 0,eventType,contentId,personId,eventWeight
26881,VIEW,1.317782e+18,6680762707204337829,1.0
26882,VIEW,1.317782e+18,6680762707204337829,1.0
34919,LIKE,-9.176144e+18,6680762707204337829,2.0
34922,BOOKMARK,-9.176144e+18,6680762707204337829,2.5
34925,BOOKMARK,-9.176144e+18,6680762707204337829,2.5
34926,VIEW,-9.176144e+18,6680762707204337829,1.0
34928,VIEW,-9.176144e+18,6680762707204337829,1.0
34932,LIKE,-9.176144e+18,6680762707204337829,2.0
60939,VIEW,3.293247e+18,6680762707204337829,1.0


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72312 entries, 0 to 72311
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   eventType    72312 non-null  object 
 1   contentId    72312 non-null  float64
 2   personId     72312 non-null  int64  
 3   eventWeight  72312 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 2.2+ MB


In [72]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 5, show = True):
    
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items), item_features=books_metadata_csr))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index).sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    