In [None]:
# List Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from timeit import default_timer


In [None]:
df_movies = pd.read_csv('../data/ml-20m/movies.csv')
df_movies.head()

In [None]:
df_movies.info()

In [None]:
df_ratings = pd.read_csv('../data/ml-20m/ratings.csv',sep=',',
                         names= ['userId','movieId','rating','timestamp'],
                         dtype= {'userId':np.int32, 'movieId':np.int32, 'rating':np.float64, 'timestamp':np.int64}, 
                         header=0, skipinitialspace=True, error_bad_lines=False, engine='python')
df_ratings.head()

In [None]:
df_ratings.info()

In [None]:
# How does the no.of ratings per user vary?

CntRatingsPerUser = df_ratings[['userId', 'movieId']].groupby('userId').count()
CntRatingsPerUser.columns=['no_ratings']

plt.scatter(CntRatingsPerUser.index, CntRatingsPerUser.no_ratings, edgecolor='black')
plt.xlim(0,len(CntRatingsPerUser.index))
plt.ylim(0,)
plt.title('No.Of.Ratings')
plt.xlabel('userId')
plt.ylabel('Number of ratings given')
plt.show()

In [None]:
# Create train, test and validation sets
df_train_ratings, df_test_ratings = train_test_split(df_ratings, test_size=0.2, 
                                                     random_state=1, stratify=df_ratings['userId'])
df_train_ratings, df_validation_ratings = train_test_split(df_train_ratings, test_size=0.2, 
                                                           random_state=1, stratify=df_train_ratings['userId'])
df_train_ratings.info()
df_test_ratings.info()
df_validation_ratings.info()

In [None]:
# Create ratings sparse matrix

lst_unique_users = list(sorted(df_train_ratings.userId.unique()))
lst_unique_movies = list(sorted(df_train_ratings.movieId.unique()))

# Encode to factors
encodedUserId = df_train_ratings.userId.astype(pd.api.types.CategoricalDtype(categories = lst_unique_users)).cat.codes
encodedMovieId = df_train_ratings.movieId.astype(pd.api.types.CategoricalDtype(categories = lst_unique_movies)).cat.codes

lst_ratings = df_train_ratings['rating'].tolist()

mat_ratings = csr_matrix((lst_ratings, (encodedMovieId, encodedUserId)), shape=(len(lst_unique_movies), len(lst_unique_users)))
mat_ratings.shape

In [None]:
# Remove User Bias

mat_ratings = normalize(mat_ratings, norm='l1', copy=False, axis=0)
sum(mat_ratings[:,0].toarray()[:,0])

In [None]:
# calculate similarity matrix

mat_movie_similarity = cosine_similarity(mat_ratings, dense_output=False)
mat_movie_similarity.shape

In [None]:
# Find k most silimar for an item
kSimilar={}
def fnKSimilar(pMovieIdx, pk):
    if (pMovieIdx in kSimilar):
        return kSimilar[pMovieIdx]
    pMovieRow = mat_movie_similarity.getrow(pMovieIdx)
    vSortedMovieTuples = sorted(zip(pMovieRow.indices, pMovieRow.data), key=lambda x: (x[1]), reverse=True)
    vKSimilar = [i for i,j in vSortedMovieTuples[1:pk+1]]
    kSimilar[pMovieIdx] = vKSimilar
    return vKSimilar

vFnKSimilar = np.vectorize(fnKSimilar, doc='Vectorized function to return k similar movie tuples')

In [93]:
# Calculate the user pool
k = 30
C = {}

#print(fnKSimilar(mat_movie_similarity.getrow(0), k))
vUserIdx = pd.Series([i for i in range(0,mat_ratings.shape[1])])

def fnCalcUserPool(pUserIdx):
    if (pUserIdx%1000==0):
        print(pUserIdx)
    vUserCol = pd.Series(mat_ratings.getcol(pUserIdx).toarray()[:,0],dtype='float64',index= [i for i in range(0,mat_ratings.shape[0])])
    vUserCol = vUserCol[vUserCol>0]
    vUserCol = vUserCol.index.to_series()
    vSimElements = vUserCol.apply(vFnKSimilar, args=(k,))
    
    def fnFindUnique(pVec):
        if(pUserIdx not in C):
            C[pUserIdx] = pVec
        else: 
            C[pUserIdx] = pd.Series(np.append(C[pUserIdx], np.setdiff1d(pVec, C[pUserIdx])))
    vfnFindUnique = np.vectorize(fnFindUnique)
    vSimElements.apply(vfnFindUnique)
    return C[pUserIdx]
    
#start_time = default_timer()
#g = vUserIdx.apply(fnCalcUserPool)
#end_time = default_timer() - start_time
vfnCalcUserPool = np.vectorize(fnCalcUserPool)
a = pd.Series([i for i in range(0,2)]).apply(fnCalcUserPool)
#a = vfnCalcUserPool([0,1])

0


In [119]:
# Predict rating for ui, mj

def fnPredictSimilarity(pMovieId, pUserId):
    pUserIdx = pUserId - 1
    pMovieIdx = pMovieId - 1
    vUserCol = pd.Series(mat_ratings.getcol(pUserIdx).toarray()[:,0],dtype='float64',index= [i for i in range(0,mat_ratings.shape[0])])
    vUserCol = vUserCol[vUserCol>0]
    vSimilarity = mat_movie_similarity[pMovieIdx,vUserCol.index.tolist()]
    vSortedSimilarity = sorted(zip(vSimilarity.indices, vSimilarity.data), key=lambda x: (x[1]), reverse=True)
    vSortedKSimilarity = [j for i,j in vSortedSimilarity[1:k+1]]
    return np.sum(vSortedKSimilarity)
    

# Predict TopN for given user
N = 10
def fnTopN(pUserId):
    pUserIdx = pUserId - 1;
    if(pUserIdx not in C):
        vUserPool = pd.Series([pUserIdx]).apply(fnCalcUserPool)
    else: vUserPool = C[pUserIdx]
    vfnPredictSimilarity = np.vectorize(fnPredictSimilarity)
    vPoolSimilarity = (vUserPool+1).apply(vfnPredictSimilarity, args=(pUserId,))
    vSortedPoolSimilarity = vPoolSimilarity.sort_values(ascending=False)
    return vSortedPoolSimilarity[0:N].index+1

In [120]:
print(fnPredictSimilarity(1199,1))
predUser = fnTopN(1)

1.1179831000238813


In [None]:
df_train_ratings[df_train_ratings['userId']==1]

In [77]:
type(a)

pandas.core.series.Series

In [92]:
C

{0: array([ 496,  360,  314, ..., 2052, 9278, 2374]),
 1: array([  724,   767,   139,   640,     2,     4,  1052,    93,     6,
           35,   490,   721,   773,   775,    16,    24,     0,   789,
          696,   372,   138,   792,   645,     5,   602,    31,   102,
           13,    78,   621,   299,   442,   211,   413,   251,   303,
          318,   304,   305,   465,   343,    34,   306,   269,   505,
          615,   212,   533,  3961,   229,   388,   448,   152,   197,
          464,   296,   262,   497,   246,   244,  1184,  1171,  1326,
         1173,   843,  2486,    61,  1362,  1212,   352,   453,   583,
          373,   586,   584,   148,   376,   450,   360,   496,   587,
          163,   582,   108,   151,   345,   363,   325,   340,   313,
          293,   589,   591,   581,   289,   430,   183,     9,   206,
         1188,  1175,   907,  1174,   737,  1180,   257,  1242,  1113,
         1564,  1182,  1684,  1224,  1067,  1075,  1926,  1480,  1169,
         2830,  1263

In [121]:
predUser

Int64Index([100, 105, 101, 108, 50, 131, 45, 137, 70, 259], dtype='int64')

In [107]:
C

{0: 0         496
 1         360
 2         314
 3         363
 4         580
 5         581
 6         582
 7         476
 8          10
 9         589
 10          9
 11        535
 12        148
 13        591
 14        373
 15        346
 16        206
 17        586
 18        158
 19         33
 20        313
 21        156
 22        376
 23        151
 24        584
 25        406
 26        352
 27        325
 28        340
 29         20
         ...  
 1609    12100
 1610    10491
 1611     7751
 1612     8578
 1613     7223
 1614     8111
 1615    11762
 1616     7265
 1617    11584
 1618    13255
 1619    11020
 1620     6843
 1621    12338
 1622    19784
 1623    13325
 1624     9826
 1625     7040
 1626     7206
 1627    22525
 1628    11393
 1629    11141
 1630     6944
 1631     7851
 1632     8520
 1633     7769
 1634    18229
 1635     3912
 1636     2052
 1637     9278
 1638     2374
 Length: 1639, dtype: int32, 1: 0        724
 1        767
 2        139
 3       