In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
from scipy.stats.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

** All code here is based on the models that were done in class in Excel**

## Collaborative Filtering with Pearson Similarity
Below, we loaded the data and computed the average user ratings matrix, average item ratings matrix as well as the pearson similarity matrix for users.

In [2]:
df = pd.read_excel("movie_ratings_inclass.xlsx", sheetname=0, index_col=1, header = 2)
df2 = df.drop(['User','Avg'], axis=1).iloc[0:20]
avguserrating = df2.mean(axis=1).values
avgitemrating = df2.mean(axis=0).values
df3 = df2.T.convert_objects(convert_numeric=True)
pearsonsimilarities = df3.corr()

The following code replaces the diagonal of the pearson similarity matrix with 0's and filled any NA's in the matrix with 0's. Then, a list is created where for each user, we sort the list of nearest neighbors where the first item is its nearest neighbour. Then, we shifted its own index to the back of this list so that we do not take the user as its own neighbour ever.

In [3]:
for i in pearsonsimilarities.index:
    pearsonsimilarities.loc[i, i] = 0.0
pearsonsimilarities = pearsonsimilarities.fillna(0)

order = np.argsort(-pearsonsimilarities.values, axis=1)
# Moving its own index to the back so that it would not be its own nearest neighbor
for index, element in enumerate(order):
    element = element[element != index]
    element = np.append(element, index)
    order[index] = element

# Changing pandas dataframes to numpy to make it easier
pearsonsimilaritiesnp = pearsonsimilarities.values
df2np = df2.values

 This gets the imputed matrix of predicted ratings. The next code finds the k nearest neighbours of those who rated the movie and bases the predicted ratings on their average weighted rating. Example: user 1 has users 5,7, 8 as its 3 nearest neighbours by that order. If I predict ratings of user 1 for movie 6 based on the 2 nearest neighbors, and only user 5 and 8 has rated that movie, I consider users 5 and 8's ratings even though user 8 is in reality, the third nearest neighbor overall for user 1.  
Again, the following code will return both the prediction matrix and the sum of square errors between the values in prediction matrix minus the values of actual ratings for those elements that were actually rated. We will then loop it for 1-20 k nearest neighbors to find the optimal k

In [8]:
def userating1(k):
    imputed = np.zeros((20,20))
    for a in range(0,20):
        for i in range (0,20):
            list2 = []
            list3 = []
            count1 = 0
            count2 = 0
            while (count1 < 20):
                m = order[a][count1]
                c = pearsonsimilaritiesnp[a][m]
                d = df2np[m][i]
                if np.isnan(d) == False:
                    # list 3 appends all the absolute weighted similarities
                    list3.append(abs(pearsonsimilaritiesnp[a][m]))
                    # list 2 appends the weighted similarity*(user rating for movie i- user's avg rating)
                    pain1 = c*(d-avguserrating[m])
                    list2.append(pain1)
                    count2 +=1
                count1+= 1
                # If we get the correct amount of nearest neighbors, this stops the loop
                if count2 == k:
                    break
            # This gives the prediction for user a for movie i
            score = sum(list2)/sum(list3)
            imputed[a][i] = avguserrating[a] + score
    # Calculating Errors
    a = imputed-df2
    b = a.multiply(a, fill_value  = 0)
    SSE = np.nansum(b.values)
    return(pd.DataFrame(imputed), SSE)

In [7]:
sselist1 = []
for i in range(1,21):
    a = userating1(i)[1]
    sselist1.append(a)
knnnumber = range(1,21)
ssedata = pd.DataFrame(
    {'knn': knnnumber,
     'sse': sselist1,
    }, columns=['knn','sse'])
ssedata

Unnamed: 0,knn,sse
0,1,358.924667
1,2,255.968404
2,3,236.974243
3,4,228.657748
4,5,224.959389
5,6,229.24979
6,7,224.633608
7,8,217.034198
8,9,209.69051
9,10,197.257359


Looking at this, it seems that the optimal k for knn is 15, which we will use.

### Predictions

In [9]:
predictionmatrix = userating1(15)[0]
predictions = pd.DataFrame(predictionmatrix)
# Had to plus 1 each way since user and movies start from index 1
predictions.index = predictions.index + 1
predictions.columns = predictions.columns + 1
predictions

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
1,2.125887,3.809875,2.468465,2.997126,3.664587,3.102324,2.656372,2.673367,2.317369,3.521447,3.044063,2.934637,4.040158,2.998709,2.57367,4.267142,1.761662,4.332319,3.593288,2.319217
2,3.988588,2.915932,3.115639,2.133721,3.057883,2.432871,2.117643,2.580919,3.000151,3.63267,3.096949,2.527106,2.173434,2.532439,2.473829,3.200056,3.763495,1.881988,2.68988,1.752868
3,1.912954,2.094736,2.369515,3.683575,1.987374,2.466174,3.285082,1.978337,3.130212,1.539215,3.272523,2.279263,2.603694,2.982542,2.18674,1.321762,2.126969,2.364964,1.98824,3.365319
4,3.32694,2.339844,2.388143,2.70054,2.600596,1.627711,2.937618,2.793358,3.117819,2.315239,2.864993,3.63611,1.807446,2.924626,3.392369,2.389743,3.752023,1.487925,2.007544,3.683318
5,3.578885,2.5014,3.047309,2.395425,3.392122,2.622037,2.488853,3.083817,3.72331,2.617868,3.964756,3.011088,3.098395,3.409873,3.955804,2.317867,3.497197,3.571934,3.677112,3.248757
6,2.901324,4.029951,3.009891,3.663975,3.313767,3.323299,3.158363,2.218895,2.993365,4.031558,3.873461,2.121943,4.000037,2.838593,1.655562,3.619244,2.748993,3.299288,2.454697,2.453458
7,2.567399,1.529158,3.11306,2.17578,2.513765,3.18591,2.764587,3.567647,2.323389,2.179307,1.930943,2.658001,2.259856,2.175,2.778515,1.777193,2.325736,2.792869,3.540116,2.824637
8,3.514328,3.031754,1.988014,1.799454,2.775932,2.795472,2.537641,3.030631,2.282562,2.746412,2.828054,2.144753,2.837773,2.326532,3.466282,3.619402,1.919753,3.713291,4.156083,2.86003
9,3.65076,1.960428,3.351168,3.515065,2.621152,3.134144,3.872102,3.731547,3.544526,2.412741,2.995182,3.572298,2.399526,3.780511,4.530988,1.999423,3.485569,2.459768,3.110284,4.022902
10,3.895273,2.232604,3.870131,4.047578,2.657119,3.249615,3.680903,3.436299,4.137508,3.085499,3.17607,3.475341,2.360364,3.587308,3.287793,2.135742,4.102399,1.827758,2.260609,3.852409


## Gradient Descent

The code below is tuning for optimal lambda and gamma and the number of iterations. Errors were calculated for each iteration and then the minimum error and its corresponding number of iterations was found for each lambda, gamma pair. Afterwards, we find the optimal lambda, gamma pair that gives the lowest error.  

It works by initializing the latent factor matrices with ones. Then, for each element in the original matrix, it updates the movie matrix and then calculates the squared error between the actual and prediction. Then we update the latent factor matrix given the error, lambda (regularization term) and gamma (value determining the rate of approaching the minimum or learning rate). If gamma is large, we might skip the optimal solution and end up oscillating around it. If it is too small, we might need too many iterations to converge to the optimal value.

In [None]:
tuningDF = pd.DataFrame({'lambda': [0], 'gamma': [0], 
                         'train_error_index': [0], 'train_error': [0]})
lambdav = np.arange(0, 10, 0.01)
gamma= np.arange(0, 0.6, 0.01)
#lambdav = np.arange(0, 20, 0.1)
#gamma= np.arange(0, 1, 0.2)
for lam in lambdav:
    for gam in gamma:
        df4 = df2
        df4 = df4.fillna(0)
        # Changing pandas dataframes to numpy to make it easier
        df4 = df4.values
        df5 = df4.copy()
        def sse(true):
            return np.sum(true**2)
        P = np.ones((20, 2))
        Q = np.ones((2, 20))
        
        train_errors = []
        num = 0
        while num < 60:
            S = P.copy()
            R = Q.copy()
            errorlist = []
            for a in range(0,20):
                for b in range (0,20):
                    if df5[a,b] != 0:
                        df4[a,b] = df5[a,b] - np.dot(R[:,b], S[a, :].T)
                        errorlist.append((df4[a,b])**2)
                    else:
                        df4[a,b] = 0
            
            for c in range(0,20):
                P[c, 0] += gam * (np.dot(df4[c, :], R[0,:]) - lam * P[c,0])
                P[c, 1] += gam * (np.dot(df4[c, :], R[1,:]) - lam * P[c,1])
                Q[0, c] += gam * (np.dot(df4[:, c], S[:,0]) - lam * Q[0, c]) 
                Q[1, c] += gam * (np.dot(df4[:, c], S[:,1]) - lam * Q[1, c]) 
            num += 1
            train_errors.append(sum(errorlist))
        minerrorindex = train_errors.index(min(train_errors))
        minerror = min(train_errors)
        var_2 = pd.DataFrame({'lambda': [lam], 'gamma': [gam], 
                              'train_error_index': [minerrorindex], 'train_error': [minerror]})
        tuningDF = pd.concat([tuningDF, var_2], ignore_index=True)

In [222]:
p = tuningDF.loc[tuningDF.train_error>0, ]
p[p['train_error'] == min(p['train_error'])]

Unnamed: 0,gamma,lambda,train_error,train_error_index
3,0.02,0.0,420.873906,59


Running with optimal values of lambda, gamma and number of iterations to get the predictions

In [247]:
df4 = df2
df4 = df4.fillna(0)
# Changing pandas dataframes to numpy to make it easier
df4 = df4.values
df5 = df4.copy()
def sse(true):
    return np.sum(true**2)
        
lambdav = 0
gamma=0.02
# Latent factors for users
P = np.ones((20, 2))
# Latent factors for movie
Q = np.ones((2, 20))



train_errors = []
num = 0
while num < 60:
    S = P.copy()
    R = Q.copy()
    errorlist = []
    for a in range(0,20):
        for b in range (0,20):
            if df5[a,b] != 0:
                df4[a,b] = df5[a,b] - np.dot(R[:,b], S[a, :].T)
                errorlist.append((df4[a,b])**2)
            else:
                df4[a,b] = 0
                
    for c in range(0,20):
        np.dot(df4[0, :], R[0,:])
        P[c, 0] += gamma * (np.dot(df4[c, :], R[0,:]) - lambdav * P[c,0])
        P[c, 1] += gamma * (np.dot(df4[c, :], R[1,:]) - lambdav * P[c,1])
        Q[0, c] += gamma * (np.dot(df4[:, c], S[:,0]) - lambdav * Q[0, c]) 
        Q[1, c] += gamma * (np.dot(df4[:, c], S[:,1]) - lambdav * Q[1, c]) 
    num += 1
    train_errors.append(sum(errorlist))

**Prediction Matrix**

In [253]:
predictions = np.dot(P, Q)
predictions = pd.DataFrame(predictions)
predictions.index = predictions.index + 1
predictions.columns = predictions.columns + 1
predictions

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
1,3.237772,2.98597,2.828368,3.636431,3.187525,3.028465,2.924947,3.033753,3.208679,2.710539,3.088433,1.943864,2.831428,3.457131,3.243509,3.05701,3.136673,2.95721,3.050227,3.19564
2,2.99643,2.763397,2.617542,3.365373,2.949928,2.802724,2.706922,2.807618,2.969505,2.508497,2.858222,1.798969,2.620375,3.199437,3.001739,2.829142,2.902866,2.736781,2.822864,2.957438
3,2.679833,2.471421,2.340978,3.009794,2.638244,2.506594,2.420914,2.510971,2.655753,2.243454,2.556228,1.608893,2.343511,2.86139,2.68458,2.53022,2.596155,2.447617,2.524605,2.64496
4,2.870047,2.646843,2.50714,3.223429,2.825507,2.684512,2.59275,2.689199,2.844258,2.402694,2.737669,1.723092,2.509853,3.064492,2.875132,2.709815,2.78043,2.621349,2.703802,2.832699
5,3.453329,3.184762,3.016668,3.878528,3.399736,3.230087,3.119676,3.235727,3.422299,2.890995,3.294047,2.073277,3.019932,3.687291,3.459447,3.260532,3.345498,3.154088,3.253297,3.408391
6,3.291136,3.035183,2.874984,3.696365,3.24006,3.078379,2.973154,3.083754,3.261563,2.755213,3.139335,1.975901,2.878095,3.514109,3.296967,3.107395,3.18837,3.00595,3.100499,3.248309
7,2.556303,2.357498,2.233068,2.871054,2.516631,2.391049,2.309319,2.395225,2.533333,2.140039,2.438396,1.534729,2.235484,2.729492,2.560832,2.413587,2.476482,2.334792,2.408231,2.523038
8,3.026098,2.790757,2.643459,3.398693,2.979135,2.830474,2.733723,2.835417,2.998906,2.533333,2.886522,1.81678,2.646319,3.231115,3.031459,2.857153,2.931608,2.763878,2.850813,2.986719
9,3.418873,3.152986,2.986569,3.83983,3.365815,3.197858,3.08855,3.203443,3.388153,2.86215,3.261181,2.052591,2.989801,3.650501,3.42493,3.228,3.312119,3.122618,3.220837,3.374384
10,3.35715,3.096063,2.932651,3.770507,3.30505,3.140125,3.03279,3.145609,3.326984,2.810478,3.202304,2.015534,2.935824,3.584596,3.363098,3.169723,3.252323,3.066243,3.162689,3.313463
