In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
ratings = pd.DataFrame.from_csv('Small/ratings.csv',index_col=None)
movies = pd.DataFrame.from_csv('Small/movies.csv',index_col='movieId')

In [3]:
#Step 1: Remove movies with less ratings than threshold
r = ratings
r = r[r.timestamp >= 1059696000]
r = r.set_index('userId')
r['user_freq'] = r.index.value_counts()
r = r[(r.user_freq <= 2000)&(r.user_freq >= 40)]
r = r.reset_index()
r = r.set_index('movieId')
r['movie_freq'] = r.index.value_counts()
r = r[r.movie_freq >=35]
r = r.reset_index()
r = r.drop(['user_freq','movie_freq','timestamp'],axis=1)

r.shape

(20816, 3)

In [4]:
r_piv = r.pivot('movieId','userId','rating')
same_shape = False
while same_shape == False:
    train, test = train_test_split(r, train_size = 0.80)
    trainm = train.pivot('movieId','userId','rating')
    testm = test.pivot('movieId','userId','rating')
    if (trainm.shape == r_piv.shape) & (testm.shape == r_piv.shape):
        same_shape = True

k = 4
n_obs = len(train)
n = int(n_obs/k)

ind = r_piv.index
col = r_piv.columns
nmovies, nusers = r_piv.shape
mu = r_piv.mean(axis = 1)

In [5]:
dim_check = False
while dim_check == False:
    shuffled_train = train.reindex(np.random.permutation(train.index))
    for num in range(0,k):
        if num != k-1:
            cv = shuffled_train[num*n:(num+1)*n]
        else:
            cv = shuffled_train[num*n:]
        tr = shuffled_train.drop(cv.index)
        trm = tr.pivot('movieId','userId','rating')
        cvm = cv.pivot('movieId','userId','rating')
        #print(num, trainm.shape, trm.shape, cvm.shape)
        if (trm.shape != trainm.shape) | (cvm.shape != trainm.shape):
            dim_check = False
            break
        else:
            dim_check = True

In [6]:
Results = []
for trial in range(1,4):
    for nfeatures in [200,250,300,350,400]:
        for reg in [2.5,5,10]:
            alpha = .003 #learning rate
            cum_red = 0
            for num in range(0,k):
                X = np.random.randn(nmovies,nfeatures) #movie features
                Theta = np.random.randn(nusers,nfeatures) #user features
                
                if num != k-1:
                    cv = shuffled_train[num*n:(num+1)*n]
                else:
                    cv = shuffled_train[num*n:]
                tr = shuffled_train.drop(cv.index)
                trm = tr.pivot('movieId','userId','rating')
                cvm = cv.pivot('movieId','userId','rating')
                
                trm2 = trm.subtract(mu,axis=0)
                R = np.asarray(~np.isnan(trm2)) #Matrix containing boolean for rated/not rated for each movie user pairing
                trm2 = np.asarray(trm2.fillna(0))

                delta = 1
            
                J_new = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
                while delta >= 0.001:
                    J_old = J_new
                    X_grad = np.dot(np.multiply(np.dot(X,np.transpose(Theta))-trm2,R),Theta) + reg*X
                    Theta_grad = np.dot(np.transpose(np.multiply(np.dot(X,np.transpose(Theta))-trm2,R)),X)+reg*Theta
                    X = X-alpha*X_grad
                    Theta = Theta-alpha*Theta_grad
                    J_new = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
                    delta = (J_old-J_new)/J_old
                    if delta <0:
                        X = X+alpha*X_grad
                        Theta = Theta+alpha*Theta_grad
                        J_new = J_old
                        alpha = alpha/2
                        delta=1
                
                Predictions = pd.DataFrame(data = np.dot(X,np.transpose(Theta)),index = ind, columns = col)
                Predictions = Predictions.add(mu,axis = 0)
            
                RMSE_avg = math.sqrt(np.sum(np.sum((cvm.subtract(mu,axis = 0))**2))/len(cv))
                RMSE_alg = math.sqrt(np.sum(np.sum((Predictions - cvm)**2))/len(cv))
                cum_red += (RMSE_avg - RMSE_alg)/RMSE_avg*100
            
            red = cum_red/k
                        
            Results.append({'Trial': trial, 'Features': nfeatures, 'Reg. Parameter': reg, '% Reduction in RMSE': red})

Results = pd.DataFrame(Results)
Results = Results.set_index(['Trial','Features','Reg. Parameter'])
Results = Results.unstack('Trial')

In [7]:
Results

Unnamed: 0_level_0,Unnamed: 1_level_0,% Reduction in RMSE,% Reduction in RMSE,% Reduction in RMSE
Unnamed: 0_level_1,Trial,1,2,3
Features,Reg. Parameter,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
200,2.5,5.589384,5.535174,5.445152
200,5.0,6.840531,6.812381,6.725516
200,10.0,5.621169,5.664231,5.618512
250,2.5,5.986714,6.036809,6.181394
250,5.0,6.849052,6.78937,6.877346
250,10.0,5.616131,5.642231,5.636288
300,2.5,6.253209,6.111578,6.361749
300,5.0,6.972623,7.013077,6.915094
300,10.0,5.688899,5.635335,5.635817
350,2.5,5.305559,5.073112,5.030704


In [8]:
avg_red = np.mean(Results,axis=1)
avg_red

Features  Reg. Parameter
200       2.5               5.523237
          5.0               6.792809
          10.0              5.634637
250       2.5               6.068306
          5.0               6.838589
          10.0              5.631550
300       2.5               6.242179
          5.0               6.966931
          10.0              5.653350
350       2.5               5.136458
          5.0               6.344354
          10.0              5.194613
400       2.5               5.395967
          5.0               6.342074
          10.0              5.206668
dtype: float64

In [9]:
nfeatures, reg = np.argmax(avg_red)
nfeatures, reg

(300, 5.0)

In [10]:
trainm2 = trainm.subtract(mu, axis = 0)
R = np.asarray(~np.isnan(trainm2)) #Matrix containing boolean for rated/not rated for each movie user pairing
trainm2 = np.asarray(trainm2.fillna(0))

Results = []
for num in range(0,5):
    X = np.random.randn(nmovies,nfeatures) #movie features
    Theta = np.random.randn(nusers,nfeatures) #user features

    alpha = .003 #learning rate

    delta = 1
    iter = 0
    J = []
    J_new = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trainm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
    J.append({'Iteration': iter, 'Cost': J_new})
    while delta >= 0.001:
        J_old = J_new
        iter = iter+1
        X_grad = np.dot(np.multiply(np.dot(X,np.transpose(Theta))-trainm2,R),Theta) + reg*X
        Theta_grad = np.dot(np.transpose(np.multiply(np.dot(X,np.transpose(Theta))-trainm2,R)),X)+reg*Theta
        X = X-alpha*X_grad
        Theta = Theta-alpha*Theta_grad
        J_new = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trainm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
        J.append({'Iteration': iter, 'Cost': J_new})
        delta = (J_old-J_new)/J_old
        if delta <0:
            X = X+alpha*X_grad
            Theta = Theta+alpha*Theta_grad
            J_new = J_old
            alpha = alpha/3
            delta=1
                
    Predictions = pd.DataFrame(data = np.dot(X,np.transpose(Theta)),index = ind, columns = col)
    Predictions = Predictions.add(mu,axis = 0)
    n_obs= len(testm)
            
    RMSE_avg = math.sqrt(np.sum(np.sum((testm.subtract(mu,axis = 0))**2))/n_obs)
    RMSE_alg = math.sqrt(np.sum(np.sum((Predictions - testm)**2))/n_obs)
    red = (RMSE_avg - RMSE_alg)/RMSE_avg*100
            
    Results.append(red)

Results

[7.322327762639662,
 7.496057426963909,
 7.559253376952613,
 7.316127679573875,
 7.358343049990447]

In [11]:
np.mean(Results)

7.4104218592241011

In [12]:
#PCA
var_explained = 0
iter = 0
while var_explained < 0.95:
    iter += 1
    pca = PCA(n_components=iter)
    movie_pca = pd.DataFrame(pca.fit_transform(X), index = ind)
    var_explained = np.sum(pca.explained_variance_ratio_)
    
var_explained = 0
iter = 0
while var_explained < 0.95:
    iter += 1
    pca = PCA(n_components=iter)
    user_pca = pd.DataFrame(pca.fit_transform(Theta), index = col)
    var_explained = np.sum(pca.explained_variance_ratio_)

In [13]:
#NORMALIZED Euclidean distance from other movies based on PCA of learned features

#Normalize movie features from 0 to 1, a.k.a. Feature Scaling
movie_pca_norm = (movie_pca-movie_pca.min(axis = 0))/(movie_pca.max(axis = 0) - movie_pca.min(axis =0))

dist = []
for num1 in range(0,nmovies):
    for num2 in range(0,nmovies):
        dist.append({'movieId1': ind[num1], 'movieId2': ind[num2], 'distance':math.sqrt(np.sum((movie_pca_norm[movie_pca_norm.index == ind[num1]].values-movie_pca_norm[movie_pca_norm.index == ind[num2]].values)**2))})

d = pd.DataFrame(dist)
d = d.set_index(['movieId1','movieId2'])
d = d[d.distance != 0]

In [14]:
d_mov = 4896
closest = d.loc[d_mov].sort_values('distance').head(10)
print("Movies closest to %s:" %movies.loc[d_mov].title)
closest = pd.merge(closest,movies,left_index = True, right_index = True).sort_values('distance')
closest

Movies closest to Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001):


Unnamed: 0,distance,title,genres
551,2.045114,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical
40815,2.06506,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
5816,2.065709,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
68237,2.093137,Moon (2009),Drama|Mystery|Sci-Fi|Thriller
3000,2.09769,Princess Mononoke (Mononoke-hime) (1997),Action|Adventure|Animation|Drama|Fantasy
1127,2.10464,"Abyss, The (1989)",Action|Adventure|Sci-Fi|Thriller
1617,2.11125,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
47610,2.1253,"Illusionist, The (2006)",Drama|Fantasy|Mystery|Romance
7458,2.133924,Troy (2004),Action|Adventure|Drama|War
2001,2.139426,Lethal Weapon 2 (1989),Action|Comedy|Crime|Drama


In [15]:
my_ratings = pd.DataFrame.from_csv('My Ratings.csv',index_col='movieId')
my_rat_mean_norm = my_ratings.subtract(mu,axis = 0)
R = np.asarray(~np.isnan(my_rat_mean_norm))
my_rat_mean_norm = np.asarray(my_rat_mean_norm.fillna(0))
my_features = np.random.randn(1,nfeatures)

In [16]:
alpha = .003 #learning rate

best_RMSE = 100
for reg in [1,2.5,5,10]:
    delta = 1
    J_new = np.sum(np.multiply(((np.dot(X,np.transpose(my_features))-my_rat_mean_norm)**2),R))/2 + reg/2*np.sum(my_features**2)
    while delta >= 0.001:
        J_old = J_new
        iter = iter+1
        my_features_grad = np.dot(np.transpose(np.multiply(np.dot(X,np.transpose(my_features))-my_rat_mean_norm,R)),X) + reg*my_features
        my_features = my_features - alpha*my_features_grad
        J_new = np.sum(np.multiply(((np.dot(X,np.transpose(my_features))-my_rat_mean_norm)**2),R))/2 + reg/2*np.sum(my_features**2)
        delta = (J_old-J_new)/J_old
    my_predictions = pd.DataFrame(data = np.dot(X,np.transpose(my_features)),index = ind)
    my_predictions = my_predictions.add(mu,axis = 0)
    RMSE = np.sqrt(np.sum((my_predictions-my_ratings.values)**2)/np.sum(R)).values
    if RMSE < best_RMSE:
        best_RMSE = RMSE
        best_reg = reg

reg = best_reg
delta = 1
J_new = np.sum(np.multiply(((np.dot(X,np.transpose(my_features))-my_rat_mean_norm)**2),R))/2 + reg/2*np.sum(my_features**2)
while delta >= 0.001:
    J_old = J_new
    iter = iter+1
    my_features_grad = np.dot(np.transpose(np.multiply(np.dot(X,np.transpose(my_features))-my_rat_mean_norm,R)),X) + reg*my_features
    my_features = my_features - alpha*my_features_grad
    J_new = np.sum(np.multiply(((np.dot(X,np.transpose(my_features))-my_rat_mean_norm)**2),R))/2 + reg/2*np.sum(my_features**2)
    delta = (J_old-J_new)/J_old
my_predictions = pd.DataFrame(data = np.dot(X,np.transpose(my_features)),index = ind,columns=['Predicted Rating'])
my_predictions = my_predictions.add(mu,axis = 0).merge(movies,left_index = True, right_index = True)
my_predictions.sort_values(by='Predicted Rating',ascending=False).head(10)

Unnamed: 0_level_0,Predicted Rating,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
318,5.237571,"Shawshank Redemption, The (1994)",Crime|Drama
2959,5.116613,Fight Club (1999),Action|Crime|Drama|Thriller
50,5.106159,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
2571,5.032582,"Matrix, The (1999)",Action|Sci-Fi|Thriller
48516,5.015751,"Departed, The (2006)",Crime|Drama|Thriller
293,4.956526,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller
296,4.902465,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
58559,4.837117,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
3000,4.829178,Princess Mononoke (Mononoke-hime) (1997),Action|Adventure|Animation|Drama|Fantasy
68157,4.822656,Inglourious Basterds (2009),Action|Drama|War
