In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.cross_validation import train_test_split
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
ratings = pd.DataFrame.from_csv('ratings.csv',index_col=None)
movies = pd.DataFrame.from_csv('movies.csv',index_col='movieId')
ratings.rating = ratings.rating.astype('float16')
ratings.userId = ratings.userId.astype('int32')
ratings.movieId = ratings.movieId.astype('int32')

In [3]:
#Step 1: Remove movies with less ratings than threshold
r=ratings
r['movie_freq'] = r.groupby('movieId')['movieId'].transform('count')
r = r[r.movie_freq>=25]
r['user_freq'] = r.groupby('userId')['userId'].transform('count')
r = r[r.user_freq>=20]
r = r[['userId','movieId','rating']]

r.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(64012, 3)

In [4]:
Results = []
for split in [1,2,3,4,5]:
    same_shape = False

    while same_shape == False:
        tr, ntr = train_test_split(r, train_size = 0.6)#, random_state = 0)
        test, cv = train_test_split(ntr, train_size = 0.5)#, random_state = 0)
        trm = tr.pivot('movieId','userId','rating')
        testm = test.pivot('movieId','userId','rating')
        cvm = cv.pivot('movieId','userId','rating')
        if (trm.shape == testm.shape) & (trm.shape==cvm.shape):
            same_shape = True
    
    ind = trm.index
    col = trm.columns
    
    mu = trm.mean(axis = 1)
    trm2 = trm.subtract(mu,axis=0)
    R = np.asarray(~np.isnan(trm2)) #Matrix containing boolean for rated/not rated for each movie user pairing
    trm2 = np.asarray(trm2.fillna(0))
    
    nmovies, nusers = trm2.shape
    for nfeatures in [100,150,200,250,300]:
        X = np.random.randn(nmovies,nfeatures) #movie features
        Theta = np.random.randn(nusers,nfeatures) #user features

        for reg in [8,9,10,11,12]:
            alpha = .003 #learning rate

            delta = 1
            iter = 0
            J = []
            J_new = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
            J.append({'Iteration': iter, 'Cost': J_new})
            while delta >= 0.001:
                J_old = J_new
                iter = iter+1
                X_grad = np.dot(np.multiply(np.dot(X,np.transpose(Theta))-trm2,R),Theta) + reg*X
                Theta_grad = np.dot(np.transpose(np.multiply(np.dot(X,np.transpose(Theta))-trm2,R)),X)+reg*Theta
                X = X-alpha*X_grad
                Theta = Theta-alpha*Theta_grad
                J_new = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
                J.append({'Iteration': iter, 'Cost': J_new})
                delta = (J_old-J_new)/J_old
                if delta <0:
                    X = X+alpha*X_grad
                    Theta = Theta+alpha*Theta_grad
                    J_new = J_old
                    alpha = alpha/3
                    delta=1
                
            Predictions = pd.DataFrame(data = np.dot(X,np.transpose(Theta)),index = ind, columns = col)
            Predictions = Predictions.add(mu,axis = 0)
            n_obs= np.sum(np.sum(~np.isnan(cvm)))
            
            RMSE_avg = math.sqrt(np.sum(np.sum((cvm.subtract(mu,axis = 0))**2))/n_obs)
            RMSE_alg = math.sqrt(np.sum(np.sum((Predictions - cvm)**2))/n_obs)
            red = (RMSE_avg - RMSE_alg)/RMSE_avg*100
            
            Results.append({'Split': split, 'Features': nfeatures, 'Reg. Parameter': reg, '% Reduction in RMSE': red})

Results = pd.DataFrame(Results)
Results = Results.set_index(['Split','Features','Reg. Parameter'])
Results = Results.unstack('Split')
Results

Unnamed: 0_level_0,Unnamed: 1_level_0,% Reduction in RMSE,% Reduction in RMSE,% Reduction in RMSE,% Reduction in RMSE,% Reduction in RMSE
Unnamed: 0_level_1,Split,1,2,3,4,5
Features,Reg. Parameter,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
100,8,6.838836,7.022563,7.21795,7.328828,7.178802
100,9,6.953802,7.118341,7.310323,7.427457,7.254374
100,10,6.999422,7.149232,7.335751,7.459335,7.258263
100,11,6.974733,7.110964,7.291888,7.426106,7.193998
100,12,6.880414,7.009122,7.184757,7.326948,7.069606
150,8,7.123414,7.169282,7.40829,7.755142,7.12781
150,9,7.187129,7.239402,7.460552,7.796579,7.205598
150,10,7.179574,7.232139,7.444445,7.75202,7.211728
150,11,7.103396,7.157833,7.363865,7.642162,7.146729
150,12,6.970637,7.027834,7.230131,7.477122,7.022718


In [5]:
avg_red = np.mean(Results,axis=1)
avg_red

Features  Reg. Parameter
100       8                 7.117396
          9                 7.212859
          10                7.240400
          11                7.199538
          12                7.094170
150       8                 7.316787
          9                 7.377852
          10                7.363981
          11                7.282797
          12                7.145689
200       8                 7.374814
          9                 7.414886
          10                7.382379
          11                7.288245
          12                7.144988
250       8                 6.716392
          9                 7.370874
          10                7.372900
          11                7.287831
          12                7.143498
300       8                 6.316141
          9                 7.399373
          10                7.391471
          11                7.298514
          12                7.149820
dtype: float64

In [6]:
nfeatures, reg = np.argmax(avg_red)

In [7]:
nfeatures

200

In [8]:
reg

9

In [9]:
Results = []
for num in range(0,5):
    same_shape = False

    while same_shape == False:
        tr, ntr = train_test_split(r, train_size = 0.6)#, random_state = 0)
        test, cv = train_test_split(ntr, train_size = 0.5)#, random_state = 0)
        trm = tr.pivot('movieId','userId','rating')
        testm = test.pivot('movieId','userId','rating')
        cvm = cv.pivot('movieId','userId','rating')
        if (trm.shape == testm.shape) & (trm.shape==cvm.shape):
            same_shape = True
    
    ind = trm.index
    col = trm.columns
    
    mu = trm.mean(axis = 1)
    trm2 = trm.subtract(mu,axis=0)
    R = np.asarray(~np.isnan(trm2)) #Matrix containing boolean for rated/not rated for each movie user pairing
    trm2 = np.asarray(trm2.fillna(0))
    
    nmovies, nusers = trm2.shape
    
    X = np.random.randn(nmovies,nfeatures) #movie features
    Theta = np.random.randn(nusers,nfeatures) #user features

    alpha = .003 #learning rate

    delta = 1
    iter = 0
    J = []
    J_new = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
    J.append({'Iteration': iter, 'Cost': J_new})
    while delta >= 0.001:
        J_old = J_new
        iter = iter+1
        X_grad = np.dot(np.multiply(np.dot(X,np.transpose(Theta))-trm2,R),Theta) + reg*X
        Theta_grad = np.dot(np.transpose(np.multiply(np.dot(X,np.transpose(Theta))-trm2,R)),X)+reg*Theta
        X = X-alpha*X_grad
        Theta = Theta-alpha*Theta_grad
        J_new = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
        J.append({'Iteration': iter, 'Cost': J_new})
        delta = (J_old-J_new)/J_old
        if delta <0:
            X = X+alpha*X_grad
            Theta = Theta+alpha*Theta_grad
            J_new = J_old
            alpha = alpha/3
            delta=1
                
    Predictions = pd.DataFrame(data = np.dot(X,np.transpose(Theta)),index = ind, columns = col)
    Predictions = Predictions.add(mu,axis = 0)
    n_obs= np.sum(np.sum(~np.isnan(testm)))
            
    RMSE_avg = math.sqrt(np.sum(np.sum((testm.subtract(mu,axis = 0))**2))/n_obs)
    RMSE_alg = math.sqrt(np.sum(np.sum((Predictions - testm)**2))/n_obs)
    red = (RMSE_avg - RMSE_alg)/RMSE_avg*100
            
    Results.append(red)

Results

[7.45469812332521,
 7.488790714612242,
 7.433181142410686,
 7.183252145446538,
 7.329694424397634]

In [10]:
avg_red = np.mean(Results)
avg_red

7.3779233100384616