"""
1.Remove or otherwise handle movies with few ratings. A movie with less than a certain threshold number of ratings will likely fail to develop appropriate features.
2.Separate into training, test, cross-validation sets.
3.Restructuring the existing data set from a list of user ID, movie ID, and ratings to a matrix containing the ratings from a user for a specific movie.
4.Apply mean normalization to all of the ratings by movie.
5.Initialize random features for users and movies - size will be selected based on cross-validation accuracy
6.Applying a collaborative filtering algorithm with gradient descent should be able to resolve features for both the movies and the users.
7.Use features to form recommendations by either predicting a user’s rating or determining similarity between movies.
"""

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

In [2]:
ratings = pd.DataFrame.from_csv('ratings.csv',index_col=None)
#movies = pd.DataFrame.from_csv('movies.csv',index_col=None)

In [3]:
ratings.userId.value_counts().tail() #Check minimum number of ratings submitted by users

645    20
58     20
497    20
313    20
350    20
Name: userId, dtype: int64

In [4]:
ratings.movieId.value_counts().tail() #Check minimum number of ratings per movie

8612     1
61950    1
2593     1
8740     1
2049     1
Name: movieId, dtype: int64

In [5]:
#Step 1: Remove movies with less ratings than threshold = 10
r=ratings
r['movie_freq'] = r.groupby('movieId')['movieId'].transform('count')
r = r[r.movie_freq>=50]
r['user_freq'] = r.groupby('userId')['userId'].transform('count')
r = r[r.user_freq>=25]
r.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(40478, 6)

In [6]:
#Step 2: Separate into training, test, cross-validation sets

tr,test = train_test_split(r, train_size = 0.7)
#tr, ntr = train_test_split(r, train_size = 0.6)#, random_state = 0)
#test, cv = train_test_split(ntr, train_size = 0.5)#, random_state = 0)

In [7]:
#Step 3: Restructure all data sets into a matrix
trm = tr.pivot('movieId','userId','rating')
testm = test.pivot('movieId','userId','rating')
#cvm = cv.pivot('movieId','userId','rating')
ind = trm.index
col = trm.columns

In [8]:
trm.shape

(455, 465)

In [9]:
testm.shape

(455, 465)

In [10]:
#cvm.shape

In [11]:
#Step 4: Mean Normalization on training set.
mu = trm.mean(axis = 1)
trm2 = trm.subtract(mu,axis=0)
R = np.asarray(~np.isnan(trm2)) #Matrix containing boolean for rated/not rated for each movie user pairing
trm2 = np.asarray(trm2.fillna(0))
trm2

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.95486111],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.33606557],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -1.28333333],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.41666667],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.60185185]])

In [12]:
#Step 5: Initialize parameters for users and movies - # of parameters will be manually selected based on cross-validation accuracy
nmovies, nusers = trm2.shape
nfeatures = 100

#np.random.seed(0)
X = np.random.randn(nmovies,nfeatures) #movie features
Theta = np.random.randn(nusers,nfeatures) #user features

reg = 10 #regularization parameter
alpha = .003 #learning rate


In [13]:
J = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
J

1882406.7514158688

In [14]:
for num in range(0,100):
    J = np.sum(np.sum(np.multiply(((np.dot(X,np.transpose(Theta))-trm2)**2),R)))/2 + reg/2*(np.sum(np.sum(Theta**2))+np.sum(np.sum(X**2)))
    X_grad = np.dot(np.multiply(np.dot(X,np.transpose(Theta))-trm2,R),Theta) + reg*X
    Theta_grad = np.dot(np.transpose(np.multiply(np.dot(X,np.transpose(Theta))-trm2,R)),X)+reg*Theta
    X = X-alpha*X_grad
    Theta = Theta-alpha*Theta_grad
    
J

10597.915308113956

In [15]:
Predictions = pd.DataFrame(data = np.dot(X,np.transpose(Theta)),index = ind, columns = col)
Predictions = (Predictions.add(mu,axis = 0)*2).round()/2
Predictions

Unnamed: 0_level_0,1,3,4,6,7,8,9,11,16,17,...,655,656,659,661,662,664,665,666,667,668
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,4.0,4.0,4.0,4.0,4.0,3.5,4.0,4.5,4.5,...,4.5,4.0,4.0,4.0,4.5,4.0,4.0,4.0,4.0,3.0
2,3.5,3.5,3.5,3.5,3.5,3.5,3.0,3.5,3.5,3.5,...,3.5,3.5,3.0,3.5,3.5,3.5,3.0,3.0,3.5,3.0
3,3.5,3.5,3.5,3.5,3.5,3.5,3.0,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.0,3.0,3.5,3.0
5,3.5,3.0,3.5,3.5,3.5,3.0,3.0,3.5,3.5,3.5,...,3.5,3.5,3.0,3.5,3.5,3.5,3.0,3.0,3.5,3.0
6,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.5,...,4.0,4.0,3.5,4.0,4.5,4.0,4.0,4.0,4.0,4.0
7,3.5,3.5,3.5,3.5,3.5,3.5,3.0,3.5,3.5,3.5,...,3.5,3.5,3.0,3.5,3.5,3.5,3.0,3.0,3.5,3.0
10,3.5,3.5,3.5,3.5,4.0,3.5,3.5,3.5,4.0,4.0,...,3.5,3.5,3.5,3.5,4.0,4.0,3.5,3.5,3.5,3.5
11,4.0,4.0,4.0,4.0,4.0,4.0,3.5,4.0,4.0,4.0,...,4.0,4.0,3.5,4.0,4.0,4.0,4.0,3.5,4.0,3.5
16,4.0,4.0,3.5,4.0,4.0,3.5,3.5,3.5,4.0,4.0,...,3.5,3.5,3.5,4.0,4.0,3.5,3.5,4.0,4.0,3.5
17,4.0,4.0,4.0,4.0,4.0,4.0,3.5,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.5,4.0,3.5,4.0,4.0,3.5


In [16]:
#compare to test set
R=~np.isnan(testm)
Accuracy = np.sum(np.sum(Predictions == testm))/np.sum(np.sum(R))
Accuracy

0.25922266139657446

In [17]:
R=~np.isnan(testm)
Accuracy = (np.sum(np.sum(Predictions == testm)) + np.sum(np.sum(Predictions+0.5 ==testm)) + np.sum(np.sum(Predictions-0.5==testm)))/np.sum(np.sum(R))
Accuracy

0.6604084321475626