In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import mean_squared_error,pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine, correlation
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading ratings file
ratings = pd.read_csv('data/ratings.dat', sep='::', encoding='latin-1', names=['userId','movieId','rating','timestamp'])

# Reading movies file
movies = pd.read_csv('data/movies.dat', sep='::', encoding='latin-1', names=['movieId','title','genres'])

In [3]:
X = ratings.copy()
y = ratings['userId']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 125247)

In [13]:
kf = KFold(n_splits=2)
fold_ = 1 
for (train_idx,test_idx) in kf.split(X):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    
    #r_matrix = train_matrix(X.iloc[train_idx])
    #cosine_sim = weighted_mean(r_matrix)
    #score_fold = score(cf_user_wmean)
    #score_fold = score(cf_user_mean)
    print ("Fold ...: {}".format(fold_))
    fold_ = fold_ + 1

Fold ...: 1
Fold ...: 2


In [14]:
X_train.shape

(500105, 4)

In [15]:
X_test.shape

(500104, 4)

In [16]:
r_matrix = X_train.pivot_table(values='rating', index='userId', columns='movieId')
r_matrix = r_matrix.fillna(0)
r_matrix.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
10,5.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0


In [17]:
cosine_sim = cosine_similarity(r_matrix, r_matrix)

#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)


In [18]:
r_matrix.shape

(3070, 3618)

In [19]:
cosine_sim.shape

(3070, 3070)

In [20]:
#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [21]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    id_pairs = zip(X_test['userId'], X_test['movieId']) #Construct a list of user-movie tuples from the testing dataset
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs]) #Predict the rating for every user-movie tuple
    y_true = np.array(X_test['rating']) #Extract the actual ratings given by the users in the test data
    return rmse(y_true, y_pred) #Return the final RMSE score

In [24]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    if movie_id in r_matrix:  #Check if movie_id exists in r_matrix
        mean_rating = r_matrix[movie_id].mean() #Compute the mean of all the ratings given to the movie
    else:
        mean_rating = 3.0 #Default to a rating of 3.0 in the absence of any information
    return mean_rating

In [22]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    if movie_id in r_matrix: #Check if movie_id exists in r_matrix
        sim_scores = cosine_sim[user_id] #Get the similarity scores for the user in question with every other user
        m_ratings = r_matrix[movie_id] #Get the user ratings for the movie in question
        idx = m_ratings[m_ratings.isnull()].index #Extract the indices containing NaN in the m_ratings series
        m_ratings = m_ratings.dropna() #Drop the NaN values from the m_ratings Series
        sim_scores = sim_scores.drop(idx) #Drop the corresponding cosine scores from the sim_scores series
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum() #Compute the final weighted mean
    else:
        wmean_rating = 3.0 #Default to a rating of 3.0 in the absence of any information
    return wmean_rating

In [25]:
score(cf_user_mean)

3.2699391921094083