In [1]:
import numpy as np
import pandas as pd

### Preparing the Data

In [2]:
# Column names for u.user; the file is pipe-delimited and names are supplied
# here rather than read from the file.
u_cols = [
    'user_id',
    'age',
    'sex',
    'occupation',
    'zip_code',
]

users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

In [3]:
 i_cols = ['movie_id', 'title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    
movies = pd.read_csv('ml-100k/u.item', sep = '|', names = i_cols, 
                    encoding = 'latin-1')

In [4]:
# Preview the first five user rows.
users.head(n=5)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
# Preview the first five movie rows (all metadata and genre columns).
movies.head(n=5)

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Keep only the id and title columns; the other columns are dropped here.
movies = movies.loc[:, ['movie_id', 'title']]

In [7]:
# Confirm that only movie_id and title remain.
movies.head(n=5)

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [8]:
# Column names for u.data, which is tab-delimited; names are supplied here
# rather than read from the file.
r_cols = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]

ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [9]:
# Preview the first five rating rows.
ratings.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [10]:
# Remove the timestamp column; none of the cells visible in this notebook use
# it. `errors='ignore'` makes the cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been dropped.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [11]:
from sklearn.model_selection import train_test_split
# X holds the full rating rows; y is the user id and is used only as the
# stratification label, so the 75/25 split is stratified by user.
X = ratings.copy()
y = ratings['user_id']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=2,
)

### Defining the Scoring Metrics

In [12]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Baseline model that ignores its inputs and always returns the same rating.
def baseline(user_id, movie_id):
    """Return a constant rating of 3 for any (user_id, movie_id) pair."""
    constant_rating = 3
    return constant_rating

## User Based Collaborative Filtering

In [13]:
def score(cf_model):
    """Return the RMSE of ``cf_model`` over the rows of the global ``X_test``.

    ``cf_model`` is called as ``cf_model(user_id, movie_id)`` once per test
    row; its return values are compared against the ``rating`` column.
    """
    test_users = X_test['user_id']
    test_movies = X_test['movie_id']

    predicted = np.array(list(map(cf_model, test_users, test_movies)))
    actual = np.array(X_test['rating'])

    return rmse(actual, predicted)

In [14]:
# RMSE of the constant-rating baseline on the held-out ratings.
score(cf_model=baseline)

1.2388058766408885

In [15]:
# Users as rows, movies as columns, training ratings as values; NaN marks a
# (user, movie) pair with no rating in the training split.
rating_matrix = pd.pivot_table(
    X_train,
    index='user_id',
    columns='movie_id',
    values='rating',
)
rating_matrix.head(n=5)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1672,1673,1674,1675,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,,1.0,,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


### Imputing mean

In [16]:
def cf_user_mean(user_id, movie_id):
    """Predict a rating as the plain mean of the movie's ratings.

    Reads the global ``rating_matrix``. ``user_id`` is accepted for interface
    compatibility with ``score`` but is not used. Returns 3.0 when
    ``movie_id`` is not a column of ``rating_matrix``.
    """
    if movie_id not in rating_matrix:
        return 3.0
    return rating_matrix[movie_id].mean()

In [17]:
# RMSE of the per-movie mean predictor on the held-out ratings.
score(cf_model=cf_user_mean)

1.0223262628623488

### Calculating Cosine Similarity

In [18]:
# Zero-filled version of the rating matrix for the similarity computation;
# fillna returns a new frame, so rating_matrix keeps its NaNs.
r_matrix = rating_matrix.fillna(value=0)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows (users) of r_matrix.
cos_sim = cosine_similarity(X=r_matrix, Y=r_matrix)

In [20]:
# Label both axes of the similarity matrix with the user ids of r_matrix so
# that it can be indexed by user_id.
cos_sim = pd.DataFrame(
    data=cos_sim,
    index=r_matrix.index,
    columns=r_matrix.index,
)
cos_sim.head(n=5)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.148732,0.036723,0.051283,0.271427,0.324147,0.335772,0.239905,0.085599,0.257408,...,0.291098,0.110654,0.198893,0.109494,0.149063,0.078217,0.213679,0.121421,0.116045,0.349138
2,0.148732,1.0,0.050843,0.084561,0.051583,0.192253,0.077108,0.062501,0.116444,0.131803,...,0.08832,0.351616,0.290512,0.168143,0.2386,0.276952,0.162155,0.14232,0.130195,0.092587
3,0.036723,0.050843,1.0,0.258263,0.026793,0.042833,0.055507,0.059794,0.052785,0.032861,...,0.008897,0.03542,0.093803,0.012477,0.07103,0.017982,0.130269,0.0,0.141829,0.033317
4,0.051283,0.084561,0.258263,1.0,0.043615,0.075565,0.067635,0.113914,0.133333,0.048827,...,0.046195,0.049705,0.05732,0.140069,0.107984,0.040374,0.164738,0.06323,0.182206,0.042746
5,0.271427,0.051583,0.026793,0.043615,1.0,0.154233,0.290117,0.167517,0.04368,0.137341,...,0.257041,0.088435,0.079282,0.041297,0.166366,0.070807,0.145269,0.130177,0.15604,0.207111


### Using weighted mean

In [21]:
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as a similarity-weighted mean of the movie's ratings.

    Reads the globals ``rating_matrix`` (users x movies, NaN = not rated) and
    ``cos_sim`` (user x user similarities, indexed by the same user ids as
    ``rating_matrix``).

    Parameters
    ----------
    user_id : label of the user to predict for (a column of ``cos_sim``).
    movie_id : label of the movie to predict (a column of ``rating_matrix``).

    Returns
    -------
    float
        * 3.0 if ``movie_id`` is not a column of ``rating_matrix``;
        * the plain mean of the movie's ratings if ``user_id`` is not in
          ``cos_sim`` or if the similarities to all raters sum to zero
          (previously these cases raised KeyError / produced NaN from 0/0);
        * otherwise ``sum(sim * rating) / sum(sim)`` over users who rated
          the movie.
    """
    if movie_id not in rating_matrix:
        return 3.0

    # Only users who actually rated this movie contribute.
    m_ratings = rating_matrix[movie_id].dropna()

    if user_id not in cos_sim:
        # No similarity information for this user: use the unweighted mean.
        return m_ratings.mean()

    # Similarities of the target user to exactly those raters, in the same order.
    sim_scores = cos_sim[user_id].loc[m_ratings.index]
    total_sim = sim_scores.sum()

    if total_sim == 0:
        # Every rater has zero similarity; dividing would give NaN and poison
        # any aggregate metric, so fall back to the unweighted mean.
        return m_ratings.mean()

    return np.dot(sim_scores, m_ratings) / total_sim

In [22]:
# RMSE of the similarity-weighted mean predictor on the held-out ratings.
score(cf_model=cf_user_wmean)

1.0169465177567785

## Item Based Collaborative Filtering

In [23]:
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate

In [24]:
# The Reader object helps in parsing a file or dataframe containing ratings.
# The (1, 5) scale is Surprise's default, written out here for clarity.
reader = Reader(rating_scale=(1, 5))

In [25]:
# Create the dataset to be used for building the filter.
# load_from_df reads the frame's columns positionally as (user, item, rating),
# so select exactly those three columns instead of relying on the timestamp
# column having been dropped from `ratings` by an earlier cell.
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

### Basic KNN Model

In [26]:
# Basic k-NN with Surprise's default neighbourhood sizes written out.
# No sim_options are passed, so the similarity is MSD (see the log output of
# the cross-validation cell).
# NOTE(review): Surprise's documented default is user-based similarity, while
# the section heading says item-based -- confirm which one is intended.
knn = KNNBasic(k=40, min_k=1)

In [27]:
# 5-fold cross-validation of the basic k-NN model, reporting RMSE per fold.
cross_validate(
    knn,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9778  0.9825  0.9832  0.9709  0.9794  0.9788  0.0044  
Fit time          0.82    0.42    0.42    0.41    0.41    0.50    0.16    
Test time         2.51    2.66    2.60    2.62    2.68    2.61    0.06    


{'test_rmse': array([0.97783088, 0.98245254, 0.98324043, 0.97094038, 0.97935637]),
 'fit_time': (0.8203113079071045,
  0.41875362396240234,
  0.4225621223449707,
  0.4098501205444336,
  0.40616798400878906),
 'test_time': (2.5089709758758545,
  2.6641781330108643,
  2.60333251953125,
  2.6159815788269043,
  2.6798501014709473)}

### SVD Model

In [30]:
from surprise import SVD
# SVD model with default hyper-parameters, evaluated by 5-fold
# cross-validation on RMSE.
svd = SVD()
cross_validate(
    svd,
    data,
    measures=['RMSE'],
    cv=5,
)

{'test_rmse': array([0.92868184, 0.93899624, 0.9375922 , 0.94155973, 0.93634794]),
 'fit_time': (4.03078293800354,
  3.9249560832977295,
  3.9503118991851807,
  3.8940110206604004,
  3.8745930194854736),
 'test_time': (0.0937490463256836,
  0.10935258865356445,
  0.0987861156463623,
  0.1093752384185791,
  0.10937714576721191)}

### KNN with Means Model

In [32]:
from surprise import KNNWithMeans
# k-NN with means, Surprise's default neighbourhood sizes written out,
# evaluated by 5-fold cross-validation on RMSE.
knnwm = KNNWithMeans(k=40, min_k=1)
cross_validate(
    knnwm,
    data,
    measures=['RMSE'],
    cv=5,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.958699  , 0.95280638, 0.94245018, 0.94524958, 0.95611602]),
 'fit_time': (0.938676118850708,
  1.0142953395843506,
  1.0505383014678955,
  1.050297737121582,
  1.0627665519714355),
 'test_time': (6.582884788513184,
  6.6686928272247314,
  6.512271165847778,
  6.587335109710693,
  6.614750385284424)}

### SVD++ Model

In [37]:
from surprise import SVDpp
# SVD++ model; the factor and epoch counts shown are Surprise's defaults.
svd1 = SVDpp(n_factors=20, n_epochs=20)

In [38]:
# 5-fold cross-validation of SVD++ on RMSE. The recorded output below shows
# fit times of roughly 130 s per fold.
cross_validate(
    svd1,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9141  0.9233  0.9177  0.9205  0.9208  0.9193  0.0031  
Fit time          144.98  128.91  129.53  129.69  130.87  132.80  6.12    
Test time         2.33    2.42    2.31    2.30    2.51    2.37    0.08    


{'test_rmse': array([0.91409131, 0.92331492, 0.91773088, 0.92049346, 0.92081121]),
 'fit_time': (144.97847485542297,
  128.91072010993958,
  129.52845668792725,
  129.69198775291443,
  130.87090039253235),
 'test_time': (2.3268749713897705,
  2.4238228797912598,
  2.31197452545166,
  2.2999277114868164,
  2.505075693130493)}