In [1]:
import os
import sys
import numpy as np
import pandas as pd

In [2]:
def save_pickle(filename, obj):
    """Serialize ``obj`` to ``filename`` using pickle's highest protocol.

    Parameters
    ----------
    filename : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file at that path is overwritten.
    obj : object
        The object to pickle.

    Raises
    ------
    OSError
        If ``filename`` cannot be opened for writing.
    """
    # Imported locally: the notebook's import cell does not bring in pickle,
    # so without this the function fails with NameError on its first call.
    import pickle

    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# Column names for u.user are supplied explicitly through `names`.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

# The dataset directory sits under the parent of the current working directory.
curdir = os.getcwd()
prevdir = os.path.dirname(curdir)
folder = prevdir + '/datasets/ml-100k/'

# u.user is pipe-delimited and read with latin-1 encoding.
users = pd.read_csv(
    folder + 'u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

In [4]:
# Preview the first rows of the users table to check that the file parsed as expected.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [31]:
# Column names for u.item: five descriptive fields followed by the genre columns.
item_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# u.item is pipe-delimited and read with latin-1 encoding.
movies = pd.read_csv(
    folder + 'u.item',
    sep='|',
    names=item_cols,
    encoding='latin-1',
)

In [32]:
# Preview the first rows of the parsed movie metadata.
movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [33]:
# Keep only the id and the title; every other column of the movies table is dropped.
movies = movies.loc[:, ['movie_id', 'title']]

In [34]:
# u.data is tab-separated; each row is a (user, movie, rating, timestamp) record.
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    folder + 'u.data',
    sep='\t',
    names=rating_cols,
    encoding='latin-1',
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [35]:
# Remove the timestamp column. The result is rebound (no in-place mutation)
# and a missing column is ignored, so this cell can safely be re-run; the
# original in-place drop raised KeyError on a second execution.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [36]:
from sklearn.model_selection import train_test_split

# The feature frame is a copy of the full rating rows; the user id column
# serves as the stratification label.
X = ratings.copy()
y = ratings['user_id']

# Hold out 25% of the ratings for testing, stratified along user_id, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [37]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [38]:
def baseline(user_id, movie_id):
    """Predict a constant rating of 3.0, ignoring both arguments."""
    constant_rating = 3.0
    return constant_rating

In [13]:
def score(cf_model):
    """Return the RMSE of ``cf_model`` on the held-out ``X_test`` ratings.

    ``cf_model`` is called as ``cf_model(user_id, movie_id)`` once per test
    row, and its predictions are compared with the ``rating`` column.
    """
    predictions = np.array([
        cf_model(uid, mid)
        for uid, mid in zip(X_test['user_id'], X_test['movie_id'])
    ])
    actual = np.array(X_test['rating'])
    return rmse(actual, predictions)

In [14]:
# RMSE of the constant-rating baseline on the test split.
score(baseline)

1.2470926188539486

### User- and item-based collaborative filtering models

In [15]:
# Rating matrix built from the training split: users on the rows, movies on
# the columns, ratings as values. Pairs without a training rating are NaN.
r_matrix = pd.pivot_table(
    X_train,
    values='rating',
    index='user_id',
    columns='movie_id',
)
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1673,1674,1675,1676,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [16]:
def movie_mean_rating(user_id, movie_id):
    """Predict the mean training rating of ``movie_id``; ``user_id`` is unused.

    Falls back to 3.0 when the movie is not a column of ``r_matrix``.
    """
    if movie_id not in r_matrix:
        return 3.0
    return r_matrix[movie_id].mean()

In [17]:
# RMSE when every prediction is the movie's mean training rating.
score(movie_mean_rating)

1.0234701463131335

In [18]:
# Zero-filled version of the rating matrix (NaN -> 0); r_matrix itself is left untouched.
r_matrix_dummy = r_matrix.fillna(0)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the user rows of the zero-filled rating
# matrix, labelled by user_id on both axes.
user_sim = pd.DataFrame(
    cosine_similarity(r_matrix_dummy, r_matrix_dummy),
    index=r_matrix.index,
    columns=r_matrix.index,
)

In [20]:
# Display the user-user similarity matrix.
user_sim

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.118076,0.029097,0.011628,0.264677,0.312419,0.308729,0.224269,0.026017,0.286411,...,0.308475,0.055872,0.197862,0.131367,0.152449,0.084456,0.293293,0.056765,0.103536,0.326491
2,0.118076,1.000000,0.099097,0.107680,0.034279,0.152789,0.086705,0.078864,0.068940,0.092399,...,0.086927,0.259636,0.289092,0.318824,0.149105,0.186347,0.168034,0.106748,0.136796,0.080358
3,0.029097,0.099097,1.000000,0.252131,0.026893,0.062539,0.039767,0.089474,0.078162,0.037670,...,0.040918,0.019031,0.065417,0.055373,0.086503,0.018418,0.096993,0.109631,0.092574,0.018987
4,0.011628,0.107680,0.252131,1.000000,0.000000,0.045543,0.078812,0.095354,0.059498,0.053879,...,0.024226,0.050703,0.056561,0.107294,0.098892,0.000000,0.132900,0.142798,0.097066,0.015176
5,0.264677,0.034279,0.026893,0.000000,1.000000,0.202843,0.299619,0.163724,0.038474,0.153021,...,0.262547,0.048524,0.048312,0.022202,0.091910,0.066000,0.156172,0.115842,0.124297,0.267574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.084456,0.186347,0.018418,0.000000,0.066000,0.092090,0.100625,0.129381,0.052699,0.033686,...,0.054072,0.192212,0.247926,0.229980,0.286229,1.000000,0.093007,0.151775,0.057504,0.121871
940,0.293293,0.168034,0.096993,0.132900,0.156172,0.261859,0.233843,0.188662,0.107486,0.197107,...,0.216670,0.072077,0.124657,0.173176,0.138989,0.093007,1.000000,0.101880,0.204524,0.185019
941,0.056765,0.106748,0.109631,0.142798,0.115842,0.097606,0.039199,0.121223,0.055766,0.085402,...,0.065280,0.130688,0.174143,0.131990,0.259213,0.151775,0.101880,1.000000,0.037286,0.077046
942,0.103536,0.136796,0.092574,0.097066,0.124297,0.206104,0.224227,0.083910,0.070065,0.118945,...,0.186477,0.080097,0.053648,0.110941,0.099172,0.057504,0.204524,0.037286,1.000000,0.097347


In [21]:
def movie_wmean_rating(user_id, movie_id):
    """Predict a rating as a similarity-weighted mean of the movie's ratings.

    The training ratings given to ``movie_id`` are averaged, each weighted by
    the similarity score in ``user_sim`` between the rating user and
    ``user_id``.

    Parameters
    ----------
    user_id : column label of ``user_sim``
        User the prediction is made for.
    movie_id : column label of ``r_matrix``
        Movie the prediction is made for.

    Returns
    -------
    float
        The weighted mean, or 3.0 when the movie is not in ``r_matrix``, the
        user is not in ``user_sim``, or the similarity weights sum to zero.
    """
    default_rating = 3.0

    # An unknown user previously raised KeyError here, while the other
    # predictors in this notebook fall back to the default rating.
    if movie_id not in r_matrix or user_id not in user_sim:
        return default_rating

    movie_ratings = r_matrix[movie_id]
    rated = movie_ratings.notna()  # users with a training rating for this movie
    weights = user_sim[user_id][rated]

    # Guard the division: if every rater has zero similarity to the user, the
    # quotient would be NaN/inf and break the RMSE computation downstream.
    total_weight = weights.sum()
    if total_weight == 0:
        return default_rating

    return np.dot(weights, movie_ratings[rated]) / total_weight

In [22]:
# RMSE of the user-similarity-weighted mean of each movie's ratings.
score(movie_wmean_rating)

1.0174483808407588

In [23]:
def user_mean_rating(user_id, movie_id):
    """Predict the mean of ``user_id``'s training ratings; ``movie_id`` is unused.

    Falls back to 3.0 when the user has no row in ``r_matrix``.
    """
    if user_id not in r_matrix.index:
        return 3.0
    return r_matrix.loc[user_id].mean()

In [24]:
# RMSE when every prediction is the user's mean training rating.
score(user_mean_rating)

1.0414500809451512

In [25]:
# Pairwise cosine similarity between movies: transposing the zero-filled
# rating matrix puts movies on the rows. Both axes are labelled by movie_id.
movie_sim = pd.DataFrame(
    cosine_similarity(r_matrix_dummy.T, r_matrix_dummy.T),
    index=r_matrix.columns,
    columns=r_matrix.columns,
)
movie_sim

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1673,1674,1675,1676,1679,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.272575,0.208030,0.342182,0.240176,0.108727,0.462761,0.388734,0.370573,0.213452,...,0.0,0.0,0.0,0.040298,0.0,0.000000,0.000000,0.0,0.053731,0.053731
2,0.272575,1.000000,0.171620,0.372930,0.290607,0.064223,0.281536,0.275335,0.180932,0.082215,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
3,0.208030,0.171620,1.000000,0.234800,0.165214,0.116968,0.249724,0.164980,0.186063,0.101942,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.110883
4,0.342182,0.372930,0.234800,1.000000,0.312196,0.052487,0.372072,0.381440,0.309260,0.208993,...,0.0,0.0,0.0,0.000000,0.0,0.105176,0.105176,0.0,0.063105,0.000000
5,0.240176,0.290607,0.165214,0.312196,1.000000,0.048881,0.206252,0.252650,0.200827,0.048158,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.104765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,0.000000,0.000000,0.000000,0.105176,0.000000,0.000000,0.000000,0.000000,0.066676,0.094026,...,0.0,0.0,0.0,0.000000,0.0,1.000000,1.000000,0.0,0.000000,0.000000
1676,0.000000,0.000000,0.000000,0.105176,0.000000,0.000000,0.000000,0.000000,0.066676,0.094026,...,0.0,0.0,0.0,0.000000,0.0,1.000000,1.000000,0.0,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,1.0,0.000000,0.000000
1681,0.053731,0.000000,0.000000,0.063105,0.000000,0.000000,0.060732,0.092992,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,1.000000,0.000000


In [26]:
def user_wmean_rating(user_id, movie_id):
    """Predict a rating as a similarity-weighted mean of the user's ratings.

    The training ratings given by ``user_id`` are averaged, each weighted by
    the similarity score in ``movie_sim`` between the rated movie and
    ``movie_id``.

    Parameters
    ----------
    user_id : row label of ``r_matrix``
        User the prediction is made for.
    movie_id : row label of ``movie_sim``
        Movie the prediction is made for.

    Returns
    -------
    float
        The weighted mean, or 3.0 when the user is not in ``r_matrix``, the
        movie is not in ``movie_sim``, or the similarity weights sum to zero.
    """
    default_rating = 3.0

    # Explicit membership tests replace the original bare ``except:``, which
    # also swallowed unrelated errors and KeyboardInterrupt.
    if user_id not in r_matrix.index or movie_id not in movie_sim.index:
        return default_rating

    user_ratings = r_matrix.loc[user_id]
    rated = user_ratings.notna()  # movies this user rated in the training split
    weights = movie_sim.loc[movie_id][rated]

    # Guard the division: if none of the user's rated movies is similar to the
    # target movie, the quotient would be NaN instead of a usable prediction.
    total_weight = weights.sum()
    if total_weight == 0:
        return default_rating

    return np.dot(weights, user_ratings[rated]) / total_weight

In [27]:
# RMSE of the movie-similarity-weighted mean of each user's ratings.
score(user_wmean_rating)

1.0123212619297255

In [30]:
# NOTE: this rebinds ``folder`` (until now the dataset directory) to the
# output directory. Re-run the cell that defines the dataset ``folder``
# before re-running any data-loading cell after this one.
folder = prevdir + '/savefiles/'

# Create the output directory if it is missing; otherwise the open() call
# inside save_pickle fails with FileNotFoundError.
os.makedirs(folder, exist_ok=True)

# Persist the ratings table (timestamp removed) and the held-out test split.
save_pickle(folder + 'ratings.pkl', ratings)
save_pickle(folder + 'X_test.pkl', X_test)
save_pickle(folder + 'y_test.pkl', y_test)