### Recommender system 

Build prototype of recommender system by using different methods:

- Content-Based Filtering: make recommendation based on the info of the items previously rated by the user (items that are similar to those that a user liked in the past).

- Collaborative Filtering: make recommendation based on the info of a user by collecting preferences or taste information from many users (collaborating).

- Hybrid methods: combining collaborative filtering and content-based filtering, overcome some of the common problems in recommender systems such as cold start and the sparsity problem.


MovieLens 100K Dataset - https://grouplens.org/datasets/movielens/100k/

In [33]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel

In [15]:
#Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
 encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
 encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
 encoding='latin-1')

In [21]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [19]:
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [23]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### 1. Popularity model

In [27]:
## recommend the most popular (highest rating score) to all users, 
## here just use the average rating, to be more precise should use weighted rating instead

ratings.groupby(by='movie_id')['rating'].mean().sort_values(ascending=False).head(10)

movie_id
1293    5.0
1467    5.0
1653    5.0
814     5.0
1122    5.0
1599    5.0
1201    5.0
1189    5.0
1500    5.0
1536    5.0
Name: rating, dtype: float64

### 2. Content-based model

In [58]:
## recommend items only related to the user's previous choices

items_vectors = items.iloc[:, 5:]
print(items_vectors.shape)
cosine_sim = linear_kernel(items_vectors, items_vectors)
cosine_sim

(1682, 19)


array([[ 3.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  3.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  1., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  2.,  0.,  1.],
       [ 1.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  1.]])

In [57]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = items.index[items["movie title"]==title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]

    return items['movie title'].iloc[movie_indices]

get_recommendations("Aladdin and the King of Thieves (1996)")

94                              Aladdin (1992)
421     Aladdin and the King of Thieves (1996)
819                           Space Jam (1996)
992                            Hercules (1997)
1218                     Goofy Movie, A (1995)
7                                  Babe (1995)
62                    Santa Clause, The (1994)
70                       Lion King, The (1994)
90      Nightmare Before Christmas, The (1993)
93                           Home Alone (1990)
Name: movie title, dtype: object

### 3. Collaborative model

In [59]:
R_df = ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)
R_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
R = R_df.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
print(user_ratings_mean.reshape(-1,1).shape)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)
R_demeaned

(943, 1)


array([[ 4.41617122,  2.41617122,  3.41617122, ..., -0.58382878,
        -0.58382878, -0.58382878],
       [ 3.86325803, -0.13674197, -0.13674197, ..., -0.13674197,
        -0.13674197, -0.13674197],
       [-0.08977408, -0.08977408, -0.08977408, ..., -0.08977408,
        -0.08977408, -0.08977408],
       ..., 
       [ 4.9470868 , -0.0529132 , -0.0529132 , ..., -0.0529132 ,
        -0.0529132 , -0.0529132 ],
       [-0.20035672, -0.20035672, -0.20035672, ..., -0.20035672,
        -0.20035672, -0.20035672],
       [-0.34066587,  4.65933413, -0.34066587, ..., -0.34066587,
        -0.34066587, -0.34066587]])

In [76]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)

In [77]:
sigma = np.diag(sigma)
sigma

array([[  59.07162971,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,   59.40537828,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,   59.48739171, ...,    0.        ,
           0.        ,    0.        ],
       ..., 
       [   0.        ,    0.        ,    0.        , ...,  217.3553494 ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
         244.38398351,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,  522.23169772]])

In [86]:
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(predicted_ratings, columns = R_df.columns)
preds_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,6.488436,2.959503,1.634987,3.024467,1.656526,1.659506,3.630469,0.240669,1.791518,3.347816,...,0.011976,-0.092017,-0.074553,-0.060985,0.009427,-0.035641,-0.039227,-0.037434,-0.025552,0.023513
1,2.347262,0.129689,-0.098917,0.328828,0.159517,0.481361,0.213002,0.097908,1.8921,0.671,...,0.003943,-0.026939,-0.03546,-0.029883,-0.027153,-0.015244,-0.008277,-0.01176,0.011639,-0.046924
2,0.291905,-0.26383,-0.151454,-0.179289,0.013462,-0.088309,-0.057624,0.568764,-0.018506,0.280742,...,-0.028964,-0.031622,0.045513,0.026089,-0.021705,0.002282,0.032363,0.017322,-0.006644,-0.00948
3,0.36641,-0.443535,0.041151,-0.007616,0.055373,-0.080352,0.299015,-0.010882,-0.160888,-0.118834,...,0.020069,0.015981,-0.000182,0.005593,0.026634,0.023562,0.036405,0.029984,0.015612,-0.008713
4,4.263488,1.937122,0.052529,1.04935,0.652765,0.002836,1.730461,0.870584,0.341027,0.569055,...,0.019973,-0.053521,-0.017242,-0.007137,-0.038987,0.010338,0.004869,0.007603,-0.020575,0.00333


In [106]:
def recommend_movies(predictions_df, user_id, movies_df, original_ratings_df, num_recommendations=10):
    
    user_row_number = user_id - 1
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    user_data = original_ratings_df[original_ratings_df.user_id == user_id]
    user_full = user_data.merge(movies_df, how='left', left_on='movie_id', right_on="movie_id").sort_values(['rating'], ascending=False)

    print("User {0} has already rated {1} movies.".format(user_id, user_full.shape[0]))
    print("Recommending the highest {0} predicted ratings movies not already rated.".format(num_recommendations))
    
    recommendations = (movies_df[~movies_df['movie_id'].isin(user_full['movie_id'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movie_id',
               right_on = 'movie_id').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

movies_df = items.rename(index=str, columns={"movie id": "movie_id"})
already_rated, predictions = recommend_movies(preds_df, 500, movies_df, ratings, 10)

User 500 has already rated 225 movies.
Recommending the highest 10 predicted ratings movies not already rated.


In [108]:
already_rated.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,movie title,release date,video release date,IMDb URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
112,500,268,5,883864840,Chasing Amy (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Chasing+Amy+(...,0,0,...,0,0,0,0,0,1,0,0,0,0
199,500,170,5,883874446,Cinema Paradiso (1988),01-Jan-1988,,http://us.imdb.com/M/title-exact?Nuovo%20cinem...,0,0,...,0,0,0,0,0,1,0,0,0,0
42,500,56,5,883873976,Pulp Fiction (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Pulp%20Fictio...,0,0,...,0,0,0,0,0,0,0,0,0,0
41,500,276,5,883865290,Leaving Las Vegas (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Leaving%20Las...,0,0,...,0,0,0,0,0,1,0,0,0,0
140,500,1160,5,883865483,Love! Valour! Compassion! (1997),16-May-1997,,http://us.imdb.com/Title?Love%21+Valour%21+Com...,0,0,...,0,0,0,0,0,1,0,0,0,0


In [110]:
predictions

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
6,14,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il...",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
363,508,"People vs. Larry Flynt, The (1996)",27-Dec-1996,,http://us.imdb.com/M/title-exact?People%20vs.%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,65,What's Eating Gilbert Grape (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?What's%20Eati...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
139,212,"Unbearable Lightness of Being, The (1988)",01-Jan-1988,,http://us.imdb.com/M/title-exact?Unbearable%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,150,Swingers (1996),18-Oct-1996,,http://us.imdb.com/M/title-exact?Swingers%20(1...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
145,221,Breaking the Waves (1996),15-Nov-1996,,http://us.imdb.com/M/title-exact?Breaking%20th...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
366,512,Wings of Desire (1987),01-Jan-1987,,"http://us.imdb.com/Title?Himmel+%FCber+Berlin,...",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
163,248,Grosse Pointe Blank (1997),11-Apr-1997,,http://us.imdb.com/M/title-exact?Grosse%20Poin...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,20,Angels and Insects (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Angels%20and%...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
142,218,Cape Fear (1991),01-Jan-1991,,http://us.imdb.com/M/title-exact?Cape%20Fear%2...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
