### Recommender system 

Build prototype of recommender system by using different methods:

- Content-Based Filtering: make recommendation based on the info of the items previously rated by the user (items that are similar to those that a user liked in the past).

- Collaborative Filtering: make recommendation based on the info of a user by collecting preferences or taste information from many users (collaborating).

- Hybrid methods: combining collaborative filtering and content-based filtering, overcome some of the common problems in recommender systems such as cold start and the sparsity problem.


MovieLens 100K Dataset - https://grouplens.org/datasets/movielens/100k/

In [33]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel

In [15]:
#Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
 encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
 encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
 encoding='latin-1')

In [21]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [19]:
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [23]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### 1. Popularity model

In [27]:
## recommend the most popular (highest rating score) to all users, 
## here just use the average rating, to be more precise should use weighted rating instead

ratings.groupby(by='movie_id')['rating'].mean().sort_values(ascending=False).head(10)

movie_id
1293    5.0
1467    5.0
1653    5.0
814     5.0
1122    5.0
1599    5.0
1201    5.0
1189    5.0
1500    5.0
1536    5.0
Name: rating, dtype: float64

### 2. Content-based model

In [58]:
## recommend items only related to the user's previous choices

items_vectors = items.iloc[:, 5:]
print(items_vectors.shape)
cosine_sim = linear_kernel(items_vectors, items_vectors)
cosine_sim

(1682, 19)


array([[ 3.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  3.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  1., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  2.,  0.,  1.],
       [ 1.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  1.]])

In [57]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = items.index[items["movie title"]==title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]

    return items['movie title'].iloc[movie_indices]

get_recommendations("Aladdin and the King of Thieves (1996)")

94                              Aladdin (1992)
421     Aladdin and the King of Thieves (1996)
819                           Space Jam (1996)
992                            Hercules (1997)
1218                     Goofy Movie, A (1995)
7                                  Babe (1995)
62                    Santa Clause, The (1994)
70                       Lion King, The (1994)
90      Nightmare Before Christmas, The (1993)
93                           Home Alone (1990)
Name: movie title, dtype: object

### 3. Collaborative model

In [111]:
R_df = ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating')
users_mean=np.array(R_df.mean(axis=1))
R_demeaned=R_df.sub(R_df.mean(axis=1), axis=0)
R_demeaned=R_demeaned.fillna(0).as_matrix()
R_demeaned

array([[ 1.38970588, -0.61029412,  0.38970588, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.29032258,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.95454545,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.58928571,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [112]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)

In [113]:
sigma = np.diag(sigma)
sigma

array([[ 21.39748528,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,  21.51489192,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,  21.60024573, ...,   0.        ,
          0.        ,   0.        ],
       ..., 
       [  0.        ,   0.        ,   0.        , ...,  38.78024093,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
         50.4747599 ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,  81.8363891 ]])

In [114]:
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(predicted_ratings, columns = R_df.columns)
preds_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,2.356474,0.567387,1.132433,0.215216,0.172427,0.922971,1.415046,-1.0476,2.370227,0.635447,...,0.588557,0.585404,0.599356,0.633159,0.588603,0.570645,0.583074,0.576859,0.582877,0.57127
1,0.201601,0.058093,0.03129,0.178051,0.103342,0.212558,0.217455,0.174024,0.307774,0.030055,...,0.132937,0.137573,0.135659,0.133302,0.136107,0.134454,0.136611,0.135532,0.136698,0.138289
2,0.152708,0.030284,0.247855,0.057874,0.003328,0.114596,-0.2132,0.028386,0.318968,0.0479,...,0.087601,0.089854,0.080119,0.0591,0.091407,0.089957,0.089785,0.089871,0.089345,0.092815
3,0.179193,0.10392,0.027701,-0.076217,0.073011,0.039944,0.110524,0.159236,-0.018295,0.066187,...,0.064044,0.0616,0.061742,0.061549,0.060724,0.058572,0.061644,0.060108,0.062074,0.062038
4,2.436861,0.010852,-0.275144,0.049883,0.289789,0.321914,1.225981,0.144662,0.175973,0.388434,...,0.306131,0.300234,0.30013,0.302484,0.301351,0.285044,0.298247,0.291646,0.299265,0.301786


In [115]:
def recommend_movies(predictions_df, user_id, movies_df, original_ratings_df, num_recommendations=10):
    
    user_row_number = user_id - 1
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    user_data = original_ratings_df[original_ratings_df.user_id == user_id]
    user_full = user_data.merge(movies_df, how='left', left_on='movie_id', right_on="movie_id").sort_values(['rating'], ascending=False)

    print("User {0} has already rated {1} movies.".format(user_id, user_full.shape[0]))
    print("Recommending the highest {0} predicted ratings movies not already rated.".format(num_recommendations))
    
    recommendations = (movies_df[~movies_df['movie_id'].isin(user_full['movie_id'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movie_id',
               right_on = 'movie_id').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

movies_df = items.rename(index=str, columns={"movie id": "movie_id"})
already_rated, predictions = recommend_movies(preds_df, 500, movies_df, ratings, 10)

User 500 has already rated 225 movies.
Recommending the highest 10 predicted ratings movies not already rated.


In [116]:
already_rated.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,movie title,release date,video release date,IMDb URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
112,500,268,5,883864840,Chasing Amy (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Chasing+Amy+(...,0,0,...,0,0,0,0,0,1,0,0,0,0
199,500,170,5,883874446,Cinema Paradiso (1988),01-Jan-1988,,http://us.imdb.com/M/title-exact?Nuovo%20cinem...,0,0,...,0,0,0,0,0,1,0,0,0,0
42,500,56,5,883873976,Pulp Fiction (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Pulp%20Fictio...,0,0,...,0,0,0,0,0,0,0,0,0,0
41,500,276,5,883865290,Leaving Las Vegas (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Leaving%20Las...,0,0,...,0,0,0,0,0,1,0,0,0,0
140,500,1160,5,883865483,Love! Valour! Compassion! (1997),16-May-1997,,http://us.imdb.com/Title?Love%21+Valour%21+Com...,0,0,...,0,0,0,0,0,1,0,0,0,0


In [117]:
predictions

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
363,508,"People vs. Larry Flynt, The (1996)",27-Dec-1996,,http://us.imdb.com/M/title-exact?People%20vs.%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
113,176,Aliens (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Aliens%20(1986),0,1,0,0,0,...,0,0,0,0,0,0,1,1,1,0
181,273,Heat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Heat%20(1995),0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
487,654,Chinatown (1974),01-Jan-1974,,http://us.imdb.com/M/title-exact?Chinatown%20(...,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
373,521,"Deer Hunter, The (1978)",01-Jan-1978,,http://us.imdb.com/M/title-exact?Deer%20Hunter...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
466,631,"Crying Game, The (1992)",01-Jan-1992,,http://us.imdb.com/M/title-exact?Crying%20Game...,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
300,433,Heathers (1989),01-Jan-1989,,http://us.imdb.com/M/title-exact?Heathers%20(1...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
282,410,Kingpin (1996),12-Jul-1996,,http://us.imdb.com/M/title-exact?Kingpin%20(1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
567,748,"Saint, The (1997)",14-Mar-1997,,http://us.imdb.com/M/title-exact?Saint%2C%20Th...,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
212,324,Lost Highway (1997),21-Feb-1997,,http://us.imdb.com/Title?Lost+Highway+(1997),0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
