# Movie Recommendation

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

movies = pd.read_csv("movies.csv", encoding = "ISO-8859-1")
users = pd.read_csv("users.csv")
ratings = pd.read_csv("ratings.csv")

In [3]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
95,96,In the Bleak Midwinter (1995),Comedy
96,97,Hate (Haine,
97,98,Shopping (1994),Action|Thriller
98,99,Heidi Fleiss: Hollywood Madam (1995),Documentary


In [5]:
users

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_desc,occ_desc
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,56+,self-employed
2,3,M,25,15,55117,25-34,scientist
3,4,M,45,7,2460,45-49,executive/managerial
4,5,M,25,20,55455,25-34,writer
...,...,...,...,...,...,...,...
95,96,F,25,16,78028,25-34,self-employed
96,97,F,35,3,66210,35-44,clerical/admin
97,98,F,35,7,33547,35-44,executive/managerial
98,99,F,1,10,19390,Under 18,K-12 student


In [6]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1,3
1,1,3,5
2,1,4,4
3,1,5,3
4,1,6,3
...,...,...,...
7507,100,96,4
7508,100,97,4
7509,100,98,5
7510,100,99,5


## 1. Content-based Recommendation Model

### Find list of used genres which is used to category the movies.

In [25]:
movieGenres = [x.split('|') for x in list(movies["genres"].dropna())]

listGen = []

for movie in movieGenres:
    for genre in movie:
        if genre not in listGen:
            listGen.append(genre)
            
print(listGen)

['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Sci-Fi', 'Documentary', 'War', 'Musical']


### Vectorize the relationship between movies and genres and put them into Ij

In [77]:
Ij = []

# for genres in movies["genres"].dropna():
for genres in movies["genres"]:

    vector = np.zeros(len(listGen))
    
    # If string of genres, add them to vector
    if( isinstance(genres, str) ):
        for genre in genres.split("|"):
            vector[listGen.index(genre)] = 1
            
    # Else just add zero'd vector
    Ij.append(vector)

Ij[:4]


[array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])]

### Vectorize the relationship between users and genres and put them into Uj (if user rate for a movie, he/she has the related history with the movie's genres

In [87]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1,3
1,1,3,5
2,1,4,4
3,1,5,3
4,1,6,3
...,...,...,...
7507,100,96,4
7508,100,97,4
7509,100,98,5
7510,100,99,5


In [137]:
Uj = [np.zeros(len(listGen))] * 100

for rating in ratings.iterrows():
    user_id, movie_id, rating = rating[1][0], rating[1][1], rating[1][2]
    Uj[user_id-1] = Ij[movie_id-1] + Uj[user_id-1]

# Bring back to 1's and 0's
for user in Uj:
    for i, genre in enumerate(user):
        if(genre > 0):
            user[i] = 1

In [139]:
Uj[:4]

[array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.])]

### Compute the cosine similarity between movies and users

In [142]:
cosine_similarity(Uj,Ij)

array([[0.46291005, 0.46291005, 0.37796447, ..., 0.37796447, 0.26726124,
        0.37796447],
       [0.46291005, 0.46291005, 0.37796447, ..., 0.37796447, 0.26726124,
        0.37796447],
       [0.4472136 , 0.4472136 , 0.36514837, ..., 0.36514837, 0.25819889,
        0.36514837],
       ...,
       [0.46291005, 0.46291005, 0.37796447, ..., 0.37796447, 0.26726124,
        0.37796447],
       [0.4472136 , 0.4472136 , 0.36514837, ..., 0.36514837, 0.25819889,
        0.36514837],
       [0.4472136 , 0.4472136 , 0.36514837, ..., 0.36514837, 0.25819889,
        0.36514837]])

## b. Collaborative Filtering Recommendation Model by Users

### Use test_train_split to split above dataset with ratio 50/50. The test dataset will be used as groundtruth to evaluate the rating calculated by using the train dataset

In [274]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(ratings, test_size = 0.5, random_state = 123456)

### Create matrix for users, movies and ratings in both training and testing datasets

In [275]:
train_data_matrix = train_data.pivot_table(index='movie_id', columns='user_id',
values='rating').astype('float64')
test_data_matrix = test_data.pivot_table(index='movie_id', columns='user_id',
values='rating').astype('float64')

In [289]:
train_data_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,,3.0,,2.0,5.0,,,4.0,,...,,,4.0,4.0,,3.0,4.0,4.0,4.0,
2,,4.0,,3.0,,,,1.0,,,...,,,,,,,,3.0,,4.0
3,,5.0,4.0,,,4.0,,,5.0,3.0,...,4.0,,,,3.0,,5.0,5.0,,
4,,,,,4.0,,,,,,...,2.0,,,,,,,5.0,,5.0
5,3.0,5.0,3.0,,,,,4.0,,4.0,...,1.0,5.0,,,,1.0,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,3.0,,4.0,4.0,2.0,3.0,,,1.0,2.0,...,2.0,,3.0,,,3.0,5.0,,4.0,4.0
97,4.0,,,,5.0,,,4.0,,4.0,...,4.0,,,5.0,,,,,,
98,,,,,5.0,,,,4.0,,...,,5.0,,5.0,,5.0,4.0,,,5.0
99,,,,4.0,,,,,,,...,4.0,3.0,4.0,,,,4.0,,,5.0


In [290]:
test_data_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,,1.0,,,,,,5.0,...,1.0,,,,3.0,,,,,4.0
2,,,1.0,,4.0,2.0,3.0,,,4.0,...,4.0,5.0,,,4.0,4.0,,,,
3,5.0,,,3.0,4.0,,4.0,3.0,,,...,,5.0,2.0,5.0,,,,,,4.0
4,4.0,,3.0,,,,4.0,,5.0,4.0,...,,4.0,,,3.0,4.0,4.0,,,
5,,,,3.0,,,2.0,,,,...,,,4.0,1.0,,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,,,,,,,5.0,5.0,,,...,,,,,,,,4.0,,
97,,2.0,,,,,5.0,,3.0,,...,,,3.0,,5.0,,5.0,5.0,4.0,4.0
98,5.0,,,1.0,,4.0,,,,2.0,...,3.0,,4.0,,3.0,,,4.0,,
99,,4.0,4.0,,,,3.0,2.0,,4.0,...,,,,,3.0,2.0,,,3.0,


### Calculate the user correlation

#### Helper functions

In [305]:
# Functions

import math

def cosine_similarity(v1,v2, metric='cosine'):
    #metric: cosine or correlation
    if metric == 'correlation':
        v1 = v1 - np.nanmean(v1)
        v2 = v2 - np.nanmean(v2)
    "compute similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        if np.isnan(x) or np.isnan(y): continue
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

def sim_matrix(M, dimension='user', metric='cosine'):
    N = M.shape[0] if dimension == 'user' else M.shape[1]
    sim = np.zeros([N,N])
    for i in range(N):
        for j in range(N):
            if i == j:
                sim[i,j] = 0 #Cancel out the effect of self-similarity in the sums later
                continue
            if dimension == 'user':
                # If user, vector 1 and vector 2 = current iterations (i, j) of users to compare
                v1, v2 = M[i,:], M[j,:]
            else:
                # If item, vector 1 and vector 2 = current iterations (i, j) of drinks to compare
                v1, v2 = M[:,i], M[:,j]
                
            sim[i][j] = cosine_similarity(v1,v2,metric)
    return sim

def user_cf(M, metric='cosine'):
    pred = np.copy(M)
    n_users, n_items = M.shape
    # Average rating each user gave movies (ignoring nans)
    avg_ratings = np.nanmean(M, axis=1)
    sim_users = sim_matrix(M, 'user', metric)
    
    
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M[i,j]):

                # Can include self sim because 0 will cancel itself
                similarities = sim_users[i]
                item_ratings = M[:, j]
                
                numerator = np.nansum(similarities * (item_ratings - avg_ratings))
                denominator = np.nansum(similarities)
                mean_rating = avg_ratings[i]
                
                pred[i,j] = mean_rating + (numerator / denominator)

    return pred

def item_cf(M, metric='cosine'):
    pred = np.copy(M)
    n_users, n_items = M.shape
    avg_ratings = np.nanmean(M, axis=0)
    sim_items = sim_matrix(M, 'item', metric)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M[i,j]):
                
                similarities = sim_items[j]

                users_other_ratings = M[i,:]

                mean_rating = avg_ratings[j]
                numerator = np.nansum((sim_items[j] * (users_other_ratings - avg_ratings)))
                denominator = np.nansum(sim_items[j])
                
                pred[i,j] = mean_rating + (numerator / denominator)
                
    return pred

In [280]:
# User correlation on training set
train_sim_matrix = sim_matrix(train_data_matrix.values, metric="correlation")
train_sim_matrix

array([[ 0.        ,  0.4231326 ,  0.44835858, ..., -0.38533322,
        -0.22289839, -0.00777007],
       [ 0.4231326 ,  0.        ,  0.05420251, ..., -0.0226774 ,
         0.21959132, -0.49098413],
       [ 0.44835858,  0.05420251,  0.        , ..., -0.44455077,
        -0.45963116, -0.21206669],
       ...,
       [-0.38533322, -0.0226774 , -0.44455077, ...,  0.        ,
         0.52830199, -0.25197265],
       [-0.22289839,  0.21959132, -0.45963116, ...,  0.52830199,
         0.        , -0.00420387],
       [-0.00777007, -0.49098413, -0.21206669, ..., -0.25197265,
        -0.00420387,  0.        ]])

###  Implement a predict based on user correlation coefficient. Predict on train dataset and compare the RMSE with the test dataset

In [287]:
user_cf_predictions = user_cf(train_data_matrix.values, metric='correlation')

In [288]:
user_cf_predictions

array([[ 3.        ,  6.76366058,  3.        , ...,  4.        ,
         4.        , -2.36724172],
       [ 2.80627473,  4.        ,  3.59196911, ...,  3.        ,
         3.52268594,  4.        ],
       [ 1.50663676,  5.        ,  4.        , ...,  5.        ,
         2.21390752,  3.8473933 ],
       ...,
       [ 2.21106587,  2.87866461,  1.60216088, ...,  4.42937736,
         4.67372597,  5.        ],
       [ 3.23805007,  3.80209189,  3.70042734, ...,  3.67423268,
         3.76390872,  5.        ],
       [ 1.        ,  2.81481683,  2.87499252, ...,  3.10059552,
         3.        ,  5.77897328]])

In [304]:
rmse_user_cf = np.nanmean( ((user_cf_predictions - test_data_matrix) ** 2) ) ** .5

In [306]:
item_cf_predictions = item_cf(train_data_matrix.values, metric='correlation')

In [307]:
item_cf_predictions

array([[  3.        ,   5.78253977,   3.        , ...,   4.        ,
          4.        ,   3.07210719],
       [  4.31669089,   4.        ,   4.26248587, ...,   3.        ,
          3.33841486,   4.        ],
       [  3.36766991,   5.        ,   4.        , ...,   5.        ,
          3.98285826,   3.42532185],
       ...,
       [  3.63457486,  30.58568767,   2.50054283, ...,  -0.5734696 ,
          3.88918724,   5.        ],
       [  4.58585508, -24.10580904,   3.42486355, ...,   0.62657072,
          3.76109981,   5.        ],
       [  1.        , -25.36915244,   4.16681702, ...,   0.92284767,
          3.        ,   2.8391143 ]])

In [308]:
rmse_item_cf = np.nanmean( ((item_cf_predictions - test_data_matrix) ** 2) ) ** .5

In [309]:
rmse_item_cf

28.655561131036478

### From the results we can conclude that in this scenario, more accurate results are found when predicting via <br>🏆 user-based 🏆<br>  (correlation) based collaborative filtering