In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
#Load the ratings and the movie catalogue of the "Small" dataset.
#pd.read_csv replaces DataFrame.from_csv, which was deprecated and later removed from pandas.
ratings = pd.read_csv('Small/ratings.csv') #Default integer index; later cells read the userId, movieId, rating and timestamp columns
movies = pd.read_csv('Small/movies.csv', index_col='movieId') #Indexed by movieId; later cells read the title and genres columns

In [3]:
#Step 1: Keep only ratings made on/after Aug 1, 2003, by users and for movies that have enough ratings
recent = ratings[ratings.timestamp >= 1059696000] #1059696000 is the UTC timestamp for Aug 1, 2003 at GMT

#Number of retained ratings made by the user behind each row
ratings_per_user = recent.userId.map(recent.userId.value_counts())
#Weed out suspect users (more than 2000 ratings) and those with too few (fewer than 40).
#Lower threshold chosen to ensure user presence in all data partitions.
active = recent[(ratings_per_user >= 40) & (ratings_per_user <= 2000)]

#Number of remaining ratings received by the movie behind each row
ratings_per_movie = active.movieId.map(active.movieId.value_counts())
#Filter out movies with too few ratings. Threshold chosen to ensure movie presence in all data partitions.
popular = active[ratings_per_movie >= 35]

#Remove excess data: drop the timestamp, put movieId and userId first and renumber the rows from 0
leading = ['movieId', 'userId']
trailing = [c for c in popular.columns if c not in leading and c != 'timestamp']
r = popular[leading + trailing].reset_index(drop=True)

r.shape

(20816, 3)

In [4]:
#Ensuring that the train/test partition produces datasets that pivot to the same dimensions (include all movies and all users).
#pivot is called with keyword arguments: pandas >= 2.0 no longer accepts them positionally.
r_piv = r.pivot(index='movieId', columns='userId', values='rating') #Full movie x user rating matrix (NaN where unrated)
same_shape = False
while not same_shape: #Re-draw the random split until both partitions contain every movie and every user
    train, test = train_test_split(r, train_size = 0.80) #Randomly partitions data into 80-20 training-test split
    trainm = train.pivot(index='movieId', columns='userId', values='rating')
    testm = test.pivot(index='movieId', columns='userId', values='rating')
    same_shape = (trainm.shape == r_piv.shape) and (testm.shape == r_piv.shape)

k = 4 #Setting k for k-fold cross validation of the training partition
n = len(train)//k #Establishing size of each fold (except last) for k-fold cross validation

ind = r_piv.index #Storing movieId index for future use
col = r_piv.columns #Storing userId column names for future use
nmovies, nusers = r_piv.shape #Store dimensions of the data
mu = trainm.mean(axis = 1) #Store average rating per movie: used in mean normalization and RMSE calculations

In [5]:
#Ensuring that all k attempts at k-fold validation will use matrices with matching dimensions.
#The long-form training data is reshuffled until every fold, and every complement of a fold,
#pivots to the same movie x user shape as the full training matrix.
dim_check = False
while not dim_check:
    shuffled_train = train.reindex(np.random.permutation(train.index)) #Randomize order of data (long form)
    dim_check = True #Assume success; cleared below as soon as one fold fails the check
    for num in range(0,k):
        #Establish dataset for cross-validation for each k (rows are selected by position, hence .iloc)
        if num != k-1:
            cv = shuffled_train.iloc[num*n:(num+1)*n]
        else: #Final k may contain different number of entries due to remainder when calculating n
            cv = shuffled_train.iloc[num*n:]
        tr = shuffled_train.drop(cv.index) #Training partition without cross-validation: used to train model for each k
        #Keyword arguments: pandas >= 2.0 no longer accepts pivot's arguments positionally
        trm = tr.pivot(index='movieId', columns='userId', values='rating')
        cvm = cv.pivot(index='movieId', columns='userId', values='rating')

        #if the dimensions of data matrix for all k values don't match, rerandomize long form data, split, and try again
        if trm.shape != trainm.shape or cvm.shape != trainm.shape:
            dim_check = False
            break #exits for loop early; the while loop then draws a new permutation

In [6]:
#Grid search over number of features and regularization strength, scored by k-fold cross validation.
#For every (trial, nfeatures, reg) a matrix factorization is trained by batch gradient descent on each
#fold's training part; the score is the average % reduction in RMSE versus predicting the movie mean.
Results = []
for trial in range(1,4): #Three trials to account for random nature of feature initialization
    for nfeatures in [200,250,300,350,400]: #For loop to optimize for number of features used in final models
        for reg in [2.5,5,10]: #For loop to optimize the regularization parameter used in final models
            alpha = .003 #learning rate; reset per (nfeatures, reg) pair, so any halving below carries over to the later folds
            cum_red = 0 #Stores cumulative reduction in RMSE across all k's when doing cross-validation
            for num in range(0,k): #For loop for k-fold cross validation
                X = np.random.randn(nmovies,nfeatures) #Random initialization of movie features
                Theta = np.random.randn(nusers,nfeatures) #Random initialization of user features

                #Build training and cross-validation matrices for this k (rows are selected by position, hence .iloc)
                if num != k-1:
                    cv = shuffled_train.iloc[num*n:(num+1)*n]
                else:
                    cv = shuffled_train.iloc[num*n:]
                tr = shuffled_train.drop(cv.index)
                #Keyword arguments: pandas >= 2.0 no longer accepts pivot's arguments positionally
                trm = tr.pivot(index='movieId', columns='userId', values='rating')
                cvm = cv.pivot(index='movieId', columns='userId', values='rating')

                trm2 = trm.subtract(mu,axis=0) #Mean normalization: so that the regularization drives non-existent ratings towards the mean rating, rather than 0
                R = np.asarray(~np.isnan(trm2)) #Matrix containing boolean for rated/not rated for each movie user pairing
                trm2 = np.asarray(trm2.fillna(0)) #Replaces missing data with 0's

                delta = 1 #Stores relative change in each iteration's cost function

                #Prediction error on the rated entries only; computed once and reused by the cost and both gradients
                err = np.multiply(np.dot(X,np.transpose(Theta))-trm2,R)
                J_new = np.sum(err**2)/2 + reg/2*(np.sum(Theta**2)+np.sum(X**2)) #First initialization of cost function
                while delta >= 0.001: #Iterates until cost function changes by < 0.1%
                    J_old = J_new
                    X_prev, Theta_prev, err_prev = X, Theta, err #Kept so a rejected step can be rolled back exactly
                    X_grad = np.dot(err,Theta) + reg*X #Determine gradient for all movie features
                    Theta_grad = np.dot(np.transpose(err),X) + reg*Theta #Determine gradient for all user features
                    X = X-alpha*X_grad #Movie feature update
                    Theta = Theta-alpha*Theta_grad #User feature update
                    err = np.multiply(np.dot(X,np.transpose(Theta))-trm2,R)
                    J_new = np.sum(err**2)/2 + reg/2*(np.sum(Theta**2)+np.sum(X**2)) #Recalculate cost function
                    delta = (J_old-J_new)/J_old #Determine change in cost function
                    if not (delta >= 0): #Cost increased, or became NaN/inf: reject the step and reduce learning rate
                        X, Theta, err = X_prev, Theta_prev, err_prev #Restores the features from before the update
                        J_new = J_old
                        alpha = alpha/2 #Reduces learning rate by half
                        delta=1 #Ensures re-entry into the while loop

                Predictions = pd.DataFrame(data = np.dot(X,np.transpose(Theta)),index = ind, columns = col) #Generate normalized rating predictions: dot product of movie and user features
                Predictions = Predictions.add(mu,axis = 0) #Add back mean to undo mean normalization

                #pandas sums skip NaN, so only the ratings present in this fold contribute to the errors
                RMSE_avg = math.sqrt(((cvm.subtract(mu,axis = 0))**2).sum().sum()/len(cv)) #RMSE of comparison model - give each user the movie's average rating
                RMSE_alg = math.sqrt(((Predictions - cvm)**2).sum().sum()/len(cv)) #RMSE of currently trained model
                cum_red += (RMSE_avg - RMSE_alg)/RMSE_avg*100 #Adds % reduction in RMSE for this k to running total

            red = cum_red/k #Dividing by k gives the average % reduction in RMSE to be expected from this Trial, number of features, and regularization parameter

            Results.append({'Trial': trial, 'Features': nfeatures, 'Reg. Parameter': reg, '% Reduction in RMSE': red}) #Store results for future inspection

#One row per (Features, Reg. Parameter), one column per Trial
Results = pd.DataFrame(Results)
Results = Results.set_index(['Trial','Features','Reg. Parameter'])
Results = Results.unstack('Trial')

In [7]:
#Display the grid-search table: rows are (Features, Reg. Parameter) pairs, columns are the trials
Results

Unnamed: 0_level_0,Unnamed: 1_level_0,% Reduction in RMSE,% Reduction in RMSE,% Reduction in RMSE
Unnamed: 0_level_1,Trial,1,2,3
Features,Reg. Parameter,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
200,2.5,5.189865,5.313059,5.47957
200,5.0,6.580062,6.565382,6.511243
200,10.0,5.362979,5.34086,5.323765
250,2.5,5.85487,5.645883,5.683184
250,5.0,6.670371,6.679894,6.61962
250,10.0,5.352673,5.389483,5.384676
300,2.5,5.881034,6.048329,6.122294
300,5.0,6.707431,6.655936,6.647341
300,10.0,5.358235,5.41375,5.416297
350,2.5,4.904324,5.004651,4.834965


In [8]:
#Mean % reduction in RMSE of each (Features, Reg. Parameter) pair, taken across the trial columns
avg_red = Results.mean(axis=1)
avg_red

Features  Reg. Parameter
200       2.5               5.327498
          5.0               6.552229
          10.0              5.342535
250       2.5               5.727979
          5.0               6.656628
          10.0              5.375611
300       2.5               6.017219
          5.0               6.670236
          10.0              5.396094
350       2.5               4.914647
          5.0               5.982419
          10.0              4.947351
400       2.5               5.156570
          5.0               6.084031
          10.0              4.986154
dtype: float64

In [9]:
#Determine best pairing of number of features and regularization parameter.
#idxmax returns the index label, here a (Features, Reg. Parameter) tuple; np.argmax on a Series
#returns an integer position on current pandas, which cannot be unpacked into two names.
nfeatures, reg = avg_red.idxmax()
nfeatures, reg

(300, 5.0)

In [10]:
#Train the final model on the whole 80% partition with the selected nfeatures and reg,
#and report the % reduction in RMSE on the held-out 20% versus predicting each movie's mean rating.
trainm2 = trainm.subtract(mu, axis = 0) #Mean normalization of 80% data partition
R = np.asarray(~np.isnan(trainm2)) #Matrix containing boolean for rated/not rated for each movie user pairing
trainm2 = np.asarray(trainm2.fillna(0)) #Replace missing data with 0's

Results = []
for num in range(0,5): #Due to random initializations leading to local minima, we need 5 trials so we can report a more accurate % reduction in RMSE
    X = np.random.randn(nmovies,nfeatures) #Random initialization of movie features
    Theta = np.random.randn(nusers,nfeatures) #Random initialization of user features

    alpha = .003 #learning rate

    delta = 1 #Stores relative change in each iteration's cost function
    iter = 0 #Iteration counter for the cost history below. Shadows the builtin; name kept so code outside this cell that reads it keeps working
    J = [] #Cost history, in case we wish to visualize the decaying magnitude of the cost function
    #Prediction error on the rated entries only; computed once and reused by the cost and both gradients
    err = np.multiply(np.dot(X,np.transpose(Theta))-trainm2,R)
    J_new = np.sum(err**2)/2 + reg/2*(np.sum(Theta**2)+np.sum(X**2)) #First initialization of cost function
    J.append({'Iteration': iter, 'Cost': J_new})
    while delta >= 0.001: #Iterates until cost function changes by < 0.1%
        J_old = J_new
        iter = iter+1
        X_prev, Theta_prev, err_prev = X, Theta, err #Kept so a rejected step can be rolled back exactly
        X_grad = np.dot(err,Theta) + reg*X #Determine gradients for movie features
        Theta_grad = np.dot(np.transpose(err),X) + reg*Theta #Determine gradients for user features
        X = X-alpha*X_grad #Movie feature update
        Theta = Theta-alpha*Theta_grad #User feature update
        err = np.multiply(np.dot(X,np.transpose(Theta))-trainm2,R)
        J_new = np.sum(err**2)/2 + reg/2*(np.sum(Theta**2)+np.sum(X**2)) #Recalculating cost function after the update
        J.append({'Iteration': iter, 'Cost': J_new}) #Recorded before the check below, so rejected steps appear in the history too
        delta = (J_old-J_new)/J_old #Determine change in cost function
        if not (delta >= 0): #Cost increased, or became NaN/inf: reject the step and reduce learning rate
            X, Theta, err = X_prev, Theta_prev, err_prev #Restores the features from before the update
            J_new = J_old
            alpha = alpha/2
            delta=1 #Ensures re-entry into the while loop

    Predictions = pd.DataFrame(data = np.dot(X,np.transpose(Theta)),index = ind, columns = col) #Generate normalized rating predictions: dot product of movie and user features
    Predictions = Predictions.add(mu,axis = 0) #Add back mean to undo mean normalization

    n_obs= len(test)
    #pandas sums skip NaN, so only the ratings present in the test partition contribute to the errors
    RMSE_avg = math.sqrt(((testm.subtract(mu,axis = 0))**2).sum().sum()/n_obs) #RMSE of comparison model - give each user the movie's average rating
    RMSE_alg = math.sqrt(((Predictions - testm)**2).sum().sum()/n_obs) #RMSE of currently trained model
    red = (RMSE_avg - RMSE_alg)/RMSE_avg*100 #% reduction in RMSE for this trial

    Results.append(red) #Store results for future inspection

Results

[7.806876777111736,
 8.03719554718172,
 7.723017575423735,
 7.659904312779712,
 8.01878428993179]

In [11]:
#Results is the list of per-trial % reductions in test RMSE built by the final-training cell
np.mean(Results) #Average % Reduction in RMSE for reporting purposes

7.8491557004857384

In [12]:
#pd.read_csv replaces DataFrame.from_csv, which was deprecated and later removed from pandas.
my_ratings = pd.read_csv('My Ratings.csv', index_col='movieId') #Loads my movie ratings, indexed by movieId
my_rat_mean_norm = my_ratings.subtract(mu,axis = 0) #Mean normalization of my ratings (aligned with mu on movieId)
R = np.asarray(~np.isnan(my_rat_mean_norm)) #Boolean mask: True where I rated the movie
my_rat_mean_norm = np.asarray(my_rat_mean_norm.fillna(0)) #Replaces missing ratings with 0's
my_features = np.random.randn(1,nfeatures) #Random initialization of my user features

In [13]:
alpha = .003 #learning rate for upcoming linear regression

def _fit_user_features(features, movie_features, targets, rated, reg, alpha):
    """Fit one user's feature row by batch gradient descent on a regularized least-squares cost.

    The cost is sum((movie_features . features^T - targets)^2 over rated entries)/2
    + reg/2 * sum(features^2). Descent starts from `features` and stops as soon as the relative
    change of the cost drops below 0.1%. There is no step-size backoff here: a step that
    increases the cost also ends the loop.

    Parameters
    ----------
    features : array, starting point of the descent (not modified in place).
    movie_features : array of fixed movie features, one row per movie.
    targets : array of mean-normalized ratings with 0 where unrated.
    rated : boolean array, same shape as targets, True where a rating exists.
    reg : regularization parameter.
    alpha : learning rate.

    Returns
    -------
    The updated feature array, same shape as `features`.
    """
    err = np.multiply(np.dot(movie_features,np.transpose(features))-targets,rated) #Error on rated movies only
    J_new = np.sum(err**2)/2 + reg/2*np.sum(features**2) #Linear regression cost function
    delta = 1
    while delta >= 0.001:
        J_old = J_new
        grad = np.dot(np.transpose(err),movie_features) + reg*features #Gradient for the user features
        features = features - alpha*grad #Update the user features
        err = np.multiply(np.dot(movie_features,np.transpose(features))-targets,rated)
        J_new = np.sum(err**2)/2 + reg/2*np.sum(features**2) #Recalculate cost function
        delta = (J_old-J_new)/J_old
    return features

#Optimizing linear regression to learn my user features.
#No trials required because regardless of the initialization, the result should be a global minimum for that regularization parameter.
#Each fit starts from the features left by the previous one.
best_RMSE = float('inf') #Use RMSE to determine best fit
best_reg = None
for reg in [1,2.5,5,10]:
    my_features = _fit_user_features(my_features, X, my_rat_mean_norm, R, reg, alpha)
    my_predictions = pd.DataFrame(data = np.dot(X,np.transpose(my_features)),index = ind) #Generate normalized predictions based on my features
    my_predictions = my_predictions.add(mu,axis = 0) #Re-add mean to undo mean normalization
    #RMSE over the movies I rated: nansum ignores the movies without a rating of mine
    RMSE = float(np.sqrt(np.nansum((my_predictions.values-my_ratings.values)**2)/np.sum(R)))
    #Store best regularization parameter
    if RMSE < best_RMSE:
        best_RMSE = RMSE
        best_reg = reg

if best_reg is None: #Only possible when every RMSE above was NaN
    raise ValueError('No regularization parameter produced a finite RMSE')

#Re-determine my features using the best regularization parameter
reg = best_reg
my_features = _fit_user_features(my_features, X, my_rat_mean_norm, R, reg, alpha)
my_predictions = pd.DataFrame(data = np.dot(X,np.transpose(my_features)),index = ind,columns=['Predicted Rating']) #Generate normalized predictions based on my features
my_predictions = my_predictions.add(mu,axis = 0).merge(movies,left_index = True, right_index = True) #Re-add mean to undo mean normalization and add in movie titles
my_predictions.sort_values(by='Predicted Rating',ascending=False).head(10) #Sort my predicted ratings in descending order

Unnamed: 0_level_0,Predicted Rating,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
318,5.100394,"Shawshank Redemption, The (1994)",Crime|Drama
48516,5.062102,"Departed, The (2006)",Crime|Drama|Thriller
2959,5.032985,Fight Club (1999),Action|Crime|Drama|Thriller
50,4.928823,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
58559,4.919431,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
79132,4.888581,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
3275,4.887415,"Boondock Saints, The (2000)",Action|Crime|Drama|Thriller
7361,4.883088,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi
68157,4.876789,Inglourious Basterds (2009),Action|Drama|War
47,4.846612,Seven (a.k.a. Se7en) (1995),Mystery|Thriller


In [14]:
#PCA on movie features: used to make features linearly uncorrelated. Also results in dimensionality reduction and data compression.
#A fractional n_components asks scikit-learn to keep just enough components to explain that share of the
#variance, so a single fit replaces refitting PCA with 1, 2, 3, ... components until 95% is reached.
pca = PCA(n_components=0.95) #Want to retain 95% of variance in movie features
movie_pca = pd.DataFrame(pca.fit_transform(X), index = ind) #Principal components of the movie features, one row per movieId
var_explained = np.sum(pca.explained_variance_ratio_) #How much of the variance was retained after PCA

#PCA on user features: used to make features linearly uncorrelated. Also results in dimensionality reduction and data compression. Goes unused for now.
pca = PCA(n_components=0.95)
user_pca = pd.DataFrame(pca.fit_transform(Theta), index = col) #Principal components of the user features, one row per userId
var_explained = np.sum(pca.explained_variance_ratio_)

In [15]:
#Normalize movie features from 0 to 1, a.k.a. Feature Scaling
col_min = movie_pca.min(axis = 0)
movie_pca_norm = (movie_pca-col_min)/(movie_pca.max(axis = 0) - col_min)

#Store Normalized Euclidean distance from every other movie based on principal components of the learned features.
#One vectorized pass per movie: dist[i, j] is the distance between the i-th and j-th rows of movie_pca_norm.
feats = movie_pca_norm.values
dist = np.empty((len(feats), len(feats)))
for row, vec in enumerate(feats):
    dist[row] = np.sqrt(np.sum((feats - vec)**2, axis = 1))

#Establish easily queriable collection of distances: d.loc[movieId1] lists the distance to every movieId2
pairs = pd.MultiIndex.from_product([ind, ind], names=['movieId1','movieId2']) #Same row-major order as dist.ravel()
d = pd.DataFrame({'distance': dist.ravel()}, index = pairs)
d = d[d.distance != 0] #Removes pairing that contains 2 instances of the same movie

In [16]:
d_mov = 4896 #movieId from example (Harry Potter and the Philosopher's Stone)

#Every other movie ordered by distance to d_mov, with title and genres attached from the catalogue
closest = (
    d.loc[d_mov]
    .sort_values('distance')
    .merge(movies, left_index=True, right_index=True)
    .sort_values('distance')
)
#Rescale so the farthest movie sits at 1: easier to reason about weights applied to other scores
closest['distance'] = closest['distance']/closest['distance'].max()

print("Movies closest to %s:" % movies.loc[d_mov, 'title'])
closest.head(10)

Movies closest to Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001):


Unnamed: 0,distance,title,genres
4262,0.658669,Scarface (1983),Action|Crime|Drama
5816,0.667807,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
40815,0.701819,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
49272,0.705333,Casino Royale (2006),Action|Adventure|Thriller
55765,0.705663,American Gangster (2007),Crime|Drama|Thriller
2001,0.706265,Lethal Weapon 2 (1989),Action|Comedy|Crime|Drama
30707,0.707496,Million Dollar Baby (2004),Drama
36529,0.714837,Lord of War (2005),Action|Crime|Drama|Thriller|War
1617,0.715301,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
81845,0.716941,"King's Speech, The (2010)",Drama


In [17]:
filter_applied = 'Adventure'

#Every other movie ordered by distance to d_mov, with title and genres attached from the catalogue
closest_filt = (
    d.loc[d_mov]
    .sort_values('distance')
    .merge(movies, left_index=True, right_index=True)
    .sort_values('distance')
)
#Rescale so the farthest movie sits at 1 (done before filtering, so the scale matches the unfiltered list)
closest_filt['distance'] = closest_filt['distance']/closest_filt['distance'].max()
#Keep only the movies whose genre string mentions the requested genre
in_genre = closest_filt['genres'].str.contains(filter_applied)
closest_filt = closest_filt[in_genre]

print("Movies closest to %s" % movies.loc[d_mov, 'title'])
print("Filtered for %s movies:" % filter_applied)
closest_filt.head(10)

Movies closest to Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Filtered for Adventure movies:


Unnamed: 0,distance,title,genres
5816,0.667807,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
40815,0.701819,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
49272,0.705333,Casino Royale (2006),Action|Adventure|Thriller
91529,0.721531,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
1127,0.725988,"Abyss, The (1989)",Action|Adventure|Sci-Fi|Thriller
1370,0.731256,Die Hard 2 (1990),Action|Adventure|Thriller
2,0.748839,Jumanji (1995),Adventure|Children|Fantasy
3000,0.749304,Princess Mononoke (Mononoke-hime) (1997),Action|Adventure|Animation|Drama|Fantasy
3114,0.754424,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2890,0.754742,Three Kings (1999),Action|Adventure|Comedy|Drama|War


In [18]:
filter_applied = 'Fantasy'
filter2 = 'Adventure'

#Every other movie ordered by distance to d_mov, with title and genres attached from the catalogue
closest_filt = (
    d.loc[d_mov]
    .sort_values('distance')
    .merge(movies, left_index=True, right_index=True)
    .sort_values('distance')
)
#Rescale so the farthest movie sits at 1 (done before filtering, so the scale matches the unfiltered list)
closest_filt['distance'] = closest_filt['distance']/closest_filt['distance'].max()
#Keep only the movies whose genre string mentions both requested genres
has_first = closest_filt['genres'].str.contains(filter_applied)
has_second = closest_filt['genres'].str.contains(filter2)
closest_filt = closest_filt[has_first & has_second]

print("Movies closest to %s" % movies.loc[d_mov, 'title'])
print("Filtered for %s and %s movies:" % (filter_applied, filter2))
closest_filt.head(10)

Movies closest to Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Filtered for Fantasy and Adventure movies:


Unnamed: 0,distance,title,genres
5816,0.667807,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
40815,0.701819,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
2,0.748839,Jumanji (1995),Adventure|Children|Fantasy
3000,0.749304,Princess Mononoke (Mononoke-hime) (1997),Action|Adventure|Animation|Drama|Fantasy
3114,0.754424,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
3052,0.776401,Dogma (1999),Adventure|Comedy|Fantasy
2115,0.788015,Indiana Jones and the Temple of Doom (1984),Action|Adventure|Fantasy
8368,0.80399,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
919,0.824148,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical
2005,0.825043,"Goonies, The (1985)",Action|Adventure|Children|Comedy|Fantasy


In [48]:
#Genre Similarity: What % of genres in the movie of interest are also found in the recommended movies
weight = 0.1 #For fine tuning results to user preference

#Genres of the movie of interest, with IMAX left out of the comparison
gens = movies.loc[d_mov, 'genres'].split('|')
if 'IMAX' in gens:
    gens.remove('IMAX')

#Distances from d_mov to every other movie, with title and genres attached
closest_gensim = pd.merge(d.loc[d_mov], movies, left_index=True, right_index=True)

#Weighted genre similarity score per candidate: share of the reference genres named in its genre string
simscore = [
    sum(1 for g in gens if g in candidate_genres)/len(gens)*weight
    for candidate_genres in closest_gensim['genres']
]

#Rescale distances so the farthest movie sits at 1, then let the similarity score pull matching movies closer
scaled = closest_gensim['distance']/closest_gensim['distance'].max()
closest_gensim['distance'] = scaled.subtract(simscore)
print("Movies closest to %s" % movies.loc[d_mov, 'title'])
print("Inclusive of genre similarity scores (Weight = %f):" % weight)
closest_gensim.sort_values('distance').head(10)

Movies closest to Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Inclusive of genre similarity scores (Weight = 0.100000):


Unnamed: 0,distance,title,genres
5816,0.601141,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
40815,0.635152,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
2,0.648839,Jumanji (1995),Adventure|Children|Fantasy
3114,0.654424,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
4262,0.658669,Scarface (1983),Action|Crime|Drama
551,0.668569,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical
49272,0.672,Casino Royale (2006),Action|Adventure|Thriller
3000,0.682637,Princess Mononoke (Mononoke-hime) (1997),Action|Adventure|Animation|Drama|Fantasy
91529,0.688198,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
1127,0.692655,"Abyss, The (1989)",Action|Adventure|Sci-Fi|Thriller


In [20]:
#Genre Overlap Score: Using the genre heatmap, determine a genre overlap score. Average of genre overlaps between the genres of the movie of interest and recommended movies across all genres in both movies

#Generate genre edge graph/heatmap

#pd.read_csv replaces DataFrame.from_csv, which was deprecated and later removed from pandas.
movies2 = pd.read_csv('Full/movies.csv',index_col='movieId')

#Form full, sorted list of genres found in the catalogue, leaving out the two labels that are not compared.
#The list is derived from the data instead of stopping at a hard-coded count of 20.
excluded = {'(no genres listed)', 'IMAX'}
genres = sorted({g for text in movies2.genres for g in text.split('|')} - excluded)

#One 0/1 column per genre telling whether each movie's genre string mentions it (plain substring match)
membership = np.column_stack([movies2.genres.str.contains(g, regex=False).values for g in genres]).astype(int)

#counts[a, b] = number of movies that belong to both genre a and genre b; the diagonal is the size of each genre
counts = pd.DataFrame(np.dot(membership.T, membership), index = genres, columns = genres)
counts.index.name = 'Genre1'
counts.columns.name = 'Genre2'

#Each column's maximum is its diagonal entry, i.e. the total number of movies in that genre.
#After the transpose, row g holds the share of g's movies that also belong to each other genre.
edge = (counts/counts.max()).round(2).T
edge

Genre1,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
Action,1.0,0.26,0.06,0.03,0.2,0.21,0.0,0.34,0.09,0.0,0.08,0.01,0.04,0.07,0.18,0.32,0.08,0.04
Adventure,0.42,1.0,0.16,0.21,0.25,0.06,0.02,0.29,0.21,0.0,0.04,0.03,0.03,0.12,0.17,0.13,0.07,0.04
Animation,0.19,0.31,1.0,0.46,0.32,0.03,0.01,0.12,0.23,0.0,0.03,0.09,0.03,0.06,0.14,0.03,0.02,0.01
Children,0.09,0.36,0.39,1.0,0.43,0.02,0.01,0.21,0.25,0.0,0.01,0.09,0.03,0.07,0.07,0.01,0.0,0.01
Comedy,0.09,0.07,0.04,0.07,1.0,0.08,0.01,0.29,0.05,0.0,0.05,0.05,0.03,0.22,0.04,0.04,0.01,0.01
Crime,0.27,0.05,0.01,0.01,0.22,1.0,0.01,0.58,0.01,0.05,0.05,0.01,0.16,0.08,0.02,0.43,0.01,0.01
Documentary,0.01,0.02,0.01,0.0,0.05,0.02,1.0,0.05,0.0,0.0,0.01,0.03,0.0,0.0,0.0,0.01,0.03,0.0
Drama,0.09,0.05,0.01,0.02,0.19,0.13,0.01,1.0,0.03,0.02,0.04,0.02,0.05,0.19,0.03,0.14,0.06,0.01
Fantasy,0.22,0.34,0.19,0.24,0.33,0.02,0.0,0.28,1.0,0.0,0.14,0.07,0.08,0.17,0.14,0.09,0.01,0.0
Film-Noir,0.04,0.01,0.01,0.0,0.03,0.55,0.0,0.72,0.01,1.0,0.02,0.01,0.2,0.1,0.02,0.37,0.01,0.0


In [68]:
weight = 0.25 #For fine tuning results to user preference

#Genres of the movie of interest, with IMAX left out of the comparison
gens = movies.loc[d_mov, 'genres'].split('|')
if 'IMAX' in gens:
    gens.remove('IMAX')

closest_gensim2 = d.loc[d_mov] #Distances from d_mov to every other movie, only contains movieId
closest_gensim2 = pd.merge(closest_gensim2,movies,left_index = True, right_index = True) #Add title to make the recommendation clearer

#Determine weighted genre overlap score: mean heatmap value over every
#(genre of the movie of interest, genre of the candidate) pair, times the weight
simscore = []
for candidate_genres in closest_gensim2['genres']:
    shared = [g for g in edge.columns if g in candidate_genres] #Heatmap genres named in the candidate's genre string
    n_edge = len(gens)*len(shared) #number of edge values counted
    if n_edge == 0: #No comparable genre on one side: no bonus, instead of dividing by zero
        simscore.append(0.0)
        continue
    sum_edge = float(edge.loc[gens, shared].values.sum()) #sum of edge values
    simscore.append(sum_edge/n_edge*weight)


closest_gensim2['distance'] = closest_gensim2['distance']/closest_gensim2['distance'].max() #reduces range of existing distances to 0 to 1 for easier understanding of weight applied to genre overlap score
closest_gensim2['distance'] = closest_gensim2['distance'].subtract(simscore)
print("Movies closest to %s" %movies.loc[d_mov].title)
print("Inclusive of the genre overlap score (Weight = %f):" %weight)
closest_gensim2.sort_values('distance').head(10)

Movies closest to Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Inclusive of the genre overlap score (Weight = 0.250000):


Unnamed: 0,distance,title,genres
5816,0.536141,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
40815,0.607652,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
4262,0.613946,Scarface (1983),Action|Crime|Drama
2,0.620783,Jumanji (1995),Adventure|Children|Fantasy
49272,0.631444,Casino Royale (2006),Action|Adventure|Thriller
3421,0.641307,Animal House (1978),Comedy
30707,0.642496,Million Dollar Baby (2004),Drama
3114,0.648424,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
91529,0.651253,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
2174,0.651671,Beetlejuice (1988),Comedy|Fantasy


In [78]:
#Release Year Penalty: Adding the (difference in release year)*weight to distance will more heavily penalize movies with drastically different release years
weight = 0.005

#The release year is read as the first four characters after the last "(" of the title
rel_yr = int(movies.loc[d_mov, 'title'].rpartition('(')[2][:4])

#Distances from d_mov to every other movie, with title and genres attached
closest_yr = pd.merge(d.loc[d_mov], movies, left_index=True, right_index=True)

#Weighted absolute gap in release year between each candidate and the movie of interest
yr_dif = [
    abs(rel_yr - int(title.rpartition('(')[2][:4]))*weight
    for title in closest_yr['title']
]

#Rescale distances so the farthest movie sits at 1, then push movies from other eras further away
scaled = closest_yr['distance']/closest_yr['distance'].max()
closest_yr['distance'] = scaled.add(yr_dif)
print("Movies closest to %s" % movies.loc[d_mov, 'title'])
print("Inclusive of the release year penalty (Weight =%f):" % weight)
closest_yr.sort_values('distance').head(10)

Movies closest to Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Inclusive of the release year penalty (Weight =0.005000):


Unnamed: 0,distance,title,genres
5816,0.672807,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
40815,0.721819,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
30707,0.722496,Million Dollar Baby (2004),Drama
3481,0.722917,High Fidelity (2000),Comedy|Drama|Romance
49272,0.730333,Casino Royale (2006),Action|Adventure|Thriller
5989,0.732031,Catch Me If You Can (2002),Crime|Drama
36529,0.734837,Lord of War (2005),Action|Crime|Drama|Thriller|War
1617,0.735301,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
55765,0.735663,American Gangster (2007),Crime|Drama|Thriller
3977,0.73738,Charlie's Angels (2000),Action|Comedy


In [93]:
#Combined ranking: normalized distance, minus the weighted genre overlap score, plus the weighted release year penalty
weight_overlap = 0.3
weight_yr = 0.01

#Genres of the movie of interest, with IMAX left out of the comparison
gens = movies.loc[d_mov, 'genres'].split('|')
if 'IMAX' in gens:
    gens.remove('IMAX')

closest_combo = d.loc[d_mov] #Distances from d_mov to every other movie, only contains movieId
closest_combo = pd.merge(closest_combo,movies,left_index = True, right_index = True) #Add title to make the recommendation clearer

#Determine weighted genre overlap score: mean heatmap value over every
#(genre of the movie of interest, genre of the candidate) pair, times the weight
simscore = []
for candidate_genres in closest_combo['genres']:
    shared = [g for g in edge.columns if g in candidate_genres] #Heatmap genres named in the candidate's genre string
    n_edge = len(gens)*len(shared) #number of edge values counted
    if n_edge == 0: #No comparable genre on one side: no bonus, instead of dividing by zero
        simscore.append(0.0)
        continue
    sum_edge = float(edge.loc[gens, shared].values.sum()) #sum of edge values
    simscore.append(sum_edge/n_edge*weight_overlap)

#The release year is read as the first four characters after the last "(" of the title
rel_yr = int(movies.loc[d_mov, 'title'].rpartition('(')[2][:4])

#Weighted absolute gap in release year between each candidate and the movie of interest
yr_dif = [
    abs(rel_yr - int(title.rpartition('(')[2][:4]))*weight_yr
    for title in closest_combo['title']
]


closest_combo['distance'] = closest_combo['distance']/closest_combo['distance'].max() #reduces range of existing distances to 0 to 1 for easier understanding of the weights applied to the two scores
closest_combo['distance'] = closest_combo['distance'].subtract(simscore)
closest_combo['distance'] = closest_combo['distance'].add(yr_dif)
print("Movies closest to %s" %movies.loc[d_mov].title)
print("Inclusive of the genre overlap score (Weight = %f) and release year penalty (Weight = %f):" %(weight_overlap, weight_yr))
closest_combo.sort_values('distance').head(10)

Movies closest to Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Inclusive of the genre overlap score (Weight = 0.300000) and release year penalty (Weight = 0.010000):


Unnamed: 0,distance,title,genres
5816,0.519807,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
40815,0.628819,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
3114,0.647224,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2,0.655172,Jumanji (1995),Adventure|Children|Fantasy
3977,0.65538,Charlie's Angels (2000),Action|Comedy
3481,0.65625,High Fidelity (2000),Comedy|Drama|Romance
3052,0.657401,Dogma (1999),Adventure|Comedy|Fantasy
30707,0.659496,Million Dollar Baby (2004),Drama
49272,0.666667,Casino Royale (2006),Action|Adventure|Thriller
8368,0.67599,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
