In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
import ast
from scipy.sparse import hstack

# Load the genre-filtered movie table and the cleaned IMDb feature table.
# (An earlier revision read ../../Data/serendipity-sac2018/movies.csv instead.)
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped with a warning.
movies = pd.read_csv("../Intermediate_data/filtered_movies_genre.csv", on_bad_lines="warn")
# Both CSVs carry the index written by a previous `to_csv` call as "Unnamed: 0"; drop it.
movies = movies.drop(columns=["Unnamed: 0"])
imdbData = pd.read_csv("../Intermediate_data/IMDb_cldata.csv")
imdbData = imdbData.drop(columns=["Unnamed: 0"])
print(len(movies))
movies.head()


22351


Unnamed: 0,movieId,title,releaseDate,directedBy,starring,imdbId,tmdbId,genres,Action,Adventure,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,nan
0,1626,Fire Down Below (1997),1997-09-05,Félix Enríquez Alcalá,"Steven Seagal, Marg Helgenberger, Stephen Lang...",119123,14289.0,"['Action', 'Drama', 'Thriller']",1,0,...,0,0,0,0,0,0,1,0,0,0
1,106491,47 Ronin (2013),2013-12-25,Carl Rinsch,"Keanu Reeves, Hiroyuki Sanada, Kô Shibasaki, R...",1335975,64686.0,"['Action', 'Adventure', 'Fantasy']",1,1,...,0,0,0,0,0,0,0,0,0,0
2,85414,Source Code (2011),2011-04-01,Duncan Jones,"Jake Gyllenhaal, Michelle Monaghan, Vera Farmi...",945513,45612.0,"['Action', 'Drama', 'Mystery', 'Sci-Fi', 'Thri...",1,0,...,0,0,0,1,0,1,1,0,0,0
3,124805,Venus & Vegas (2010),2010-11-01,Demian Lichtenstein,"Eddie Kaye Thomas,Donald Faison,Eddie Guerra,J...",423474,68716.0,"['Action', 'Comedy']",1,0,...,0,0,0,0,0,0,0,0,0,0
4,144514,Quick (2011),2011-07-21,Beom-gu Cho,"Yoo Seung-Mok,Yoon Je-moon,Kim Tae-woo,Lee Min...",2007413,77175.0,"['Action', 'Comedy', 'Thriller']",1,0,...,0,0,0,0,0,0,1,0,0,0


In [2]:
# Attach the IMDb feature columns to every movie. The left join keeps each row
# of `movies` even when `imdbData` has no entry for its imdbId.
allMovies = pd.merge(
    movies[["movieId", "imdbId", "title"]],
    imdbData,
    how="left",
    on="imdbId",
)
allMovies.head()

Unnamed: 0,movieId,imdbId,title,plot,rating,runtimes,votes,country codes_us,country codes_gb,country codes_fr,...,certificates_G,certificates_NR,certificates_PG,certificates_R,kind_movie,kind_series,year_1800s,year_1900s,year_1950s,year_2000s
0,1626,119123,Fire Down Below (1997),"[u'Going undercover in rural Kentucky, an envi...",5.1,105.0,16046.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,106491,1335975,47 Ronin (2013),[u'A band of samurai set out to avenge the dea...,6.2,128.0,140571.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,85414,945513,Source Code (2011),"[u""A soldier wakes up in someone else's body a...",7.5,93.0,465374.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,124805,423474,Venus & Vegas (2010),[u'When three Vegas buddies attempt the score ...,4.0,95.0,485.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,144514,2007413,Quick (2011),[u'A famous biker named Gi-su (Lee Min-ki) has...,5.8,115.0,952.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [3]:
# Load the training ratings and discard the index column saved with the CSV.
train = pd.read_csv("../Intermediate_data/train_subset.csv").drop(columns=["Unnamed: 0"])
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,year
0,127137,115824,4.5,2017-12-31 23:59:50,2017
1,142748,2762,5.0,2017-12-31 23:59:41,2017
2,196198,1198,3.5,2017-12-31 23:59:17,2017
3,142748,71057,4.5,2017-12-31 23:58:09,2017
4,142748,152077,5.0,2017-12-31 23:57:32,2017


In [4]:
userCluster = pd.read_csv("../Intermediate_data/user_clusters.csv")

# Tag every rating with its user's cluster. With a left join, users missing
# from `userCluster` keep their rows and get a NaN cluster.
# NOTE: `train` is rebuilt from itself, so re-running this cell without
# re-running the load cell merges the cluster column in a second time.
train = pd.merge(
    train,
    userCluster[["userId", "cluster"]],
    how="left",
    on="userId",
)
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,year,cluster
0,127137,115824,4.5,2017-12-31 23:59:50,2017,1.0
1,142748,2762,5.0,2017-12-31 23:59:41,2017,1.0
2,196198,1198,3.5,2017-12-31 23:59:17,2017,2.0
3,142748,71057,4.5,2017-12-31 23:58:09,2017,1.0
4,142748,152077,5.0,2017-12-31 23:57:32,2017,1.0


In [5]:
featureImportances = pd.read_csv("../Intermediate_data/featureImportance.csv")

# Min-max scale the two columns listed below (scikit-learn's default range),
# presumably so they do not dominate the L1 distances used further down.
colsToScale = ["runtimes", "votes"]
scaler = MinMaxScaler()
scaler.fit(allMovies[colsToScale])
allMovies[colsToScale] = scaler.transform(allMovies[colsToScale])

In [9]:
def getSimilarityScore(movie1, movie2, similarFeatures, dissimilarFeatures):
    """Score how well ``movie2`` matches ``movie1``.

    Both vectors are laid out as ``[similar features..., dissimilar features...]``.
    The score is ``d1 + 1/d2`` where ``d1`` is the L1 distance over the leading
    ``len(similarFeatures)`` entries and ``d2`` is the L1 distance over the
    remaining entries. It is therefore small when the movies agree on the first
    group and differ on the second; ``getRelevantMovies`` in this notebook
    selects the smallest scores.

    Parameters
    ----------
    movie1, movie2 : array-like supporting slicing and element-wise subtraction
    similarFeatures : sized sequence; only its length is used (split position)
    dissimilarFeatures : unused, kept for signature compatibility

    Returns
    -------
    float-like score.
    """
    split = len(similarFeatures)
    delta = movie1 - movie2

    closeness = np.linalg.norm(delta[:split], ord=1)
    # The small constant keeps the reciprocal finite when the dissimilar parts coincide.
    spread = np.linalg.norm(delta[split:], ord=1) + 1e-3

    return closeness + (1/spread)

def setupSimilarityMatirx(likedMovies, allMovies, similarFeatures, dissimilarFeatures):
    """Build the (liked movie x candidate movie) score matrix.

    Entry ``[i, j]`` is the same quantity ``getSimilarityScore`` computes for
    liked movie ``i`` and candidate ``j``: the L1 distance over the similar
    features plus the reciprocal of (L1 distance over the dissimilar features
    + 1e-3). Lower values are better matches.

    The computation is vectorised over all candidates, so only one numpy pass
    per liked movie is needed instead of one Python call per pair.

    Parameters
    ----------
    likedMovies : DataFrame containing every column named in both feature lists
    allMovies : DataFrame containing the same feature columns
    similarFeatures, dissimilarFeatures : array-likes of column names

    Returns
    -------
    numpy.ndarray of shape ``(len(likedMovies), len(allMovies))``.
    """
    featureColumns = np.append(similarFeatures, dissimilarFeatures)
    numSimilar = len(similarFeatures)

    likedData = np.asarray(likedMovies[featureColumns], dtype=float)
    candidateData = np.asarray(allMovies[featureColumns], dtype=float)

    similarityMatrix = np.zeros(shape=(len(likedMovies), len(allMovies)))
    for i, likedRow in enumerate(likedData):
        # Broadcast one liked movie against every candidate at once.
        absDiff = np.abs(candidateData - likedRow)
        d1 = absDiff[:, :numSimilar].sum(axis=1)
        # 1e-3 avoids division by zero when the dissimilar features coincide.
        d2 = absDiff[:, numSimilar:].sum(axis=1) + 1e-3
        similarityMatrix[i, :] = d1 + 1.0 / d2
    return similarityMatrix

def getRelevantMovies(likedMovies, similarityMatrix, top_n = 10):
    """Return the candidate-movie column positions behind the lowest scores.

    The matrix is flattened row-major and its ``top_n`` smallest entries are
    located. Each flat position is mapped back to its column, and duplicate
    columns are collapsed, so fewer than ``top_n`` positions may be returned.

    Parameters
    ----------
    likedMovies : unused, kept for signature compatibility
    similarityMatrix : 2-D numpy array (rows: liked movies, columns: candidates)
    top_n : number of smallest matrix entries to consider

    Returns
    -------
    Sorted numpy array of unique column positions.
    """
    numCandidates = similarityMatrix.shape[1]
    flatOrder = np.argsort(similarityMatrix, axis=None)
    bestColumns = np.mod(flatOrder[:top_n], numCandidates)
    return np.unique(bestColumns)

def getLikedMovies(userMovieRatings, allMovies, ratingThreshold = 3.5, numMoviesToReturn = 20):
    """Return the rows of ``allMovies`` that the user rated at or above a threshold.

    At most ``numMoviesToReturn`` rows are returned. They are the first matches
    in ``allMovies`` row order, not ranked by rating. These movies are the
    anchors used to compute the similarity scores.

    Parameters
    ----------
    userMovieRatings : DataFrame with "movieId" and "rating" columns (one user)
    allMovies : DataFrame with a "movieId" column
    ratingThreshold : minimum rating (inclusive) for a movie to count as liked
    numMoviesToReturn : cap on the number of liked movies returned

    Returns
    -------
    DataFrame: a subset of ``allMovies`` rows, original index preserved.
    """
    likedMovieIds = userMovieRatings.loc[userMovieRatings["rating"] >= ratingThreshold, "movieId"]
    # `isin` is a single vectorised membership test; the previous per-row lambda
    # recomputed np.unique(likedMovieIds) for every movie.
    likedMovies = allMovies[allMovies["movieId"].isin(likedMovieIds)]
    return likedMovies.head(numMoviesToReturn)

def getPredictions(userMovieRatings, allMovies, similarFeatures, dissimilarFeatures):
    """Recommend movies for one user from the movies that user liked.

    Steps: pick the user's liked movies, score every movie in ``allMovies``
    against each of them, and keep the candidates with the lowest scores.

    Example feature lists:
        similarFeatures = [runtimes, votes]
        dissimilarFeatures = [country_codes_us, country_codes_gb, country_codes_fr, country_codes_ca]

    Parameters
    ----------
    userMovieRatings : DataFrame of a single user's ratings
    allMovies : DataFrame of candidate movies with the feature columns
    similarFeatures, dissimilarFeatures : array-likes of column names

    Returns
    -------
    (predictions, likedMovies) : the selected rows of ``allMovies`` and the
    liked movies they were derived from.
    """
    likedMovies = getLikedMovies(userMovieRatings, allMovies)
    similarityMatrix = setupSimilarityMatirx(likedMovies, allMovies,
     similarFeatures, dissimilarFeatures)
    predictionIndices = getRelevantMovies(likedMovies, similarityMatrix)

    # The indices are column positions of the similarity matrix, i.e. row
    # positions in `allMovies`, so they must be resolved positionally. `.loc`
    # only gave the right rows while `allMovies` had a default 0..n-1 index.
    return allMovies.iloc[predictionIndices, :], likedMovies

def getImportantFeatures(userCluster, featureImportances, numSimilarFeatures = 13):
    """Split one cluster's feature list into "similar" and "dissimilar" groups.

    The rows of ``featureImportances`` whose "cluster" equals ``userCluster``
    are taken in their existing order. The first ``numSimilarFeatures`` entries
    of their "feature" column form the similar group and the remainder the
    dissimilar group. Presumably the table is already ordered by importance;
    verify this against the code that writes it.

    Returns
    -------
    (similarFeatures, dissimilarFeatures) : tuple of numpy arrays. Both are
    empty when no row matches the cluster.
    """
    inCluster = featureImportances["cluster"] == userCluster
    clusterFeatures = np.array(featureImportances[inCluster]["feature"])
    head = clusterFeatures[:numSimilarFeatures]
    tail = clusterFeatures[numSimilarFeatures:]
    return (head, tail)


In [10]:
# Quick look at the ratings frame (now carrying `cluster`) before the prediction loop.
train.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,year,cluster
0,127137,115824,4.5,2017-12-31 23:59:50,2017,1.0
1,142748,2762,5.0,2017-12-31 23:59:41,2017,1.0
2,196198,1198,3.5,2017-12-31 23:59:17,2017,2.0
3,142748,71057,4.5,2017-12-31 23:58:09,2017,1.0
4,142748,152077,5.0,2017-12-31 23:57:32,2017,1.0


In [None]:
# Generate recommendations for every user in `train`.
#   ratingsPredicted : one array per user holding that user's own ratings for
#                      the recommended movies (NaN where the user never rated one)
#   allPredictions   : all recommended (movieId, title, rating, userId) rows
ratingsPredicted = []
predictionFrames = []
allPredictions = pd.DataFrame()

n_users = len(train["userId"].unique())
# One groupby pass replaces a full boolean scan of `train` per user;
# sort=False keeps users in order of first appearance, like .unique().
for i, (userId, user1) in enumerate(train.groupby("userId", sort=False)):
    if i%1000 == 0:
        print(str(i) + "/" + str(n_users))
    # NOTE(review): `cluster` came from a left merge, so it may be NaN for some
    # users; getImportantFeatures then returns empty feature lists. Confirm intended.
    (similarFeatures, dissimilarFeatures) = getImportantFeatures(user1["cluster"].unique()[0], featureImportances)
    predictions, likedMovies = getPredictions(user1,allMovies,similarFeatures, dissimilarFeatures)
    predictions = predictions[["movieId", "title"]].merge(user1[["movieId", "rating"]], on = "movieId", how = "left")
    predictions["userId"] = userId
    predictionFrames.append(predictions)
    ratingsPredicted.append(np.array(predictions["rating"]))

# Concatenate once at the end: appending to a DataFrame inside the loop copies
# the whole frame each time, and DataFrame.append was removed in pandas 2.0.
if predictionFrames:
    allPredictions = pd.concat(predictionFrames)
    

0/19657
1000/19657
2000/19657
3000/19657


In [None]:
# (rows, columns) of the accumulated recommendations frame.
allPredictions.shape