In [2]:
import pandas as pd
import numpy as np
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
import ast
from scipy.sparse import hstack

In [3]:
movies = pd.read_csv("../../Data/serendipity-sac2018/movies.csv", error_bad_lines=False)
imdbData = pd.read_csv("../Intermediate_data/IMDb_cldata.csv")
imdbData.drop("Unnamed: 0", axis=1, inplace=True)
movies.head()

b'Skipping line 19833: expected 8 fields, saw 10\nSkipping line 34143: expected 8 fields, saw 9\nSkipping line 36015: expected 8 fields, saw 10\nSkipping line 37260: expected 8 fields, saw 12\nSkipping line 44379: expected 8 fields, saw 10\nSkipping line 47551: expected 8 fields, saw 10\n'


Unnamed: 0,movieId,title,releaseDate,directedBy,starring,imdbId,tmdbId,genres
0,1,Toy Story (1995),1995-11-19,John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",114709,862.0,"Adventure,Animation,Children,Comedy,Fantasy"
1,2,Jumanji (1995),1995-12-15,Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",113497,8844.0,"Adventure,Children,Fantasy"
2,3,Grumpier Old Men (1995),1995-01-01,Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",113228,15602.0,"Comedy,Romance"
3,4,Waiting to Exhale (1995),1996-01-15,Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",114885,31357.0,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II (1995),1995-12-08,Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",113041,11862.0,Comedy


In [4]:
allMovies = movies[["movieId", "imdbId", "title"]].merge(imdbData, on = "imdbId", how = "left")
allMovies.head()

Unnamed: 0,movieId,imdbId,title,plot,rating,runtimes,votes,country codes_us,country codes_gb,country codes_fr,...,certificates_G,certificates_NR,certificates_PG,certificates_R,kind_movie,kind_series,year_1800s,year_1900s,year_1950s,year_2000s
0,1,114709,Toy Story (1995),"[u""A cowboy doll is profoundly threatened and ...",8.3,81.0,837115.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2,113497,Jumanji (1995),[u'When two kids find and play a magical board...,7.0,104.0,287007.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,3,113228,Grumpier Old Men (1995),"[u""John and Max resolve to save their beloved ...",6.7,101.0,23107.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,4,114885,Waiting to Exhale (1995),"[u""Based on Terry McMillan's novel, this film ...",5.9,124.0,9034.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,5,113041,Father of the Bride Part II (1995),[u'George Banks must deal not only with the pr...,6.0,106.0,32568.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# allMovies.columns = [col.replace(" ","_") for col in allMovies.columns]
# allMovies.columns

In [6]:
train = pd.read_csv("../Intermediate_data/train_subset.csv")
train.drop("Unnamed: 0", axis=1, inplace=True)
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,year
0,127137,115824,4.5,2017-12-31 23:59:50,2017
1,142748,2762,5.0,2017-12-31 23:59:41,2017
2,196198,1198,3.5,2017-12-31 23:59:17,2017
3,142748,71057,4.5,2017-12-31 23:58:09,2017
4,142748,152077,5.0,2017-12-31 23:57:32,2017


In [7]:
userCluster = pd.read_csv("../Intermediate_data/user_clusters.csv")
userCluster.head()

Unnamed: 0.1,Unnamed: 0,cluster,userId
0,0,3,100032
1,1,3,100036
2,2,3,100053
3,3,2,100057
4,4,3,100058


In [8]:
train = train.merge(userCluster[["userId", "cluster"]], on = "userId", how = "left")
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,year,cluster
0,127137,115824,4.5,2017-12-31 23:59:50,2017,1.0
1,142748,2762,5.0,2017-12-31 23:59:41,2017,1.0
2,196198,1198,3.5,2017-12-31 23:59:17,2017,2.0
3,142748,71057,4.5,2017-12-31 23:58:09,2017,1.0
4,142748,152077,5.0,2017-12-31 23:57:32,2017,1.0


In [9]:
featureImportances = pd.read_csv("../Intermediate_data/featureImportance.csv")

In [10]:
colsToScale = ["runtimes", "votes"]

scaler = MinMaxScaler()
allMovies[colsToScale] = scaler.fit_transform(allMovies[colsToScale])


In [11]:
def getSimilarityScore(movie1, movie2, similarFeatures, dissimilarFeatures):

    d1 = np.linalg.norm(movie1[:len(similarFeatures)] - movie2[:len(similarFeatures)], ord=1)
    d2 = np.linalg.norm(movie1[len(similarFeatures):] - movie2[len(similarFeatures):], ord=1)
    
    ## To avoid division by 0
    d2 += 1e-3
    similarityScore = (d1 + (1/d2))
    return similarityScore

def setupSimilarityMatirx(likedMovies, allMovies, similarFeatures, dissimilarFeatures):
    similarityMatrix = np.zeros(shape=(len(likedMovies), len(allMovies)))
    data1 = np.array(likedMovies[np.append(similarFeatures, dissimilarFeatures)])
    data2 = np.array(allMovies[np.append(similarFeatures, dissimilarFeatures)])
    for i in range(len(likedMovies)):
        if i%20 == 0:
            print(str(i) + "/" + str(len(likedMovies)))
        similarityScores = []
        for j in range(len(allMovies)):
            similarityScores.append(getSimilarityScore(data1[i,:], data2[j,:], 
                                                       similarFeatures, dissimilarFeatures))                
        similarityMatrix[i,:] = similarityScores
    return similarityMatrix

def getRelevantMovies(likedMovies, similarityMatrix, top_n = 10):
    indices = similarityMatrix.ravel().argsort()
    indices = indices[-top_n:]%similarityMatrix.shape[1]
    return np.unique(indices)

def getLikedMovies(userMovieRatings, allMovies, ratingThreshold = 3.5, numMoviesToReturn = 10):
    ## Returning top 10 liked movies. These 10 movies will be used to compute the similarity scores.
    
    likedMovieIds = userMovieRatings.loc[userMovieRatings["rating"] >= ratingThreshold, "movieId"]
    likedMovies = allMovies[allMovies["movieId"].apply(lambda x: x in np.unique(likedMovieIds))]
    return likedMovies.head(numMoviesToReturn)

def getPredictions(userMovieRatings, allMovies, similarFeatures, dissimilarFeatures):
#     similarFeatures = ["runtimes", "votes"]
#     dissimilarFeatures = ["country_codes_us", "country_codes_gb", "country_codes_fr", "country_codes_ca"]
    likedMovies = getLikedMovies(userMovieRatings, allMovies)
    similarityMatrix = setupSimilarityMatirx(likedMovies, allMovies, 
                                             similarFeatures, dissimilarFeatures)
    predictionIndices = getRelevantMovies(likedMovies, similarityMatrix)
    
    return allMovies.loc[predictionIndices,:], likedMovies

def getImportantFeatures(userCluster, featureImportances, numSimilarFeatures = 10):
    featureImportances = featureImportances[featureImportances["cluster"] == userCluster]
    similarFeatures = np.array(featureImportances["feature"])[:numSimilarFeatures]
    dissimilarFeatures = np.array(featureImportances["feature"])[numSimilarFeatures:]
    return (similarFeatures, dissimilarFeatures)
    
    

In [12]:
user1 = train[train["userId"] == 127137]
user1

Unnamed: 0,userId,movieId,rating,timestamp,year,cluster
0,127137,115824,4.5,2017-12-31 23:59:50,2017,1.0
3699,127137,1267,4.5,2017-12-31 06:38:16,2017,1.0
10012,127137,81257,4.5,2017-12-30 07:28:47,2017,1.0
14921,127137,175981,3.0,2017-12-29 10:06:27,2017,1.0
15582,127137,96728,3.0,2017-12-29 07:32:48,2017,1.0
...,...,...,...,...,...,...
1680102,127137,3168,1.0,2017-02-17 06:21:26,2017,1.0
1690612,127137,6957,4.5,2017-02-15 08:08:12,2017,1.0
1690622,127137,134214,2.5,2017-02-15 08:07:10,2017,1.0
1911990,127137,5004,3.5,2017-01-15 06:37:05,2017,1.0


In [13]:
(similarFeatures, dissimilarFeatures) = getImportantFeatures(user1["cluster"].unique()[0], featureImportances)

In [14]:
predictions, likedMovies = getPredictions(user1,allMovies,similarFeatures, dissimilarFeatures)

0/10


In [15]:
predictions[["movieId", "title"]]

Unnamed: 0,movieId,title
43056,167956,How to Win the US Presidency (2016)
43366,168698,Peng! Du bist tot! (1987)
43373,168716,A Gathering of Cats (2007)
43379,168728,Would-Be Gentleman (1968)
43387,168746,Escape from the Newsroom (2002)
43388,168748,Pearl (2016)
43391,168754,Beautiful Beast (2013)
43406,168790,The Body Farm (2011)
43414,168810,Today (1997)
49167,183335,The Mysterious Castle in the Carpathians (1981)


In [16]:
likedMovies[["movieId", "title"]]

Unnamed: 0,movieId,title
24,25,Leaving Las Vegas (1995)
309,319,Shallow Grave (1994)
737,778,Trainspotting (1996)
842,897,For Whom the Bell Tolls (1943)
858,913,"Maltese Falcon, The (1941)"
881,936,Ninotchka (1939)
939,999,2 Days in the Valley (1996)
974,1035,"Sound of Music, The (1965)"
1101,1175,Delicatessen (1991)
1171,1250,"Bridge on the River Kwai, The (1957)"
