In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score
sns.set()

In [7]:
train_data = pd.read_csv('../../Data/serendipity-sac2018/training.csv')

In [8]:
train_data['timestamp'] = train_data['timestamp'].apply(lambda x: datetime.fromtimestamp(x/1000))
train_data['year'] = train_data['timestamp'].apply(lambda x: x.year)
train_data['month'] = train_data['timestamp'].apply(lambda x: x.month)


In [9]:
train_data = train_data[train_data['year'] > 2015]

In [10]:
movies = pd.read_csv('../../Data/serendipity-sac2018/movies.csv',error_bad_lines=False)

Skipping line 19833: expected 8 fields, saw 10
Skipping line 34143: expected 8 fields, saw 9
Skipping line 36015: expected 8 fields, saw 10
Skipping line 37260: expected 8 fields, saw 12
Skipping line 44379: expected 8 fields, saw 10
Skipping line 47551: expected 8 fields, saw 10



In [11]:
## Remove movies with no genre 
movies = movies[movies['genres'].isna() == False]
movies.reset_index(drop=True, inplace=True)

## Convert string to array of genres
movies['genres'] = movies['genres'].apply(lambda x: x.split(','))

In [12]:
vect = CountVectorizer()
X = vect.fit_transform(movies.genres.str.join(' '))
movies = movies.join(pd.DataFrame(X.toarray(), columns=vect.get_feature_names()))

In [28]:
movies.head()

Unnamed: 0,movieId,title,releaseDate,directedBy,starring,imdbId,tmdbId,genres,action,adventure,...,horror,imax,musical,mystery,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),1995-11-19,John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",114709,862.0,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),1995-12-15,Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",113497,8844.0,"[Adventure, Children, Fantasy]",0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),1995-01-01,Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",113228,15602.0,"[Comedy, Romance]",0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),1996-01-15,Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",114885,31357.0,"[Comedy, Drama, Romance]",0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),1995-12-08,Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",113041,11862.0,[Comedy],0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# movies.drop('movieId', axis = 1)
# movies.columns[:8]
data = movies.drop(movies.columns[:8], axis = 1)
len(data.columns)

21

In [None]:
## Run KModes clustering on [2,10] clusters to identify optimal no. of clusters based on silhouette_score


sil = []
kmax = 10

# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
for k in range(2, kmax+1):
    print("K = " + str(k))
    km = KModes(n_clusters=k, init='Huang', n_init=3, verbose=0)
    clusters = km.fit_predict(data)
    sil.append(silhouette_score(data, clusters, metric = 'euclidean'))

In [None]:
plt.plot(sil)

**Picking 5 clusters**

In [33]:
km = KModes(n_clusters=5, init='Huang', n_init=3, verbose=0)
clusters = km.fit_predict(data)
movies['clusters'] = clusters

In [34]:
train_data = train_data.merge(movies[['movieId','clusters']], on = 'movieId')

In [35]:
## Avg ratings for each cluster per user

avg_rating = train_data[['userId', 'clusters','rating']].groupby(['userId', 'clusters']).agg('mean')
avg_rating

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,clusters,Unnamed: 2_level_1
100032,0,3.752475
100032,1,3.772727
100032,2,3.833333
100032,3,3.857143
100032,4,3.812500
100036,0,3.925000
100036,1,3.727273
100036,2,4.000000
100036,3,4.255319
100036,4,3.854839


In [36]:
UM_matrix = train_data.pivot(index='userId', columns='movieId', values='rating')

In [None]:
# UM_matrix

In [None]:
import math
for userId, row in UM_matrix.iterrows():    
    for movieId in UM_matrix.columns:
        if math.isnan(UM_matrix.loc[userId, movieId]):
            cluster = movies.loc[movies['movieId'] == movieId,'clusters'].values[0]
            try:
                UM_matrix.loc[userId, movieId] = avg_rating.loc[userId, cluster].values[0]
            except TypeError:
                ## If a user has not rated a movie in that cluster, assign user's average rating across all clusters.
                UM_matrix.loc[userId, movieId] =avg_rating.loc[userId, :].agg('mean').values[0]
            except:
                print "Unknown error for " + str(userId) + ";" + str(movieId)
            

## Probably not the most efficient way to do this!!