In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [2]:

genome_scores_data = pd.read_csv('genome_scores_small.csv')
movies_data = pd.read_csv('movies.csv')
ratings_data = pd.read_csv('ratings.csv')


In [3]:
genome_scores_data.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [4]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,date,Unnamed: 5
0,1,1,4.0,964982703,30-07-2000,
1,1,3,4.0,964981247,30-07-2000,
2,1,6,4.0,964982224,30-07-2000,
3,1,47,5.0,964983815,30-07-2000,
4,1,50,5.0,964982931,30-07-2000,


In [6]:
scores_pivot = genome_scores_data.pivot_table(index = ["movieId"],columns = ["tagId"],values = "relevance").reset_index()
scores_pivot.head()

tagId,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
1,2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
2,3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
3,4,0.03725,0.0395,0.03675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
4,5,0.042,0.05275,0.05925,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


In [7]:
#join
mov_tag_df = movies_data.merge(scores_pivot, left_on='movieId', right_on='movieId', how='left')
mov_tag_df = mov_tag_df.fillna(0) 
mov_tag_df = mov_tag_df.drop(['title','genres'], axis = 1)
mov_tag_df.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
1,2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
2,3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
3,4,0.03725,0.0395,0.03675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
4,5,0.042,0.05275,0.05925,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


In [8]:
def set_genres(genres,col):
    if genres in col.split('|'): return 1
    else: return 0

In [9]:
movies_data.head()
mov_genres_df = movies_data

In [10]:
mov_genres_df["Action"] = mov_genres_df.apply(lambda x: set_genres("Action",x['genres']), axis=1)
mov_genres_df["Adventure"] = mov_genres_df.apply(lambda x: set_genres("Adventure",x['genres']), axis=1)
mov_genres_df["Animation"] = mov_genres_df.apply(lambda x: set_genres("Animation",x['genres']), axis=1)
mov_genres_df["Children"] = mov_genres_df.apply(lambda x: set_genres("Children",x['genres']), axis=1)
mov_genres_df["Comedy"] = mov_genres_df.apply(lambda x: set_genres("Comedy",x['genres']), axis=1)
mov_genres_df["Crime"] = mov_genres_df.apply(lambda x: set_genres("Crime",x['genres']), axis=1)
mov_genres_df["Documentary"] = mov_genres_df.apply(lambda x: set_genres("Documentary",x['genres']), axis=1)
mov_genres_df["Drama"] = mov_genres_df.apply(lambda x: set_genres("Drama",x['genres']), axis=1)
mov_genres_df["Fantasy"] = mov_genres_df.apply(lambda x: set_genres("Fantasy",x['genres']), axis=1)
mov_genres_df["Film-Noir"] = mov_genres_df.apply(lambda x: set_genres("Film-Noir",x['genres']), axis=1)
mov_genres_df["Horror"] = mov_genres_df.apply(lambda x: set_genres("Horror",x['genres']), axis=1)
mov_genres_df["Musical"] = mov_genres_df.apply(lambda x: set_genres("Musical",x['genres']), axis=1)
mov_genres_df["Mystery"] = mov_genres_df.apply(lambda x: set_genres("Mystery",x['genres']), axis=1)
mov_genres_df["Romance"] = mov_genres_df.apply(lambda x: set_genres("Romance",x['genres']), axis=1)
mov_genres_df["Sci-Fi"] = mov_genres_df.apply(lambda x: set_genres("Sci-Fi",x['genres']), axis=1)
mov_genres_df["Thriller"] = mov_genres_df.apply(lambda x: set_genres("Thriller",x['genres']), axis=1)
mov_genres_df["War"] = mov_genres_df.apply(lambda x: set_genres("War",x['genres']), axis=1)
mov_genres_df["Western"] = mov_genres_df.apply(lambda x: set_genres("Western",x['genres']), axis=1)
mov_genres_df["(no genres listed)"] = mov_genres_df.apply(lambda x: set_genres("(no genres listed)",x['genres']), axis=1)

In [11]:
mov_genres_df.drop(['title','genres'], axis = 1, inplace=True)
mov_genres_df.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:

def set_year(title):
    year = title.strip()[-5:-1]
    if year.isdigit()== True: return int(year)
    else: return 1800

    movies =  pd.read_csv('movies.csv') 
movies.head()

movies['year'] = movies.apply(lambda x: set_year(x['title']), axis=1)
movies = movies.drop('genres', axis = 1)
movies.head()



Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),1995
1,2,Jumanji (1995),1995
2,3,Grumpier Old Men (1995),1995
3,4,Waiting to Exhale (1995),1995
4,5,Father of the Bride Part II (1995),1995


In [13]:
#define function to group years
def set_year_group(year):
    if (year < 1900): return 0
    elif (1900 <= year <= 1975): return 1
    elif (1976 <= year <= 1995): return 2
    elif (1996 <= year <= 2003): return 3
    elif (2004 <= year <= 2009): return 4
    elif (2010 <= year): return 5
    else: return 0
movies['year_group'] = movies.apply(lambda x: set_year_group(x['year']), axis=1)
#no need title and year fields
movies.drop(['title','year'], axis = 1, inplace=True)

In [14]:
agg_movies_rat = ratings_data.groupby(['movieId']).agg({'rating': [np.size, np.mean]}).reset_index()
agg_movies_rat.columns = ['movieId','rating_counts', 'rating_mean']
agg_movies_rat.head()

Unnamed: 0,movieId,rating_counts,rating_mean
0,1,215.0,3.92093
1,2,110.0,3.431818
2,3,52.0,3.259615
3,4,7.0,2.357143
4,5,49.0,3.071429


In [15]:
def set_rating_group(rating_counts):
    if (rating_counts <= 1): return 0
    elif (2 <= rating_counts <= 10): return 1
    elif (11 <= rating_counts <= 100): return 2
    elif (101 <= rating_counts <= 1000): return 3
    elif (1001 <= rating_counts <= 5000): return 4
    elif (5001 <= rating_counts): return 5
    else: return 0
agg_movies_rat['rating_group'] = agg_movies_rat.apply(lambda x: set_rating_group(x['rating_counts']), axis=1)
#no need rating_counts field
agg_movies_rat.drop('rating_counts', axis = 1, inplace=True)
mov_rating_df = movies.merge(agg_movies_rat, left_on='movieId', right_on='movieId', how='left')
mov_rating_df = mov_rating_df.fillna(0)
mov_rating_df.head()

Unnamed: 0,movieId,year_group,rating_mean,rating_group
0,1,2,3.92093,3.0
1,2,2,3.431818,3.0
2,3,2,3.259615,2.0
3,4,2,2.357143,1.0
4,5,2,3.071429,2.0


In [16]:
mov_tag_df = mov_tag_df.set_index('movieId')
mov_genres_df = mov_genres_df.set_index('movieId')
mov_rating_df = mov_rating_df.set_index('movieId')

In [21]:
mov_tag_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,0.032,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,0.0205,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,0.02675,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
4,0.03725,0.0395,0.03675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,0.03025,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
5,0.042,0.05275,0.05925,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,0.02875,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


In [17]:
#cosine similarity for mov_tag_df
cos_tag = cosine_similarity(mov_tag_df.values)*0.5
#cosine similarity for mov_genres_df
cos_genres = cosine_similarity(mov_genres_df.values)*0.25
#cosine similarity for mov_rating_df
cos_rating = cosine_similarity(mov_rating_df.values)*0.25
#mix
cos = cos_tag+cos_genres+cos_rating

In [18]:
cols = mov_tag_df.index.values
inx = mov_tag_df.index
movies_sim = pd.DataFrame(cos, columns=cols, index=inx)
movies_sim.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.830277,0.67951,0.62569,0.695133,0.561488,0.652209,0.752909,0.517608,0.631803,...,0.294232,0.254226,0.267313,0.300059,0.182429,0.355961,0.376078,0.182429,0.261486,0.318377
2,0.830277,1.0,0.589818,0.552573,0.586534,0.511071,0.565164,0.82438,0.567674,0.661692,...,0.181132,0.174812,0.186109,0.186109,0.181132,0.258278,0.264466,0.181132,0.181132,0.199085
3,0.67951,0.589818,1.0,0.828402,0.856973,0.568878,0.888705,0.606662,0.559938,0.584135,...,0.291585,0.196495,0.333426,0.208426,0.203197,0.296815,0.305259,0.203197,0.203197,0.397479
4,0.62569,0.552573,0.828402,1.0,0.762285,0.540319,0.858629,0.622652,0.534978,0.533659,...,0.302253,0.32734,0.437597,0.233473,0.230085,0.305642,0.313418,0.374422,0.230085,0.375386
5,0.695133,0.586534,0.856973,0.762285,1.0,0.510511,0.817662,0.580336,0.539359,0.545672,...,0.328564,0.197301,0.385168,0.208392,0.203564,0.333392,0.347901,0.203564,0.203564,0.468041


In [22]:
def get_similar(movieId):
    df = movies_sim.loc[movies_sim.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    return df
#create empty df
movies_similarity = pd.DataFrame(columns=['movieId','sim_moveId','relevance'])

In [23]:
for x in movies_sim.index.tolist():
    movies_similarity = movies_similarity.append(get_similar(x))
movies_similarity.head()

Unnamed: 0,movieId,sim_moveId,relevance
3568,1,4886,0.976029
2355,1,3114,0.968869
1706,1,2294,0.939555
1757,1,2355,0.938364
4360,1,6377,0.932952


## Users dataset

In [20]:
users_df = pd.DataFrame(ratings_data['userId'].unique(), columns=['userId'])
users_df.head()

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5


### Movies

In [21]:
movies_data.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
movies =  pd.read_csv('movies.csv') 
movies_df = movies.drop('genres', axis = 1)
#mean of ratings for each movies
agg_rating_avg = ratings_data.groupby(['movieId']).agg({'rating': np.mean}).reset_index()
agg_rating_avg.columns = ['movieId', 'rating_mean']

movies_df = movies_df.merge(agg_rating_avg, left_on='movieId', right_on='movieId', how='left')
movies_df.head()

Unnamed: 0,movieId,title,rating_mean
0,1,Toy Story (1995),3.92093
1,2,Jumanji (1995),3.431818
2,3,Grumpier Old Men (1995),3.259615
3,4,Waiting to Exhale (1995),2.357143
4,5,Father of the Bride Part II (1995),3.071429


### genres

In [23]:
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)"]
genres_df = pd.DataFrame(genres, columns=['genres'])
genres_df.head()

Unnamed: 0,genres
0,Action
1,Adventure
2,Animation
3,Children
4,Comedy


### users and movies

In [62]:
users_movies_df = ratings_data
users_movies_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### movies and genres

In [25]:
movies_genres_df = movies.drop('title', axis = 1)

In [26]:
def get_movie_genres(movieId):
    movie = movies_genres_df[movies_genres_df['movieId']==movieId]
    genres = movie['genres'].tolist()
    df = pd.DataFrame([b for a in [i.split('|') for i in genres] for b in a], columns=['genres'])
    df.insert(loc=0, column='movieId', value=movieId)
    return df

In [27]:
#create empty df
movies_genres=pd.DataFrame(columns=['movieId','genres'])
for x in movies_genres_df['movieId'].tolist():
    movies_genres=movies_genres.append(get_movie_genres(x))
movies_genres.head()

Unnamed: 0,movieId,genres
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy


### users and genres

In [37]:
#join to movies data to get genre information
user_genres_df = ratings_data.merge(movies, left_on='movieId', right_on='movieId', how='left')
#drop columns that will not be used
user_genres_df.drop(['movieId','rating','timestamp','title'], axis = 1, inplace=True)
user_genres_df.head()

Unnamed: 0,userId,genres
0,1,Adventure|Animation|Children|Comedy|Fantasy
1,1,Comedy|Romance
2,1,Action|Crime|Thriller
3,1,Mystery|Thriller
4,1,Crime|Mystery|Thriller


In [38]:
user_genres_df.loc[user_genres_df['userId'] == 2]

Unnamed: 0,userId,genres
232,2,Crime|Drama
233,2,Comedy
234,2,Drama|Romance
235,2,Action|Adventure|Drama
236,2,Action|Crime|Thriller
237,2,Action|Crime|Drama|Thriller
238,2,Action|Comedy
239,2,Crime|Drama|Thriller
240,2,Action|Crime|Drama|IMAX
241,2,Comedy


In [39]:
movies_df.loc[movies_df['movieId'] == 131724]

Unnamed: 0,movieId,title,rating_mean
8828,131724,The Jinx: The Life and Deaths of Robert Durst ...,5.0


In [40]:
def get_favorite_genre(userId):
    user = user_genres_df[user_genres_df['userId']==userId]
    genres = user['genres'].tolist()
    print(genres, userId)
    movie_list = [b for a in [i.split('|') for i in genres] for b in a]
    counter = Counter(movie_list)
    return counter.most_common(1)[0][0]

In [41]:
#create empty df
users_genres = pd.DataFrame(columns=['userId','genre'])
for x in users_df['userId'].tolist():
    print(x)
    users_genres = users_genres.append(pd.DataFrame([[x,get_favorite_genre(x)]], columns=['userId','genre']))
users_genres.head()



Unnamed: 0,userId,genre
0,1,Action
0,2,Drama
0,3,Drama
0,4,Drama
0,5,Drama


In [55]:
users_df.to_csv("output/users.csv", sep=',', header=True, index=False)

In [56]:
movies_df.to_csv("output/movies.csv", sep=',', header=True, index=False)

In [57]:
genres_df.to_csv("output/genres.csv", sep=',', header=True, index=False)

In [63]:
users_movies_df.to_csv("output/users_movies.csv", sep=',', header=True, index=False)

In [59]:
movies_genres.to_csv("output/movies_genres.csv", sep=',', header=True, index=False)

In [60]:
movies_similarity.to_csv("output/movies_similarity.csv", sep=',', header=True, index=False)

In [61]:
users_genres.to_csv("output/users_genres.csv", sep=',', header=True, index=False)