In [68]:
import pandas as pd
import matplotlib as plt
import numpy as np

In [119]:
# Read the four MovieLens CSV files: movie metadata (first column used as the index),
# user ratings, tag-genome tag names and tag-genome relevance scores.
movies_df = pd.read_csv('./ml-latest/movies.csv', index_col=0)
user_rate_df = pd.read_csv('./ml-latest/ratings.csv')
tag_relevance_df = pd.read_csv('./ml-latest/genome-scores.csv')
tags_df = pd.read_csv('./ml-latest/genome-tags.csv')

# Check data & Preprocess

In [120]:
# (rows, columns) of each loaded table: ratings, movies, tags, tag relevance.
tuple(table.shape for table in (user_rate_df, movies_df, tags_df, tag_relevance_df))

((27753444, 4), (58098, 2), (1128, 2), (14862528, 3))

## Ratings.csv

In [121]:
# Preview the raw ratings table (columns: userId, movieId, rating, timestamp).
user_rate_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264
...,...,...,...,...
27753439,283228,8542,4.5,1379882795
27753440,283228,8712,4.5,1379882751
27753441,283228,34405,4.5,1379882889
27753442,283228,44761,4.5,1354159524


In [122]:
# Drop the timestamp column and keep only ratings for movies that have
# tag-genome relevance scores.
# user_rate_df is reassigned from itself, so this cell must tolerate being run
# twice: errors='ignore' stops the second run from raising KeyError once the
# timestamp column is already gone.
scored_movie_ids = tag_relevance_df['movieId'].unique()  # dedupe before the membership test

user_rate_df = (
    user_rate_df
    .drop(columns='timestamp', errors='ignore')
    .loc[lambda ratings: ratings['movieId'].isin(scored_movie_ids)]
)

user_rate_df

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5
...,...,...,...
27753439,283228,8542,4.5
27753440,283228,8712,4.5
27753441,283228,34405,4.5
27753442,283228,44761,4.5


## Movies.csv

In [123]:
# Preview the movie metadata (index: movieId; columns: title, genres).
movies_df

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193876,The Great Glinka (1946),(no genres listed)
193878,Les tribulations d'une caissière (2011),Comedy
193880,Her Name Was Mumu (2016),Drama
193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi


In [124]:
# Turn the pipe-separated genres string into one 0/1 column per genre,
# discard the IMAX column, then remove movies flagged "(no genres listed)"
# and drop that marker column too.
movies_genre_df = (
    movies_df['genres']
    .str.get_dummies(sep='|')
    .drop(columns='IMAX')
    .loc[lambda flags: flags['(no genres listed)'] == 0]
    .drop(columns='(no genres listed)')
)

movies_genre_df


Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193874,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
193878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
193880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193882,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0


## Tags.csv

In [126]:
# Preview the tag-genome tag names (columns: tagId, tag).
tags_df

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


## Tag Relevance.csv

In [127]:
# Genre column name -> genome tagId used as that genre's relevance signal.
# NOTE(review): the ids are hard-coded; presumably each is the genome tag with
# the same name as the genre -- verify against tags_df.
tag_genre_dict = {'Action': 19, 'Adventure': 29, 'Animation': 64, 'Children': 204, 'Comedy': 230, 'Crime': 268, 'Documentary': 315, 'Drama': 323, 'Fantasy': 377, 'Film-Noir': 393, 'Horror': 522, 'Musical': 686, 'Mystery': 689, 'Romance': 863, 'Sci-Fi': 887, 'Thriller': 1025, 'War': 1096, 'Western': 1107}

# Keep only the relevance rows for the genre tags. isin() is given a flat list
# of ids (its documented input) instead of a 2-D array taken from a throwaway
# one-row DataFrame.
genre_tag_ids = list(tag_genre_dict.values())
tag_rel_small_df = tag_relevance_df[tag_relevance_df['tagId'].isin(genre_tag_ids)]

tag_rel_small_df

Unnamed: 0,movieId,tagId,relevance
18,1,19,0.66825
28,1,29,0.90700
63,1,64,0.98875
203,1,204,0.95450
229,1,230,0.59475
...,...,...,...
14862262,187595,863,0.26750
14862286,187595,887,0.12850
14862424,187595,1025,0.40150
14862495,187595,1096,0.63700


# Check User

## Choose the User

In [144]:
# Number of ratings per user, most active first; the users ranked 2nd to 4th
# are the candidates for the analysis below.
user_rate_df['userId'].value_counts().head()

123100    9279
117490    6998
242683    6901
212343    6195
63783     5728
Name: userId, dtype: int64

In [145]:
# userId analysed in the rest of the notebook: 117490 has the second-highest
# rating count in the value_counts() output above.
user = 117490

In [129]:
# Ratings given by the selected user, reduced to the movieId and rating columns.
# The columns are picked by name rather than by position (iloc[:, 1:3]) so the
# result does not depend on the column order of user_rate_df.
user_grade_df = user_rate_df.loc[user_rate_df['userId'] == user, ['movieId', 'rating']]
user_grade_df.head()

Unnamed: 0,movieId,rating
11445100,1,4.0
11445101,2,4.0
11445102,3,3.0
11445103,4,3.0
11445104,5,3.0


## Filter by user

In [130]:
# Genre flags of the movies the user has rated.
# A boolean mask keeps the rows in movies_genre_df order; the previous
# set-intersection lookup returned them in arbitrary set-iteration order.
user_movie_df = movies_genre_df[movies_genre_df.index.isin(user_grade_df['movieId'])]
user_movie_df.shape

(6998, 18)

In [135]:
# Genre flags of the movies the user has NOT rated, ordered by movieId and
# limited to movies that have tag-genome relevance scores.
user_movie_noshow_df = (
    movies_genre_df
    .loc[lambda flags: ~flags.index.isin(user_grade_df['movieId'])]
    .sort_index()
    .loc[lambda flags: flags.index.isin(tag_relevance_df['movieId'].values)]
)
user_movie_noshow_df.shape

(6149, 18)

In [136]:
# Restrict the user's ratings to movies present in the tag-genome relevance table.
user_grade_df = user_grade_df.loc[
    user_grade_df['movieId'].isin(tag_relevance_df['movieId'].values)
]
user_grade_df.shape

(6998, 2)

In [138]:
# Genre-tag relevance rows for the movies the user has rated.
user_relevance_df = tag_rel_small_df.loc[
    tag_rel_small_df['movieId'].isin(user_grade_df['movieId'].values)
]
user_relevance_df.shape

(125964, 3)

## Calculate genre relationship

In [139]:
# For every genre, sum rating * relevance over the movies the user rated that
# carry that genre flag.
# tag_genre_dict (genre -> genome tagId) is defined once in the Tag Relevance
# cell; its key order also fixes the key order of the sums.
user_genre_sum_dict = dict.fromkeys(tag_genre_dict, 0)

# (movieId, tagId) -> relevance, built once so each lookup is a dict access
# instead of two boolean masks over the whole of user_relevance_df.
# drop_duplicates keeps the first row per pair, matching the old `.values[0]`;
# values stay numpy floats so round() behaves exactly as before.
unique_relevance_df = user_relevance_df.drop_duplicates(subset=['movieId', 'tagId'])
relevance_lookup = dict(zip(
    zip(unique_relevance_df['movieId'].tolist(), unique_relevance_df['tagId'].tolist()),
    unique_relevance_df['relevance'].to_numpy(),
))

for movieId, rating in zip(user_grade_df.movieId, user_grade_df.rating):
    movie_genre_flags = user_movie_df.loc[movieId]  # one row fetch per movie, not per genre
    for genre_key, tag_id in tag_genre_dict.items():
        if movie_genre_flags[genre_key] == 1:
            # NOTE(review): each term is rounded to 2 decimals before summing,
            # as in the original, so the totals stay identical.
            # A missing (movieId, tagId) pair raises KeyError here.
            user_genre_sum_dict[genre_key] += round(rating * relevance_lookup[(movieId, tag_id)], 2)
user_genre_sum_dict

{'Action': 2454.67,
 'Adventure': 1544.7200000000003,
 'Animation': 625.8200000000003,
 'Children': 839.7400000000002,
 'Comedy': 4557.9499999999925,
 'Crime': 1845.3200000000018,
 'Documentary': 601.18,
 'Drama': 6696.609999999982,
 'Fantasy': 704.92,
 'Film-Noir': 355.09999999999985,
 'Horror': 880.9000000000003,
 'Musical': 816.8200000000005,
 'Mystery': 739.3600000000007,
 'Romance': 2327.5700000000015,
 'Sci-Fi': 959.3700000000007,
 'Thriller': 1982.669999999998,
 'War': 841.8200000000003,
 'Western': 572.2000000000004}

In [154]:
# Average rating-weighted relevance per genre: each genre's total divided by
# the number of rated movies carrying that genre flag.
# Counts are looked up by genre label, so the result no longer relies on the
# dict keys and the DataFrame columns happening to be in the same order.
genre_movie_counts = user_movie_df.sum()
user_genre_avg_dict = {
    genre_key: round(genre_total / genre_movie_counts[genre_key], 2)
    for genre_key, genre_total in user_genre_sum_dict.items()
}
user_genre_avg_dict

{'Action': 2.27,
 'Adventure': 2.0,
 'Animation': 3.21,
 'Children': 2.14,
 'Comedy': 1.76,
 'Crime': 1.95,
 'Documentary': 2.7,
 'Drama': 1.85,
 'Fantasy': 1.89,
 'Film-Noir': 2.93,
 'Horror': 2.18,
 'Musical': 2.46,
 'Mystery': 1.76,
 'Romance': 1.75,
 'Sci-Fi': 2.06,
 'Thriller': 1.57,
 'War': 2.39,
 'Western': 2.67}

# Recommend

In [155]:
# One-row DataFrame of the user's per-genre averages, indexed by the userId.
users_genre_rel_df = pd.DataFrame(user_genre_avg_dict, index=[user])
users_genre_rel_df

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
117490,2.27,2.0,3.21,2.14,1.76,1.95,2.7,1.85,1.89,2.93,2.18,2.46,1.76,1.75,2.06,1.57,2.39,2.67


In [156]:
# Replace each unseen movie's 0/1 genre flag with the genome relevance of the
# matching genre tag (flag * relevance).
# The weights are always recomputed from the 0/1 flags in movies_genre_df, so
# the cell is safe to re-run. The previous in-place loop multiplied values it
# had already weighted, squaring the relevances on a second run.
tag_to_genre = {tag_id: genre_key for genre_key, tag_id in tag_genre_dict.items()}

# movieId x genre table of relevance values (first row per pair wins, as the
# old `.values[0]` lookup did).
genre_relevance_df = (
    tag_rel_small_df
    .drop_duplicates(subset=['movieId', 'tagId'])
    .pivot(index='movieId', columns='tagId', values='relevance')
    .rename(columns=tag_to_genre)
)

unseen_genre_flags = movies_genre_df.loc[user_movie_noshow_df.index, list(tag_genre_dict)]

# A movie/genre pair with no relevance row surfaces as NaN instead of being
# silently scored.
user_movie_noshow_df = unseen_genre_flags * genre_relevance_df.reindex(
    index=unseen_genre_flags.index, columns=unseen_genre_flags.columns
)
user_movie_noshow_df

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
12,0.000000,0.000000,0.0,0.000000,0.720377,0.0,0.000000,0.0,0.000000,0.0,0.077981,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
15,0.956484,0.952088,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.216458,0.000000,0.0,0.0,0.0
33,0.000000,0.272484,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.028224,0.000000,0.0,0.0,0.0
37,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.045796,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
56,0.000000,0.101761,0.0,0.100806,0.023562,0.0,0.000000,0.0,0.325756,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185435,0.097032,0.000000,0.0,0.000000,0.355812,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
185585,0.439569,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.046981,0.0,0.000000,0.0,0.0,0.000000,0.304152,0.0,0.0,0.0
186587,0.318096,0.069828,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.324615,0.0,0.0,0.0
187593,0.702663,0.000000,0.0,0.000000,0.670352,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.023562,0.0,0.0,0.0


In [214]:
# Score every unseen movie: the user's 1 x genre preference row multiplied by
# the genre x movie weight matrix, then list movies from highest to lowest score.
test03 = user_movie_noshow_df.transpose()
user_rec_df = users_genre_rel_df @ test03
user_rec_df.transpose().sort_values(by=user, ascending=False)


Unnamed: 0_level_0,117490
movieId,Unnamed: 1_level_1
98243,7.878711
71129,7.528257
4366,7.244934
135887,7.163920
166461,7.134749
...,...
1039,0.005168
25771,0.003743
797,0.003446
751,0.003381


In [202]:
# Titles of the three highest-scoring unseen movies, read from user_rec_df
# rather than from movie ids copied by hand out of a previous output.
top3_movie_ids = user_rec_df.loc[user].nlargest(3).index
tuple(
    f'{rank}: {movies_df.loc[movie_id, "title"]}'
    for rank, movie_id in zip(('1st', '2nd', '3rd'), top3_movie_ids)
)

('1st: Rise of the Guardians (2012)',
 '2nd: Green Lantern: First Flight (2009)',
 '3rd: Atlantis: The Lost Empire (2001)')

In [213]:
# Title and genres of the best- and worst-scoring unseen movies, taken from
# the ranking itself instead of hard-coded movie ids.
ranked_scores = user_rec_df.loc[user].sort_values(ascending=False)
best_movie_id, worst_movie_id = ranked_scores.index[0], ranked_scores.index[-1]
(
    '1st-title: ' + movies_df.loc[best_movie_id, 'title'],
    '1st-genre: ' + movies_df.loc[best_movie_id, 'genres'],
    'last-title: ' + movies_df.loc[worst_movie_id, 'title'],
    'last-genre: ' + movies_df.loc[worst_movie_id, 'genres'],
)

('1st-title: Rise of the Guardians (2012)',
 '1st-genre: Adventure|Animation|Children|Fantasy|IMAX',
 'last-title: Meshes of the Afternoon (1943)',
 'last-genre: Fantasy')

In [215]:
# Genres of the top-scoring recommendation as a list.
# The unfinished `for movieId in` header that preceded this expression was a
# SyntaxError and stopped the cell (and any Run All) from executing.
top_movie_id = user_rec_df.loc[user].idxmax()
movies_df.loc[top_movie_id, 'genres'].split('|')

['Adventure', 'Animation', 'Children', 'Fantasy', 'IMAX']