In [105]:
import pandas as pd
import matplotlib as plt
import numpy as np

## read csv

In [106]:
# Load the four MovieLens tables used by this notebook.
# movies.csv is read with its first column as the index so later cells can
# look movies up by id through the index.
movies_genres_df = pd.read_csv('./ml-latest/movies.csv', index_col=0)
user_rate_df, tags_df, tag_relevance_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        './ml-latest/ratings.csv',
        './ml-latest/genome-tags.csv',
        './ml-latest/genome-scores.csv',
    )
]

## check data & preprocess

In [107]:
# (rows, columns) of every loaded table, in load order.
tuple(frame.shape for frame in (user_rate_df, movies_genres_df, tags_df, tag_relevance_df))

((27753443, 4), (58098, 2), (1128, 2), (14862528, 3))

In [108]:
# Preview the first five rating rows.
user_rate_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [109]:
# Change the pipe-separated 'genres' string into one 0/1 indicator column per
# genre (one-hot encoding), then drop the 'IMAX' indicator.
# NOTE(review): IMAX is presumably dropped because it is a format rather than
# a genre and has no entry in the genre -> tag mapping used later; confirm.
movies_genre_df = (
    movies_genres_df['genres']
    .str.get_dummies(sep='|')
    .drop(columns='IMAX')
)
movies_genre_df.head()


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [110]:
# Movies flagged '(no genres listed)' are removed here.
# Counts before filtering:
#   0    53832
#   1     4266  <- these rows are dropped
no_genre_col = '(no genres listed)'
movies_genre_df[no_genre_col].value_counts()

# Keep only the movies that have at least one genre listed.
has_genre = movies_genre_df[no_genre_col].eq(0)
movies_genre_df = movies_genre_df.loc[has_genre]
movies_genre_df[no_genre_col].value_counts()

0    53832
Name: (no genres listed), dtype: int64

In [111]:
# Preview the first five tag rows.
tags_df.head(5)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [112]:
tag_relevance_df.head(5)
# Number of relevance rows recorded for each tagId.
tag_relevance_df['tagId'].value_counts()

1       13176
750     13176
756     13176
755     13176
754     13176
        ...  
383     13176
384     13176
385     13176
386     13176
1128    13176
Name: tagId, Length: 1128, dtype: int64

## User

In [113]:
# List the users with the most ratings; the user to analyse is picked from
# the 2nd, 3rd and 4th most active ones.
ratings_per_user = user_rate_df['userId'].value_counts()
ratings_per_user.head()
# 117490 <- chosen

123100    23715
117490     9278
134596     8381
212343     7884
242683     7515
Name: userId, dtype: int64

In [114]:
# Build a [movieId, rating] frame holding every rating given by user 117490.
# The columns are selected by name rather than by position so the result does
# not silently change if the column layout of ratings.csv ever differs.
user = 117490
user_grade_df = user_rate_df.loc[user_rate_df['userId'] == user, ['movieId', 'rating']]
user_grade_df.head()

Unnamed: 0,movieId,rating
11445100,1,4.0
11445101,2,4.0
11445102,3,3.0
11445103,4,3.0
11445104,5,3.0


## Filter by user

In [130]:
# movie df filter by user: genre rows of the movies this user rated.
# Only rated movies that also have tag-relevance scores are kept, so this cell
# produces the same rows whether or not user_grade_df has already been narrowed
# to scored movies (the notebook was originally executed out of file order).
# A boolean mask keeps the rows in movies_genre_df order instead of the
# arbitrary order produced by a set -> list round-trip.
rated_with_scores = user_grade_df.loc[
    user_grade_df['movieId'].isin(tag_relevance_df['movieId']), 'movieId'
]
user_movie_df = movies_genre_df[movies_genre_df.index.isin(rated_with_scores)]
user_movie_df.shape

(6998, 19)

In [133]:
# Genre rows of every movie that is NOT in the user's rating frame
# (presumably the pool of movies to recommend from; confirm).
# A boolean mask keeps movies_genre_df's row order; the previous set
# difference returned the rows in arbitrary set order.
# NOTE(review): the result depends on whether user_grade_df has already been
# narrowed to movies with tag-relevance scores when this cell runs; confirm
# which of the two is intended.
user_movie_noshow_df = movies_genre_df[~movies_genre_df.index.isin(user_grade_df.movieId)]
user_movie_noshow_df.shape

(46834, 19)

In [125]:
# user rate filter by movie: keep only the ratings whose movie appears in the
# tag-relevance table.
has_scores = user_grade_df['movieId'].isin(tag_relevance_df['movieId'].values)
user_grade_df = user_grade_df.loc[has_scores]
user_grade_df.shape

(6998, 2)

In [123]:
# movie relevance filter by user: keep only the relevance rows that belong to
# movies present in the user's rating frame.
rated_by_user = tag_relevance_df['movieId'].isin(user_grade_df['movieId'].values)
user_relevance_df = tag_relevance_df.loc[rated_by_user]
user_relevance_df.shape

(7893744, 3)

## Calculate genre relationship

In [127]:
# Calculate, per genre, the sum of rating * relevance over the movies the user
# rated: each movie flagged with a genre contributes its rating multiplied by
# the relevance of that genre's tag for the movie (rounded to 2 decimals).
#
# tag_genre_dict maps every genre column to the tagId used as that genre's tag
# in the relevance table.
# NOTE(review): the ids are hard-coded; presumably they were looked up in
# genome-tags.csv. Verify them against tags_df.
tag_genre_dict = {'Action': 19, 'Adventure': 29, 'Animation': 64, 'Children': 204, 'Comedy': 230, 'Crime': 268, 'Documentary': 315, 'Drama': 323, 'Fantasy': 377, 'Film-Noir': 393, 'Horror': 522, 'Musical': 686, 'Mystery': 689, 'Romance': 863, 'Sci-Fi': 887, 'Thriller': 1025, 'War': 1096, 'Western': 1107}
# Accumulators start at 0 and share tag_genre_dict's keys and key order.
user_genre_sum_dict = dict.fromkeys(tag_genre_dict, 0)

# Build both lookups once. The previous version re-scanned the whole relevance
# frame with two boolean masks for every (movie, genre) pair.
genre_tag_rows = user_relevance_df[user_relevance_df['tagId'].isin(list(tag_genre_dict.values()))]
# Keep the first row per (movieId, tagId), i.e. the row `.values[0]` used to pick.
genre_tag_rows = genre_tag_rows.drop_duplicates(subset=['movieId', 'tagId'])
# Relevance values stay NumPy floats so the rounding below behaves exactly as before.
relevance_lookup = dict(zip(
    zip(genre_tag_rows['movieId'].tolist(), genre_tag_rows['tagId'].tolist()),
    genre_tag_rows['relevance'].values,
))
# {movieId: {genre: 0/1}} for the user's movies.
genre_flags_by_movie = user_movie_df[list(tag_genre_dict)].to_dict('index')

# A rated movie missing from either lookup raises KeyError.
for movieId, rating in zip(user_grade_df.movieId, user_grade_df.rating):
    movie_flags = genre_flags_by_movie[movieId]
    for genre_key, tag_id in tag_genre_dict.items():
        if movie_flags[genre_key] == 1:
            user_genre_sum_dict[genre_key] += round(rating * relevance_lookup[(movieId, tag_id)], 2)
user_genre_sum_dict

{'Action': 2454.67,
 'Adventure': 1544.7200000000003,
 'Animation': 625.8200000000003,
 'Children': 839.7400000000002,
 'Comedy': 4557.9499999999925,
 'Crime': 1845.3200000000018,
 'Documentary': 601.18,
 'Drama': 6696.609999999982,
 'Fantasy': 704.92,
 'Film-Noir': 355.09999999999985,
 'Horror': 880.9000000000003,
 'Musical': 816.8200000000005,
 'Mystery': 739.3600000000007,
 'Romance': 2327.5700000000015,
 'Sci-Fi': 959.3700000000007,
 'Thriller': 1982.669999999998,
 'War': 841.8200000000003,
 'Western': 572.2000000000004}

In [128]:
# Calculate the per-genre average: the genre's summed value divided by the
# number of the user's movies flagged with that genre.
# Counts are looked up by genre label instead of being zipped positionally, so
# the result no longer depends on the column order of user_movie_df or on
# '(no genres listed)' being its first column.
genre_counts = user_movie_df.sum()
user_genre_avg_dict = {}
for genre_key, genre_sum in user_genre_sum_dict.items():
    cnt = genre_counts[genre_key]
    # A genre with no rated movies has no meaningful average.
    user_genre_avg_dict[genre_key] = round(genre_sum / cnt, 2) if cnt else float('nan')
user_genre_avg_dict

{'Action': 1.96,
 'Adventure': 1.6,
 'Animation': 2.98,
 'Children': 1.98,
 'Comedy': 1.38,
 'Crime': 1.48,
 'Documentary': 2.03,
 'Drama': 1.37,
 'Fantasy': 1.68,
 'Film-Noir': 1.4,
 'Horror': 1.91,
 'Musical': 1.49,
 'Mystery': 1.33,
 'Romance': 1.25,
 'Sci-Fi': 1.91,
 'Thriller': 1.35,
 'War': 1.61,
 'Western': 1.21}

## Recommend

In [136]:
# One-row frame (index 0) with one column per genre, holding the user's
# per-genre averages.
test01 = pd.DataFrame(user_genre_avg_dict, index=[0])
test01

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.96,1.6,2.98,1.98,1.38,1.48,2.03,1.37,1.68,1.4,1.91,1.49,1.33,1.25,1.91,1.35,1.61,1.21
