In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
# about rating
path_rating = 'drive/MyDrive/Notebooks/ml-100k/u.data'
# about movie
path_movie = 'drive/MyDrive/Notebooks/ml-100k/u.item'
# about user
path_user = 'drive/MyDrive/Notebooks/ml-100k/u.user'

In [4]:
def parse(s):
  seq = s.split(' | ')
  return list(map(lambda s: s.replace(' ', '_'), seq))

parse('user id | age | gender | occupation | zip code')

['user_id', 'age', 'gender', 'occupation', 'zip_code']

In [5]:
user_col = parse('user id | age | gender | occupation | zip code')
users = pd.read_csv(path_user, sep='|', names=user_col)
# users = users.set_index(user_col[0])
users.head(5)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
movie_col = parse("movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western")
movies = pd.read_csv(path_movie, sep='|', names=movie_col, encoding='ISO-8859-1')
# movies = movies.set_index(movie_col[0])
movies.head(5)

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [7]:
rating_col = parse('user id | item id | rating | timestamp')
ratings = pd.read_csv(path_rating, sep='\t', names=rating_col, encoding='ISO-8859-1')
# ratings = ratings.set_index(rating_col[0])
ratings.head(5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


**불필요한 컬럼 제거**

In [8]:
# user_id, gender 를 제외하고 전부 제거
filtered_users = users[['user_id', 'gender']]
filtered_users

Unnamed: 0,user_id,gender
0,1,M
1,2,F
2,3,M
3,4,M
4,5,F
...,...,...
938,939,F
939,940,M
940,941,M
941,942,F


In [9]:
filtered_movies = movies[['movie_id', 'movie_title']]
filtered_movies

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [10]:
filtered_ratings = ratings.drop('timestamp', axis=1)
filtered_ratings

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


**filtered_ratings (user_id, item_id, rating) 과 filtered_users (user_id, gender) 병합**

In [None]:
merged_ratings = pd.merge(filtered_ratings, filtered_users)
merged_ratings = merged_ratings.drop('user_id', axis=1)
merged_ratings

Unnamed: 0,item_id,rating,gender
0,242,3,M
1,393,4,M
2,381,4,M
3,251,3,M
4,655,5,M
...,...,...,...
99995,919,5,M
99996,273,3,M
99997,1,5,M
99998,294,4,M


**각 영화의 성별 별로 평점 출력**

In [None]:
gender_mean = merged_ratings.groupby(['item_id', 'gender'])['rating'].mean()
gender_mean

item_id  gender
1        F         3.789916
         M         3.909910
2        F         3.368421
         M         3.178571
3        F         2.687500
                     ...   
1678     M         1.000000
1679     M         3.000000
1680     M         2.000000
1681     M         3.000000
1682     M         3.000000
Name: rating, Length: 3139, dtype: float64

In [None]:
# 61번 item_id에 대해 Female의 평점
gender_mean.loc[(61, 'F')]

3.4

In [None]:
def RMSE(answer, prediction):
  e = (np.array(answer) - np.array(prediction)) ** 2
  return np.sqrt(np.mean((e)))

In [None]:
filtered_users = filtered_users.set_index('user_id')
filtered_ratings = filtered_ratings.set_index('user_id')

In [None]:
display(filtered_users)
display(filtered_ratings)

Unnamed: 0_level_0,gender
user_id,Unnamed: 1_level_1
1,M
2,F
3,M
4,M
5,F
...,...
939,F
940,M
941,M
942,F


Unnamed: 0_level_0,item_id,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
196,242,3
186,302,3
22,377,1
244,51,2
166,346,1
...,...,...
880,476,3
716,204,5
276,1090,1
13,225,2


In [None]:
rmse = []

for user in set(filtered_ratings.index):
  # 해당 유저의 성별과 실제 평점
  gender = filtered_users.loc[user]['gender']
  y_true = filtered_ratings.loc[user]['rating']
  # 영화별, 성별별로 매긴 평점
  y_pred = gender_mean.loc[(filtered_ratings.loc[user]['item_id'], gender)]
  accuracy = RMSE(y_true, y_pred)
  rmse.append(accuracy)

print(np.mean(rmse))


0.985372066296096


이번 노트북에서는 사용자의 '성별'이라는 특성을 고려하여 저번 RMSE (0.99) 보다 떨어진 것을 확인할 수 있다.

In [None]:
mean_map = dict()
for item_id, gender in gender_mean.index:
  answer = filtered_ratings.loc[item_id]['rating']
  prediction = gender_mean.loc[item_id][gender]
  cost = RMSE(answer, prediction)
  mean_map.setdefault(item_id, list()).append((gender, cost))
mean_map

{1: [('F', 0.7899159663865545), ('M', 0.9099099099099099)],
 2: [('F', 2.3684210526315788), ('M', 2.1785714285714284)],
 3: [('F', 0.6875), ('M', 1.108108108108108)],
 4: [('F', 2.4), ('M', 2.591463414634146)],
 5: [('F', 0.22727272727272707), ('M', 0.859375)],
 6: [('F', 1.6), ('M', 1.5714285714285716)],
 7: [('F', 1.4390243902439024), ('M', 1.1387096774193548)],
 8: [('F', 1.0499999999999998), ('M', 0.9748427672955975)],
 9: [('F', 0.927710843373494), ('M', 0.8842592592592591)],
 10: [('F', 1.7000000000000002), ('M', 1.8695652173913042)],
 11: [('F', 1.2264150943396226), ('M', 1.1311475409836067)],
 12: [('F', 0.666666666666667), ('M', 0.60093896713615)],
 13: [('F', 0.26923076923076916), ('M', 0.47727272727272707)],
 14: [('F', 0.8909090909090911), ('M', 1.0)],
 15: [('F', 0.9456521739130435), ('M', 0.7014925373134329)],
 16: [('F', 1.75), ('M', 1.806451612903226)],
 17: [('F', 0.8461538461538463), ('M', 1.1645569620253164)],
 18: [('F', 2.0), ('M', 1.0)],
 19: [('F', 2.285714285714