# 추천 시스템

- 사용자 집단별 추천
    - 성별

## 전처리

In [39]:
import pandas as pd
import numpy as np

In [2]:
## Users: load the user table and preview the first two rows
users = pd.read_csv('users.csv')
users[:2]

Unnamed: 0,user_id,age,gender,job,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043


In [3]:
## Movie ratings: load the rating records and preview the first two rows
ratings = pd.read_csv('ratings.csv')
ratings[:2]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,55,5,875072688
1,1,203,4,878542231


In [4]:
## Drop the timestamp column in place
ratings.drop('timestamp', axis=1, inplace=True)
ratings[:2]

Unnamed: 0,user_id,movie_id,rating
0,1,55,5
1,1,203,4


In [5]:
## Movie information: load the movie table and preview the first two rows
movies = pd.read_csv('movies.csv')
movies[:2]

Unnamed: 0,movie_id,title,release date,imdb url,action,adventure,animation,children,comedy,crime,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
## Keep only the movie_id and title columns
movies = movies[['movie_id', 'title']]
movies[:2]

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)


## 데이터 셋 분리
- Train data
    - 모델을 학습하기 위한 데이터 셋
    - 학습은 최적의 파라미터를 찾는 것
    - 학습을 위한 데이터
- Test data
    - 모델의 '최종 성능'을 평가하기 위한 데이터 셋
    - 모델 학습에 관여하지 않음
- Train data로 학습하고, Test data로 최종 성능 평가

In [12]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.2.2-cp38-cp38-win_amd64.whl (8.3 MB)
Collecting scipy>=1.3.2
  Using cached scipy-1.10.1-cp38-cp38-win_amd64.whl (42.2 MB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.2 scipy-1.10.1 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


### train, test set 분리

In [7]:
## Prepare the inputs for the train / test split
from sklearn.model_selection import train_test_split

## x: an independent copy of the full ratings table (feature data)
x = ratings.copy()
## y: the user_id column (target data; also passed as stratify= in the split)
y = ratings.user_id

In [30]:
## x_train : rows used to build the models
## x_test : held-out rows used to measure prediction error
## y_train, y_test : the user_id values of the matching rows
x_train, x_test, y_train, y_test = train_test_split(x,               ## feature data
                                                    y,               ## target data
                                                    test_size=0.25,  ## fraction of rows held out as test data
                                                    random_state=1,  ## fixed seed so the split is reproducible
                                                    #shuffle=True,   ## default : True
                                                    stratify=y,      ## split so each user_id keeps the same share of rows in train and test
                                                   )

### train, test 크기 확인

In [23]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,55,5
1,1,203,4
2,1,183,5
3,1,150,5
4,1,68,4
...,...,...,...
99995,943,427,4
99996,943,12,5
99997,943,284,2
99998,943,62,3


In [11]:
x.shape

(100000, 3)

In [12]:
## 전체 data(row) : 75%
x_train.shape

(75000, 3)

In [13]:
## 전체 data(row) : 25%
x_test.shape

(25000, 3)

In [14]:
y_train.shape

(75000,)

In [15]:
y_test.shape

(25000,)

### random_state : 설정했을 때

In [16]:
## [random_state=1] ##
## 첫 번째 실행
x_train

Unnamed: 0,user_id,movie_id,rating
52205,458,531,5
266,1,6,5
23482,236,56,5
78508,721,303,3
71009,648,161,3
...,...,...,...
50057,447,760,4
98047,922,62,3
5192,49,209,5
77708,712,1053,4


In [18]:
## [random_state=1] ##
## 두 번째 실행
x_train

Unnamed: 0,user_id,movie_id,rating
52205,458,531,5
266,1,6,5
23482,236,56,5
78508,721,303,3
71009,648,161,3
...,...,...,...
50057,447,760,4
98047,922,62,3
5192,49,209,5
77708,712,1053,4


### random_state : 설정하지 않았을 때

In [20]:
## [random_state 설정하지 않음] ##
## 첫 번째 실행
x_train

Unnamed: 0,user_id,movie_id,rating
54281,479,500,4
28535,280,220,5
20727,210,402,5
19568,200,1228,4
57957,514,68,4
...,...,...,...
60531,537,844,4
83803,784,260,4
41204,379,202,5
55885,495,201,2


In [22]:
## [random_state 설정하지 않음] ##
## 두 번째 실행
x_train

Unnamed: 0,user_id,movie_id,rating
35944,330,200,5
84675,794,248,4
50562,450,99,4
27295,273,900,3
42055,385,173,4
...,...,...,...
30747,296,696,4
20710,210,154,4
66292,601,820,1
28865,283,91,5


In [25]:
ratings.groupby('user_id').size()

user_id
1      272
2       62
3       54
4       24
5      175
      ... 
939     49
940    107
941     22
942     79
943    168
Length: 943, dtype: int64

### stratify : 설정했을 때

In [31]:
## [stratify=y]
x_train.groupby('user_id').size()

user_id
1      204
2       46
3       40
4       18
5      131
      ... 
939     37
940     80
941     16
942     59
943    126
Length: 943, dtype: int64

In [32]:
272 * 0.75

204.0

In [33]:
62 * 0.75

46.5

In [34]:
54 * 0.75

40.5

### stratify  : 설정하지 않았을 때

In [26]:
## [stratify 설정하지 않음]
x_train.groupby('user_id').size()

user_id
1      201
2       47
3       47
4       19
5      137
      ... 
939     36
940     72
941     18
942     56
943    124
Length: 943, dtype: int64

In [27]:
272 * 0.75

204.0

In [28]:
62 * 0.75

46.5

In [29]:
54 * 0.75

40.5

In [25]:
ratings.groupby('user_id').size()

user_id
1      272
2       62
3       54
4       24
5      175
      ... 
939     49
940    107
941     22
942     79
943    168
Length: 943, dtype: int64

## 정확도 (Accuracy)
- 10분 동안 줄넘기 횟수

이름 | 홍길동 | 박보검 | 이미자
-- | -- | -- | --
예측 | 50 | 35 | 40
실제 | 60 | 20 | 45

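- 오차(잔차) : 실제값 - 예측값 ($y - \hat{y}$)
    - 오차 : 10, -15, 5
- 오차(잔차) 합 : 양수와 음수 오차가 서로 상쇄되어 0이나 음수가 될 수 있음 -> 그래서 오차를 제곱한 뒤 합을 구함
- 평균 제곱 오차(MSE : Mean Squared Error) : 오차 제곱의 평균, $\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
- 평균 제곱근 오차(RMSE : Root Mean Squared Error) : MSE의 제곱근, $\sqrt{\mathrm{MSE}}$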

In [35]:
15 * 15

225

In [37]:
## MSE
mse = ((10)**2 + (-15)**2 + 5**2)/3
mse

116.66666666666667

In [38]:
## RMSE
import math

math.sqrt(mse)

10.801234497346433

### RMSE 정의

In [57]:
## Error metric: root mean squared error (RMSE)
def RMSE(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred.

    Both arguments are converted with np.array before subtracting, so
    array-like inputs such as plain lists can be passed directly.
    """
    errors = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(errors ** 2)
    return np.sqrt(mean_squared_error)

### 모델별 예측치의 정확도 계산

In [58]:
## Compute the RMSE of a model's predictions on the test set
def score(model):
    """Return the RMSE of `model` over every (user_id, movie_id) pair in x_test.

    Parameters
    ----------
    model : callable
        Called as model(user_id, movie_id); must return the predicted rating.

    Returns
    -------
    The RMSE between x_test['rating'] and the model's predictions.
    """
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])

    # Use the model that was passed in rather than a hard-coded predictor,
    # so this one function can score any model.
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    print('y_pred shape >>', y_pred.shape)
    print('y_pred >>', y_pred)

    y_true = np.array(x_test['rating'])

    return RMSE(y_true, y_pred)

In [41]:
x_test[:2]

Unnamed: 0,user_id,movie_id,rating
10168,94,31,4
27028,271,697,4


In [43]:
for z in zip([1, 3, 5], ['a', 'b', 'c']):
    print(z)

(1, 'a')
(3, 'b')
(5, 'c')


## 모델

In [46]:
## Mean rating per movie, computed from the training split only
train_mean = x_train.groupby('movie_id').rating.mean()

In [48]:
train_mean[:3]

movie_id
1    3.888889
2    3.147059
3    3.013699
Name: rating, dtype: float64

### best-seller 모델 : 예측

In [50]:
## Baseline model: predict each movie's mean training rating
def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating stored in train_mean.

    user_id is accepted so the function can be called as
    best_seller(user, movie), but it is not used in the prediction.

    Returns 3.0 when movie_id has no entry in train_mean.
    """
    try:
        rating = train_mean[movie_id]
    except KeyError:
        # Only a missing movie falls back to the default; any other error
        # (including a keyboard interrupt) is allowed to propagate.
        print('movie_id 없음 >>', movie_id)
        rating = 3.0
    return rating

In [55]:
train_mean[31]

3.6608695652173915

## best-seller 정확도 측정

In [59]:
score(best_seller)

movie_id 없음 >> 1372
movie_id 없음 >> 1505
movie_id 없음 >> 1673
movie_id 없음 >> 1458
movie_id 없음 >> 1581
movie_id 없음 >> 1641
movie_id 없음 >> 1372
movie_id 없음 >> 1682
movie_id 없음 >> 1649
movie_id 없음 >> 1329
movie_id 없음 >> 814
movie_id 없음 >> 1669
movie_id 없음 >> 1309
movie_id 없음 >> 1125
movie_id 없음 >> 1596
movie_id 없음 >> 1614
movie_id 없음 >> 1661
movie_id 없음 >> 1358
movie_id 없음 >> 1564
movie_id 없음 >> 1525
movie_id 없음 >> 1130
movie_id 없음 >> 1125
movie_id 없음 >> 1671
movie_id 없음 >> 1667
movie_id 없음 >> 1630
movie_id 없음 >> 1678
movie_id 없음 >> 1457
movie_id 없음 >> 1125
movie_id 없음 >> 1372
movie_id 없음 >> 1340
movie_id 없음 >> 1650
movie_id 없음 >> 1498
movie_id 없음 >> 1606
movie_id 없음 >> 1636
movie_id 없음 >> 1557
movie_id 없음 >> 1580
movie_id 없음 >> 1526
movie_id 없음 >> 1660
movie_id 없음 >> 1570
movie_id 없음 >> 1125
movie_id 없음 >> 1621
movie_id 없음 >> 1532
movie_id 없음 >> 1571
movie_id 없음 >> 1663
movie_id 없음 >> 1532
movie_id 없음 >> 1482
movie_id 없음 >> 1563
movie_id 없음 >> 1358
movie_id 없음 >> 1659
movie_id 없음 >> 1460
m

1.0203673133357947

In [68]:
train_mean[:2]

movie_id
1    3.888889
2    3.147059
Name: rating, dtype: float64

## Gender 기준 추천 모델

In [70]:
## 영화별 성별별 평점 평균 계산
users[:2] ## 성별 

Unnamed: 0,user_id,age,gender,job,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043


In [74]:
ratings[:2]

Unnamed: 0,user_id,movie_id,rating
0,1,55,5
1,1,203,4


In [86]:
one = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['홍길동', '이미자', '박보검']
})

two = pd.DataFrame({
    'id': [1, 2, 5],
    'hire_date': [2020, 2010, 2002]
})

pd.merge(one, two)

Unnamed: 0,id,name,hire_date
0,1,홍길동,2020
1,2,이미자,2010


In [88]:
## Merge x_train with users so every rating row carries the rater's attributes.
## No join key is given, so pd.merge joins on the columns both frames share
## (user_id, per the previews above), keeping only matching rows by default.
merged_ratings = pd.merge(x_train, users)
merged_ratings

Unnamed: 0,user_id,movie_id,rating,age,gender,job,zip_code
0,616,678,2,55,M,scientist,50613
1,616,339,3,55,M,scientist,50613
2,616,895,3,55,M,scientist,50613
3,616,362,3,55,M,scientist,50613
4,616,258,4,55,M,scientist,50613
...,...,...,...,...,...,...,...
74995,820,289,2,22,M,student,37725
74996,820,288,5,22,M,student,37725
74997,820,333,5,22,M,student,37725
74998,820,751,1,22,M,student,37725


In [96]:
## Mean rating per (movie_id, gender) pair.
## The result is a Series with a two-level (movie_id, gender) index.
g_mean = merged_ratings[['movie_id', 'gender', 'rating']].groupby(['movie_id', 'gender']).rating.mean()
g_mean

movie_id  gender
1         F         3.897727
          M         3.885827
2         F         3.428571
          M         3.102273
3         F         2.666667
                      ...   
1676      M         2.000000
1677      F         3.000000
1679      M         3.000000
1680      M         2.000000
1681      M         3.000000
Name: rating, Length: 3043, dtype: float64

In [93]:
## NOTE(review): the output recorded below shows (key, DataFrame) pairs, which is what
## iterating a groupby object produces; iterating the g_mean Series from In [96] would
## yield plain mean values. Presumably this cell (In [93]) ran against an earlier
## definition of g_mean - confirm before relying on this output.
for g in g_mean:
    print(g)

((1, 'F'),        movie_id gender  rating
804           1      F       4
3819          1      F       4
4301          1      F       4
4890          1      F       1
5039          1      F       3
...         ...    ...     ...
74306         1      F       4
74336         1      F       5
74478         1      F       4
74485         1      F       5
74869         1      F       4

[88 rows x 3 columns])
((1, 'M'),        movie_id gender  rating
214           1      M       3
655           1      M       4
1021          1      M       5
1124          1      M       4
1455          1      M       5
...         ...    ...     ...
73934         1      M       4
74226         1      M       5
74432         1      M       4
74610         1      M       3
74907         1      M       5

[254 rows x 3 columns])
((2, 'F'),        movie_id gender  rating
13484         2      F       5
17781         2      F       3
20269         2      F       1
24804         2      F       3
32163         2    

In [97]:
x_train.shape

(75000, 3)

In [99]:
users.shape

(943, 5)

In [101]:
## Index users by user_id so a user's row can be looked up with users.loc[user_id].
## NOTE(review): this mutates users in place; once it has run, user_id is no longer a
## column, so rerunning this cell or the pd.merge(x_train, users) cell presumably
## fails - confirm, and keep the cell order in mind when re-executing.
users.set_index('user_id', inplace=True)
users[:2]

Unnamed: 0_level_0,age,gender,job,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043


In [103]:
x_train[:2]

Unnamed: 0,user_id,movie_id,rating
67396,616,678,2
94307,889,232,3


In [107]:
## Reshape x_train into a full user x movie matrix:
## one row per user_id, one column per movie_id, each cell holding the rating
## (NaN where the user has no training rating for that movie).
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1672,1674,1675,1676,1677,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,,,3.0,,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,,3.0,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


## 정확도 계산

In [115]:
## Compute the RMSE of a model's predictions on the test set
def score(model):
    """Return the RMSE of `model` over every (user_id, movie_id) pair in x_test.

    Parameters
    ----------
    model : callable
        Called as model(user_id, movie_id); must return the predicted rating.

    Returns
    -------
    The RMSE between x_test['rating'] and the model's predictions.
    """
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])

    # Use the model that was passed in rather than a hard-coded predictor,
    # so this one function can score any model.
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    print(y_pred)

    y_true = np.array(x_test['rating'])

    return RMSE(y_true, y_pred)

In [144]:
#g_mean[movie_id][gender]
g_mean[1].M

3.8858267716535435

In [133]:
## Gender-based recommendation model
## Predicts a rating as the mean rating given to the movie by the user's gender
def cf_gender(user_id, movie_id):
    """Return the mean training rating of movie_id among users of user_id's gender.

    Falls back to 3.0 when the movie is not a column of rating_matrix, or
    when g_mean has no entry for this movie and gender.
    """
    default_rating = 3.0

    # Movie absent from the training matrix: nothing to average
    if movie_id not in rating_matrix:
        return default_rating

    gender = users.loc[user_id].gender
    means_by_gender = g_mean[movie_id]

    # Movie was rated in training, but not by anyone of this gender
    if gender not in means_by_gender:
        return default_rating

    return means_by_gender[gender]

In [116]:
score(cf_gender)

[3.65656566 3.13043478 4.38507463 ... 3.55882353 3.5        3.63114754]


1.0306597409002642