In [23]:
import numpy as np
import pandas as pd

# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]


# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# 정확도(RMSE)를 계산하는 함수 
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

# train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')


In [25]:
#### (1)
# train set의 모든 가능한 사용자 pair의 cosine similarity 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=matrix_dummy.index, columns=matrix_dummy.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.090150,0.038675,0.017515,0.300995,0.267927,0.317665,0.168357,0.060169,0.287061,...,0.278048,0.095107,0.168599,0.165417,0.150910,0.083042,0.243335,0.088674,0.134891,0.282196
2,0.090150,1.000000,0.077437,0.157710,0.055840,0.201824,0.081544,0.124552,0.179422,0.150867,...,0.134253,0.313816,0.285830,0.313372,0.290495,0.230615,0.172748,0.131614,0.128954,0.075790
3,0.038675,0.077437,1.000000,0.305140,0.000000,0.076178,0.051604,0.064511,0.022264,0.038761,...,0.041647,0.038045,0.136574,0.035105,0.094970,0.015493,0.117858,0.025239,0.031196,0.000000
4,0.017515,0.157710,0.305140,1.000000,0.029209,0.078581,0.062843,0.155274,0.129183,0.047349,...,0.068021,0.000000,0.123923,0.139276,0.112419,0.039954,0.192220,0.065089,0.085275,0.025768
5,0.300995,0.055840,0.000000,0.029209,1.000000,0.151452,0.277541,0.142731,0.037628,0.148397,...,0.242759,0.012860,0.064288,0.089086,0.101883,0.026184,0.167105,0.056307,0.134972,0.191170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.083042,0.230615,0.015493,0.039954,0.026184,0.101598,0.077412,0.094003,0.000000,0.079988,...,0.027387,0.399308,0.190844,0.193775,0.315581,1.000000,0.064803,0.046679,0.040386,0.073921
940,0.243335,0.172748,0.117858,0.192220,0.167105,0.277430,0.240705,0.186730,0.113443,0.210380,...,0.254087,0.086800,0.137717,0.177434,0.168604,0.064803,1.000000,0.103652,0.187897,0.188459
941,0.088674,0.131614,0.025239,0.065089,0.056307,0.107326,0.021595,0.101308,0.053664,0.074989,...,0.000000,0.146151,0.246895,0.283135,0.249552,0.046679,0.103652,1.000000,0.075193,0.070750
942,0.134891,0.128954,0.031196,0.085275,0.134972,0.248045,0.222920,0.114150,0.121048,0.187193,...,0.186801,0.077923,0.070827,0.098147,0.077922,0.040386,0.187897,0.075193,1.000000,0.125028


In [21]:
# (1)
#### train set의 모든 가능한 사용자의 pearson correlation 계산
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = matrix_dummy.T.corr('pearson')
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.085438,0.008552,0.015317,0.205262,0.207627,0.226455,0.195752,0.052019,0.216161,...,0.222342,0.066156,0.117450,0.093248,0.097698,0.072094,0.193578,0.106659,0.082423,0.235284
2,0.085438,1.000000,0.095160,0.122315,0.057488,0.166793,0.014797,-0.004163,0.068438,0.094330,...,0.070765,0.198200,0.183293,0.380873,0.271668,0.129159,0.128165,0.103423,0.168759,0.049139
3,0.008552,0.095160,1.000000,0.271831,-0.010201,0.019844,-0.011359,0.027293,-0.014677,-0.005842,...,-0.022266,0.037625,0.106183,0.060356,0.030168,0.012406,0.105487,0.071047,0.104022,-0.004335
4,0.015317,0.122315,0.271831,1.000000,-0.013515,0.011242,0.019018,0.126386,-0.010253,0.009860,...,-0.029180,0.036872,0.054241,0.150097,0.098068,0.025169,0.021073,0.054724,0.146689,-0.011704
5,0.205262,0.057488,-0.010201,-0.013515,1.000000,0.133215,0.227989,0.155140,0.050315,0.092977,...,0.166123,0.048338,0.021246,0.032450,0.097004,0.049598,0.170844,0.125120,0.038275,0.209527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.072094,0.129159,0.012406,0.025169,0.049598,0.022684,0.019895,0.076445,0.036826,0.006204,...,0.035626,0.349304,0.154651,0.133114,0.313280,1.000000,0.039718,0.097360,0.012225,0.110765
940,0.193578,0.128165,0.105487,0.021073,0.170844,0.219811,0.200600,0.140984,0.086534,0.212910,...,0.187102,0.098487,0.055839,0.155941,0.126465,0.039718,1.000000,0.138240,0.147198,0.134864
941,0.106659,0.103423,0.071047,0.054724,0.125120,0.117576,0.020239,0.107999,0.043164,0.015783,...,0.023848,0.184845,0.159017,0.125868,0.199616,0.097360,0.138240,1.000000,0.053514,0.043692
942,0.082423,0.168759,0.104022,0.146689,0.038275,0.212451,0.161691,0.088362,0.055905,0.114270,...,0.115253,0.052122,-0.005885,0.122276,0.064565,0.012225,0.147198,0.053514,1.000000,0.073077


In [22]:
# 주어진 영화 (movie_id) 가중평균 rating을 계산하는 함수
# 가중치는 주어진 사용자와 다른 사용자 간의 유사도(user_similarity)
def CF_simple(user_id, movie_id):
    mean_rating = 3.0
    if movie_id in rating_matrix:
        sim_score = user_similarity[user_id]
        movie_ratings = rating_matrix[movie_id]
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        sim_score = sim_score.drop(none_rating_idx)
        movie_ratings = movie_ratings.drop(none_rating_idx)
        
        mean_rating = np.dot(sim_score, movie_ratings)/sim_score.sum()
    return mean_rating

score(CF_simple)
        
        

1.0530191104411932

In [28]:
# Neighbor size를 정해서 예측치를 계산하는 함수
def cf_knn(user_id, movie_id, neighbor_size=0):
    prediction = 3.0
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        sim_scores = sim_scores.drop(none_rating_idx)
        movie_ratings = movie_ratings.drop(none_rating_idx)
        
        #### (2) neighbor_size가 지정되지 않은 경우(모든 사용자)
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        
        #### (3) neighbor_size가 지정된 경우
        else:
            if len(sim_scores)>1:
                neighbor_size = min(len(sim_scores), neighbor_size)
                
                # pd.Series -> np.array
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                
                # Sort the user similarity with user_id
                user_idx = np.argsort(sim_scores)
                
                # find the neighbors
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                
                # 특정 user의 movie에 대한 평가를 예측
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                
    return prediction
    
# 정확도 계산
print(score(cf_knn, 30))
    

1.0132599307881152


In [30]:
#### (4) 주어진 사용자에 대해 추천을 받기
# 전체 데이터로 full_matrix와 cosine similarity 구하기
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

# 특정 이용자에게 neighbor_size를 잡아 n_items개의 영화를 추천
def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_matrix.loc[user_id, :].copy()
    for movie in rating_matrix:
        # 이미 평가한 항목은 추천 대상이 아님
        if pd.notnull(user_movie.loc[movie]):    
            user_movie.loc[movie] = 0
        else:
            user_movie.loc[movie] = cf_knn(user_id, movie, neighbor_size)
            
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=2, n_items=5, neighbor_size=30)

movie_id
850           Two or Three Things I Know About Her (1966)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1293                     Ayn Rand: A Sense of Life (1997)
1516                                  Race the Sun (1996)
64                     What's Eating Gilbert Grape (1993)
Name: title, dtype: object

In [31]:
#### (5) 최적의 neighbor size 구하기
# train set으로 full matrix와 cosine similarity 구하기
from sklearn.metrics.pairwise import cosine_similarity
rating_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

for neighbor_size in range(10, 61, 10):
    print('Neighbor size=%d : RMSE = %.4f' %(neighbor_size,score(model=cf_knn, neighbor_size=neighbor_size)))

Neighbor size=10 : RMSE = 1.0310
Neighbor size=20 : RMSE = 1.0163
Neighbor size=30 : RMSE = 1.0133
Neighbor size=40 : RMSE = 1.0135
Neighbor size=50 : RMSE = 1.0140
Neighbor size=60 : RMSE = 1.0144


In [43]:
#### (1)
# 모든 user의 rating 평균과 영화의 평점편차 계산
rating_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')
rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T
rating_bias

def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    prediction = rating_mean[user_id]
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_rating = rating_bias[movie_id].copy()
        none_rating_idx = movie_rating[movie_rating.isnull()].index
        sim_scores = sim_scores.drop(none_rating_idx)
        movie_rating = movie_rating.drop(none_rating_idx)
        
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_rating) / sim_scores.sum()
            prediction += rating_mean[user_id]
            
        else:
            if len(sim_scores)>1:
                neighbor_size = min(len(sim_scores), neighbor_size)
                
                sim_scores = np.array(sim_scores)
                movie_rating = np.array(movie_rating)
                
                user_idx = np.argsort(sim_scores)
                
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_rating = movie_rating[user_idx][-neighbor_size:]
                
                prediction = np.dot(sim_scores, movie_rating) / sim_scores.sum()
                prediction += rating_mean[user_id]
                
    return prediction

print(score(CF_knn_bias, neighbor_size=30))

0.944985206067406


In [44]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [48]:
def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_bias.loc[user_id].copy()
    for movie in rating_bias:
        if pd.notnull(user_movie[movie]):
            user_movie[movie] = 0
        else:
            user_movie[movie] = CF_knn_bias(user_id, movie, neighbor_size)
            
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=2, n_items=5, neighbor_size=30)

movie_id
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
851                              Bloody Child, The (1996)
1293                     Ayn Rand: A Sense of Life (1997)
1642                                    Angel Baby (1995)
867                               Hearts and Minds (1996)
Name: title, dtype: object

In [49]:
#### (1)
# 사용자별 공통 평가 수 계산
rating_binary1 = np.array((rating_matrix>0).astype(float))
rating_binary2 = rating_binary1.T
counts = np.dot(rating_binary1, rating_binary2)
counts = pd.DataFrame(counts, index=rating_matrix.index, columns=rating_matrix.index).fillna(0)
counts

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,204.0,8.0,4.0,2.0,49.0,47.0,76.0,13.0,3.0,44.0,...,46.0,9.0,22.0,12.0,20.0,8.0,31.0,4.0,16.0,44.0
2,8.0,46.0,5.0,5.0,3.0,21.0,8.0,5.0,5.0,10.0,...,9.0,11.0,21.0,11.0,17.0,8.0,9.0,4.0,6.0,5.0
3,4.0,5.0,40.0,9.0,0.0,8.0,7.0,5.0,1.0,4.0,...,3.0,1.0,10.0,3.0,5.0,1.0,8.0,1.0,1.0,0.0
4,2.0,5.0,9.0,18.0,1.0,5.0,5.0,4.0,2.0,2.0,...,3.0,0.0,5.0,3.0,3.0,1.0,7.0,1.0,3.0,1.0
5,49.0,3.0,0.0,1.0,131.0,21.0,56.0,11.0,2.0,20.0,...,34.0,1.0,9.0,4.0,13.0,1.0,15.0,2.0,12.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,8.0,8.0,1.0,1.0,1.0,9.0,8.0,3.0,0.0,5.0,...,2.0,12.0,13.0,6.0,16.0,37.0,3.0,1.0,2.0,5.0
940,31.0,9.0,8.0,7.0,15.0,31.0,35.0,9.0,4.0,25.0,...,27.0,4.0,14.0,9.0,12.0,3.0,80.0,4.0,12.0,14.0
941,4.0,4.0,1.0,1.0,2.0,6.0,2.0,4.0,1.0,3.0,...,0.0,3.0,10.0,6.0,9.0,1.0,4.0,16.0,2.0,3.0
942,16.0,6.0,1.0,3.0,12.0,23.0,27.0,5.0,4.0,16.0,...,17.0,3.0,7.0,3.0,5.0,2.0,12.0,2.0,59.0,10.0


In [53]:
def CF_knn_bias_sig(user_id, movie_id, neighbor_size=0):
    prediction = rating_mean[user_id]
    if movie_id in rating_bias:
        sim_scores = user_similarity[user_id]
        movie_rating = rating_bias[movie_id]
        
        no_rating = movie_rating.isnull() # (A) 평가를 안한 사람
        common_counts = counts[user_id] # (B) user_id와 영화에 대한 평가 횟수가 겹치는 횟수가 적은 사람
        low_significant = common_counts < SIG_LEVEL # (B)
        none_rating_idx = movie_rating[no_rating | low_significant].index
        
        sim_scores = sim_scores.drop(none_rating_idx)
        movie_rating = movie_rating.drop(none_rating_idx)
        
        if neighbor_size==0:
            prediction = np.dot(sim_scores, movie_rating) / sim_scores.sum()
            prediction += rating_mean[user_id]
            
        else:
            if len(sim_scores) > MIN_RATINGS:
                neighbor_size = min(neighbor_size, len(sim_scores))
                
                sim_scores = np.array(sim_scores)
                movie_rating = np.array(movie_rating)
                
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_rating = movie_rating[user_idx][-neighbor_size:]
                
                prediction = np.dot(sim_scores, movie_rating)/ sim_scores.sum()
                prediction += rating_mean[user_id]
                
    return prediction

SIG_LEVEL=3
MIN_RATINGS=2
score(model=CF_knn_bias_sig, neighbor_size=30)

0.9445011467175631

In [55]:
#### (1)
# train set의 모든 가능한 아이템 pair으 cosine similarities 계산
from sklearn.metrics.pairwise import cosine_similarity
rating_matrix_t = np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity, index=rating_matrix_t.index, columns=rating_matrix_t.index)

# 주어진 영화(movie_id)의 가중평균 rating을 계산하는 함수,
# 가중치는 주어진 아이템과 다른 아이템 간의 유사도(item_similarity)
def CF_IBCF(user_id, movie_id, neighbor_size=0):
    mean_rating=3.0
    if movie_id in item_similarity:
        sim_scores = item_similarity[movie_id]
        user_rating = rating_matrix_t[user_id]
        none_rating_idx = user_rating[user_rating.isnull()].index
        sim_scores = sim_scores.drop(none_rating_idx)
        user_rating = user_rating.drop(none_rating_idx)
        
        mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
        
    return mean_rating

score(CF_IBCF)

1.0121106154979118