In [3]:
import numpy as np
import pandas as pd

# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]


# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# 정확도(RMSE)를 계산하는 함수 
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

# train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')


In [4]:
#### (1)
# train set의 모든 가능한 사용자 pair의 cosine similarity 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=matrix_dummy.index, columns=matrix_dummy.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.099221,0.014639,0.048798,0.311128,0.339241,0.317174,0.223463,0.076384,0.318248,...,0.320056,0.064106,0.162864,0.140093,0.128687,0.091376,0.233804,0.080666,0.131728,0.275461
2,0.099221,1.000000,0.072084,0.089271,0.049304,0.159383,0.082399,0.048026,0.116692,0.145372,...,0.090924,0.257303,0.240647,0.247855,0.234932,0.181213,0.132742,0.114431,0.110844,0.061768
3,0.014639,0.072084,1.000000,0.218640,0.027279,0.049488,0.057884,0.020632,0.055340,0.052009,...,0.018575,0.036203,0.157879,0.007714,0.061137,0.018229,0.084984,0.058442,0.097113,0.015142
4,0.048798,0.089271,0.218640,1.000000,0.041365,0.078181,0.050937,0.141229,0.072342,0.062758,...,0.069202,0.047325,0.129444,0.080669,0.037706,0.000000,0.148125,0.073341,0.120601,0.031670
5,0.311128,0.049304,0.027279,0.041365,1.000000,0.179026,0.284765,0.230547,0.065924,0.152184,...,0.334424,0.071062,0.078650,0.050122,0.111199,0.075019,0.167027,0.115505,0.127354,0.245291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.091376,0.181213,0.018229,0.000000,0.075019,0.076201,0.087869,0.122166,0.050062,0.079621,...,0.039487,0.420839,0.206577,0.130257,0.361905,1.000000,0.085421,0.183981,0.056005,0.115062
940,0.233804,0.132742,0.084984,0.148125,0.167027,0.240778,0.209494,0.235548,0.088168,0.239962,...,0.237182,0.124406,0.142043,0.123699,0.125368,0.085421,1.000000,0.155184,0.211415,0.184481
941,0.080666,0.114431,0.058442,0.073341,0.115505,0.107490,0.041764,0.091386,0.131606,0.103265,...,0.040402,0.073496,0.210989,0.164055,0.301881,0.183981,0.155184,1.000000,0.063370,0.046842
942,0.131728,0.110844,0.097113,0.120601,0.127354,0.197898,0.218997,0.148015,0.095010,0.166292,...,0.158050,0.027261,0.066194,0.063506,0.027386,0.056005,0.211415,0.063370,1.000000,0.160388


In [21]:
# (1)
#### train set의 모든 가능한 사용자의 pearson correlation 계산
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = matrix_dummy.T.corr('pearson')
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.085438,0.008552,0.015317,0.205262,0.207627,0.226455,0.195752,0.052019,0.216161,...,0.222342,0.066156,0.117450,0.093248,0.097698,0.072094,0.193578,0.106659,0.082423,0.235284
2,0.085438,1.000000,0.095160,0.122315,0.057488,0.166793,0.014797,-0.004163,0.068438,0.094330,...,0.070765,0.198200,0.183293,0.380873,0.271668,0.129159,0.128165,0.103423,0.168759,0.049139
3,0.008552,0.095160,1.000000,0.271831,-0.010201,0.019844,-0.011359,0.027293,-0.014677,-0.005842,...,-0.022266,0.037625,0.106183,0.060356,0.030168,0.012406,0.105487,0.071047,0.104022,-0.004335
4,0.015317,0.122315,0.271831,1.000000,-0.013515,0.011242,0.019018,0.126386,-0.010253,0.009860,...,-0.029180,0.036872,0.054241,0.150097,0.098068,0.025169,0.021073,0.054724,0.146689,-0.011704
5,0.205262,0.057488,-0.010201,-0.013515,1.000000,0.133215,0.227989,0.155140,0.050315,0.092977,...,0.166123,0.048338,0.021246,0.032450,0.097004,0.049598,0.170844,0.125120,0.038275,0.209527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.072094,0.129159,0.012406,0.025169,0.049598,0.022684,0.019895,0.076445,0.036826,0.006204,...,0.035626,0.349304,0.154651,0.133114,0.313280,1.000000,0.039718,0.097360,0.012225,0.110765
940,0.193578,0.128165,0.105487,0.021073,0.170844,0.219811,0.200600,0.140984,0.086534,0.212910,...,0.187102,0.098487,0.055839,0.155941,0.126465,0.039718,1.000000,0.138240,0.147198,0.134864
941,0.106659,0.103423,0.071047,0.054724,0.125120,0.117576,0.020239,0.107999,0.043164,0.015783,...,0.023848,0.184845,0.159017,0.125868,0.199616,0.097360,0.138240,1.000000,0.053514,0.043692
942,0.082423,0.168759,0.104022,0.146689,0.038275,0.212451,0.161691,0.088362,0.055905,0.114270,...,0.115253,0.052122,-0.005885,0.122276,0.064565,0.012225,0.147198,0.053514,1.000000,0.073077


In [22]:
# 주어진 영화 (movie_id) 가중평균 rating을 계산하는 함수
# 가중치는 주어진 사용자와 다른 사용자 간의 유사도(user_similarity)
def CF_simple(user_id, movie_id):
    mean_rating = 3.0
    if movie_id in rating_matrix:
        sim_score = user_similarity[user_id]
        movie_ratings = rating_matrix[movie_id]
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        sim_score = sim_score.drop(none_rating_idx)
        movie_ratings = movie_ratings.drop(none_rating_idx)
        
        mean_rating = np.dot(sim_score, movie_ratings)/sim_score.sum()
    return mean_rating

score(CF_simple)
        
        

1.0530191104411932

In [6]:
# Neighbor size를 정해서 예측치를 계산하는 함수
def cf_knn(user_id, movie_id, neighbor_size=0):
    prediction = 3.0
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        sim_scores = sim_scores.drop(none_rating_idx)
        movie_ratings = movie_ratings.drop(none_rating_idx)
        
        #### (2) neighbor_size가 지정되지 않은 경우(모든 사용자)
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        
        #### (3) neighbor_size가 지정된 경우
        else:
            if len(sim_scores)>1:
                neighbor_size = min(len(sim_scores), neighbor_size)
                
                # pd.Series -> np.array
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                
                # Sort the user similarity with user_id
                user_idx = np.argsort(sim_scores)
                
                # find the neighbors
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                
                # 특정 user의 movie에 대한 평가를 예측
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                
    return prediction
    
# 정확도 계산
print(score(cf_knn, 30))
    

1.002398745677709


In [7]:
#### (4) 주어진 사용자에 대해 추천을 받기
# 전체 데이터로 full_matrix와 cosine similarity 구하기
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

# 특정 이용자에게 neighbor_size를 잡아 n_items개의 영화를 추천
def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_matrix.loc[user_id, :].copy()
    for movie in rating_matrix:
        # 이미 평가한 항목은 추천 대상이 아님
        if pd.notnull(user_movie.loc[movie]):    
            user_movie.loc[movie] = 0
        else:
            user_movie.loc[movie] = cf_knn(user_id, movie, neighbor_size)
            
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=2, n_items=5, neighbor_size=30)

movie_id
1175          Welcome To Sarajevo (1997)
1189             That Old Feeling (1997)
1467                    Cure, The (1995)
1293    Ayn Rand: A Sense of Life (1997)
1594                     Shopping (1994)
Name: title, dtype: object

In [31]:
#### (5) 최적의 neighbor size 구하기
# train set으로 full matrix와 cosine similarity 구하기
from sklearn.metrics.pairwise import cosine_similarity
rating_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

for neighbor_size in range(10, 61, 10):
    print('Neighbor size=%d : RMSE = %.4f' %(neighbor_size,score(model=cf_knn, neighbor_size=neighbor_size)))

Neighbor size=10 : RMSE = 1.0310
Neighbor size=20 : RMSE = 1.0163
Neighbor size=30 : RMSE = 1.0133
Neighbor size=40 : RMSE = 1.0135
Neighbor size=50 : RMSE = 1.0140
Neighbor size=60 : RMSE = 1.0144


In [8]:
#### (1)
# 모든 user의 rating 평균과 영화의 평점편차 계산
rating_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')
rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T
rating_bias

def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    prediction = rating_mean[user_id]
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_rating = rating_bias[movie_id].copy()
        none_rating_idx = movie_rating[movie_rating.isnull()].index
        sim_scores = sim_scores.drop(none_rating_idx)
        movie_rating = movie_rating.drop(none_rating_idx)
        
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_rating) / sim_scores.sum()
            prediction += rating_mean[user_id]
            
        else:
            if len(sim_scores)>1:
                neighbor_size = min(len(sim_scores), neighbor_size)
                
                sim_scores = np.array(sim_scores)
                movie_rating = np.array(movie_rating)
                
                user_idx = np.argsort(sim_scores)
                
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_rating = movie_rating[user_idx][-neighbor_size:]
                
                prediction = np.dot(sim_scores, movie_rating) / sim_scores.sum()
                prediction += rating_mean[user_id]
                
    return prediction

print(score(CF_knn_bias, neighbor_size=30))

0.9327927219186831


In [9]:
def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_bias.loc[user_id].copy()
    for movie in rating_bias:
        if pd.notnull(user_movie[movie]):
            user_movie[movie] = 0
        else:
            user_movie[movie] = CF_knn_bias(user_id, movie, neighbor_size)
            
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=2, n_items=5, neighbor_size=30)

movie_id
1467                    Cure, The (1995)
851             Bloody Child, The (1996)
1405        When Night Is Falling (1995)
1293    Ayn Rand: A Sense of Life (1997)
1642                   Angel Baby (1995)
Name: title, dtype: object

In [10]:
#### (1)
# 사용자별 공통 평가 수 계산
rating_binary1 = np.array((rating_matrix>0).astype(float))
rating_binary2 = rating_binary1.T
counts = np.dot(rating_binary1, rating_binary2)
counts = pd.DataFrame(counts, index=rating_matrix.index, columns=rating_matrix.index).fillna(0)
counts

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,204.0,8.0,4.0,4.0,47.0,56.0,77.0,17.0,4.0,49.0,...,49.0,6.0,22.0,12.0,20.0,9.0,28.0,5.0,15.0,45.0
2,8.0,47.0,4.0,3.0,3.0,15.0,10.0,3.0,3.0,12.0,...,7.0,9.0,18.0,9.0,14.0,7.0,8.0,4.0,6.0,5.0
3,4.0,4.0,40.0,6.0,1.0,5.0,8.0,2.0,1.0,3.0,...,1.0,1.0,11.0,1.0,4.0,1.0,7.0,1.0,4.0,1.0
4,4.0,3.0,6.0,18.0,2.0,5.0,5.0,4.0,1.0,3.0,...,3.0,1.0,6.0,2.0,2.0,0.0,6.0,2.0,4.0,2.0
5,47.0,3.0,1.0,2.0,131.0,25.0,59.0,15.0,3.0,20.0,...,41.0,4.0,9.0,3.0,10.0,5.0,14.0,4.0,10.0,36.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,9.0,7.0,1.0,0.0,5.0,7.0,10.0,4.0,1.0,5.0,...,3.0,13.0,13.0,4.0,18.0,37.0,4.0,5.0,3.0,7.0
940,28.0,8.0,7.0,6.0,14.0,27.0,32.0,12.0,3.0,27.0,...,26.0,5.0,16.0,7.0,9.0,4.0,80.0,6.0,14.0,15.0
941,5.0,4.0,1.0,2.0,4.0,5.0,3.0,3.0,2.0,5.0,...,2.0,2.0,9.0,4.0,10.0,5.0,6.0,17.0,2.0,2.0
942,15.0,6.0,4.0,4.0,10.0,19.0,25.0,7.0,3.0,14.0,...,14.0,1.0,7.0,2.0,2.0,3.0,14.0,2.0,59.0,12.0


In [11]:
def CF_knn_bias_sig(user_id, movie_id, neighbor_size=0):
    prediction = rating_mean[user_id]
    if movie_id in rating_bias:
        sim_scores = user_similarity[user_id]
        movie_rating = rating_bias[movie_id]
        
        no_rating = movie_rating.isnull() # (A) 평가를 안한 사람
        common_counts = counts[user_id] # (B) user_id와 영화에 대한 평가 횟수가 겹치는 횟수가 적은 사람
        low_significant = common_counts < SIG_LEVEL # (B)
        none_rating_idx = movie_rating[no_rating | low_significant].index
        
        sim_scores = sim_scores.drop(none_rating_idx)
        movie_rating = movie_rating.drop(none_rating_idx)
        
        if neighbor_size==0:
            prediction = np.dot(sim_scores, movie_rating) / sim_scores.sum()
            prediction += rating_mean[user_id]
            
        else:
            if len(sim_scores) > MIN_RATINGS:
                neighbor_size = min(neighbor_size, len(sim_scores))
                
                sim_scores = np.array(sim_scores)
                movie_rating = np.array(movie_rating)
                
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_rating = movie_rating[user_idx][-neighbor_size:]
                
                prediction = np.dot(sim_scores, movie_rating)/ sim_scores.sum()
                prediction += rating_mean[user_id]
                
    if prediction > 5:
        prediction=5
    if prediction < 1:
        prediction=1
    return prediction

SIG_LEVEL=3
MIN_RATINGS=2
score(model=CF_knn_bias_sig, neighbor_size=30)

0.93097515588679

In [12]:
#### (1)
# train set의 모든 가능한 아이템 pair으 cosine similarities 계산
from sklearn.metrics.pairwise import cosine_similarity
rating_matrix_t = np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity, index=rating_matrix_t.index, columns=rating_matrix_t.index)

# 주어진 영화(movie_id)의 가중평균 rating을 계산하는 함수,
# 가중치는 주어진 아이템과 다른 아이템 간의 유사도(item_similarity)
def CF_IBCF(user_id, movie_id, neighbor_size=0):
    mean_rating=3.0
    if movie_id in item_similarity:
        sim_scores = item_similarity[movie_id]
        user_rating = rating_matrix_t[user_id]
        none_rating_idx = user_rating[user_rating.isnull()].index
        sim_scores = sim_scores.drop(none_rating_idx)
        user_rating = user_rating.drop(none_rating_idx)
        
        mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
        
    return mean_rating

score(CF_IBCF)

1.0031921377941806