In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import heapq
import math

In [2]:
import os
import random
import tensorflow as tf

def seed_everything(seed:int=42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)

seed_everything(42)

# 데이터 로드

## user 데이터

In [3]:
user_column_names =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
user_df = pd.read_csv('../data/ml-100k/u.user', sep='|', names=user_column_names)
print('user_df shape: ',user_df.shape)
user_df.head()


user_df shape:  (943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
user_df = user_df.drop(columns = ['zip_code'])
user_df.head()

Unnamed: 0,user_id,age,sex,occupation
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other


## item 데이터

In [5]:
item_column_names = ['movie_id', 'movie_title' ,'release_date','video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

item_df = pd.read_csv('../data/ml-100k/u.item', sep='|', names=item_column_names, encoding='latin-1')
print('item_df shape: ', item_df.shape)
item_df.head()

item_df shape:  (1682, 24)


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
drop_columns_now = ['release_date','video_release_date','IMDb_URL']
item_df = item_df.drop(columns = drop_columns_now)
item_df.head()

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


## Score 데이터

In [3]:
score_df = pd.read_csv('../data/ml-100k/u.data', sep="\t", header=None)
score_df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
print('score_df shape: ',score_df.shape)
score_df.head()

score_df shape:  (100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [22]:
score_df = score_df.sort_values(by=['user_id','timestamp'])

In [4]:
print('The number of user : ', len(score_df['user_id'].unique()))
print('The number of itme : ', len(score_df['item_id'].unique()))

The number of user :  943
The number of itme :  1682


In [5]:
943*1682

1586126

# Data Preprocessing - Ongoing
* Check users whether they have at least 20 interactions
* Create interaction column (if rating exists, then its interaction is 1)
* split train/test dataset for positive data (In this case, score_df is the positive data) according to leave-one-out
* Add negative data to test dataset (positive 1 : negative 99)

## Check users whether they have at least 20 interactions

In [6]:
check_interactions = score_df.groupby('user_id').count()
print(check_interactions.shape)
print('The number of users who have less than 20 interactions : ',
      check_interactions[check_interactions['rating']< 20].shape[0])

(943, 3)
The number of users who have less than 20 interactions :  0


Thus, in Movielens 100k, all users have at least 20 interactions.

## user-item mapping dicitonary

In [19]:
# user-item mapping dictionary
user_item_map = {}
user_ids = score_df['user_id'].unique()
for uid in user_ids:
    iids = score_df[score_df['user_id']==uid]['item_id'].tolist()
    iids.sort()
    user_item_map[uid] = iids

## Create interaction column (if rating exists, then its interaction is 1) 
If rating exists, then its interaction is 1. Thus, in score_df, all interaction is 1

In [12]:
score_df['interaction'] = 1
score_df['interaction'].value_counts()

1    100000
Name: interaction, dtype: int64

## Split positive train/test data according to leave-one-out

In [31]:
score_df = score_df.drop(['rating'], axis=1)
user_ids = score_df['user_id'].unique()
train_p = []
test_p = []

for uid in user_ids:
    uid_data = score_df[score_df['user_id']==uid][['user_id','item_id','interaction']].values
    train_p.extend(uid_data[:-1])
    test_p.append(uid_data[-1])

train_positive = pd.DataFrame(data = train_p, 
                              columns={'user_id': '', 'item_id': '','interaction': ''})
test_positive = pd.DataFrame(data = test_p, 
                             columns = {'user_id': '', 'item_id': '', 'interaction': ''})

In [33]:
print('train_positive shape: ', train_positive.shape)
train_positive.head()

train_positive shape:  (99057, 3)


Unnamed: 0,user_id,item_id,interaction
0,196,242,1
1,196,286,1
2,196,269,1
3,196,306,1
4,196,340,1


In [34]:
print('test_positive shape: ', test_positive.shape)
test_positive.head()

test_positive shape:  (943, 3)


Unnamed: 0,user_id,item_id,interaction
0,196,110,1
1,186,98,1
2,22,250,1
3,244,926,1
4,166,894,1


## Add negative data to test dataset (positive 1 : negative 99)

In [None]:
# TODO: use user_item map, binary search. 


# 모델 구조
reference : 

https://github.com/LeeHyeJin91/Neural_CF

https://github.com/hexiangnan/neural_collaborative_filtering


In [None]:
import tensorflow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *

## GMF

In [None]:
# GMF - by Sunny
from tensorflow.keras.layers import *
def gmf_model(user_num,item_num):
    # input
    user_input = Input(1,)
    item_input = Input(1,)

    # embedding 
    user_embedding = Embedding(user_num, 32, name = 'user_embedding')
    item_embedding = Embedding(item_num, 32, name = 'item_embedding')

    # flatten
    user_flatten = Flatten(user_embedding)
    item_flatten = Flatten(item_embedding)

    # Merge
    x = Multiply([user_flatten,item_flatten])

    # output
    pred = Dense(1, activation = 'sigmoid',kernel_initializer='lecun_uniform', name='output_layer')(x)

    model = Model(input = [user_input,item_input], output=pred)
    model.compile(optimizer='adam', loss='binary_crossentropy')

## MLP

In [None]:
# MLP - by Sunny
def mlp_model(user_num,item_num):
    # input
    user_input = Input(1,)
    item_input = Input(1,)

    # embedding
    user_embedding = Embedding(user_num, 32, name = 'user_embedding')
    item_embedding = Embedding(item_num, 32, name = 'item_embedding')

    # flatten
    user_flatten = Flatten(user_embedding)
    item_flatten = Flatten(item_embedding)
    
    # Merge
    x = Concatenate([user_flatten, item_flatten])
    
    # Multi layers
    x = Dense(64, kernel_regularizer ='l2', activation = 'relu', name = 'layer1')(x)
    x = Dense(32, kernel_regularizer ='l2', activation = 'relu', name = 'layer2')(x)
    x = Dense(16, kernel_regularizer ='l2', activation = 'relu', name = 'layer3')(x)

    # output layer
    pred = Dense(1, activation = 'sigmoid',kernel_initializer='lecun_uniform', name='output_layer')(x)
    #lecun_uniform : [-limit, limit] 범위내 균등분포에 따라 샘플 생성

    model = Model(input = [user_input,item_input], output=pred)
    model.compile(optimizer='adam', loss='binary_crossentropy')

## NeuMF

In [None]:
def neumf_model(user_num,item_num):
    user_input = Input(1,)
    item_input = Input(1,)

    # gmf embedding
    gmf_user_embedding = Embedding(user_num, 32, name = 'gmf_user_embedding')
    gmf_item_embedding = Embedding(item_num, 32, name = 'gmf_item_embedding')

    # gmf flatten
    gmf_user_flatten = Flatten(gmf_user_embedding)
    gmf_item_flatten = Flatten(gmf_item_embedding)

    # gmf merge
    gmf_merge = Multiply([gmf_user_flatten,gmf_item_flatten])

    # mlp embedding
    mlp_user_embedding = Embedding(user_num, 32, name = 'mlp_user_embedding')
    mlp_item_embedding = Embedding(item_num, 32, name = 'mlp_item_embedding')

    # mlp flatten
    mlp_user_flatten = Flatten(mlp_user_embedding)
    mlp_item_flatten = Flatten(mlp_item_embedding)

    # mlp merge
    mlp_merge = concatenate([mlp_user_flatten, mlp_item_flatten])

    # mlp layers
    x = Dense(64, kernel_regularizer ='l2', activation = 'relu', name = 'mlp_layer1')(mlp_merge)
    x = Dense(32, kernel_regularizer ='l2', activation = 'relu', name = 'mlp_layer2')(x)
    x = Dense(16, kernel_regularizer ='l2', activation = 'relu', name = 'mlp_layer3')(x)

    # MLP + GMF
    mlp_gmf = concatenate([x, gmf_merge])

    # output layer
    output_layer = Dense(1, activation='sigmoid', kernel_initializer = 'lecun_uniform', name=output_layer)(mlp_gmf)

    # model
    model = Model([user_input,item_input], output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy')


# 평가 지표

In [None]:
# evaluation by sunny
def get_hr(rank_list,target):
    for item in rank_list:
        if item == target:
            return 1
    return 0

def get_ndcg(rank_list,target):
    for i in range(len(rank_list)):
        item = rank_list[i]
        if item == target:
            return math.log(2) / math.log(i+2)


In [None]:
import numpy as np
import heapq

class Metrix:
    def __init__(self):
        pass
    
    def get_hits(self, k_ranked, holdout):
        for item in k_ranked:
            if item == holdout:
                return 1
        return 0
    
    def eval_rating(self, idx, test_ratings, test_negatives, K, model):
        # holdout(df_Test의 item)이 k순위 내에 있는지 평가하는 함수
        items = test_negatives[idx]
        user_idx = test_ratings[idx][0]
        holdout = test_ratings[idx][1]
        items.append(holdout)
        
        # prediction
        predict_user = np.full(len(items), user_idx, dtype='int32').reshape(-1,1)
        np_items = np.array(items).reshape(-1,1)
        
        predictions = model.predict([predict_user, np_items])
        predictions = predictions.flatten().tolist()
        item_to_pre_score = {item:pre for item, pre in zip(items, predictions)}
        
        # 점수가 높은 상위 k개 아이템 리스트
        k_ranked = heapq.nlargest(K,item_to_pre_score, key=item_to_pre_score.get)
        
        # holdout이 상위 K순위에 포함되는지
        # {1: 포함, 0: 포함x}
        hits = self.get_hits(k_ranked, holdout)
        
        return hits
    
    def evaluate_top_k(self, df_neg, df_test, model, K=10):
        # Top- K metric을 사용해 모델 평가하는 함수
        hits = []
        test_u = df_test['user_id'].values.tolist()
        test_i = df_test['item_id'].values.tolist()
        
        test_ratings = list(zip(test_u, test_i))
        df_neg = df_neg.drop(df_neg.columns[0], axis=1)
        test_negatives = df_neg.values.tolist()
        
        # user 샘플링
        sample_idx_lst = np.random.choice(len(test_ratings), int(len(test_ratings)))
        for user_idx in sample_idx_lst:
            
            hitrate = self.eval_rating(user_idx, test_ratings, test_negatives, K, model)
            hits.append(hitrate)
            
        return hits
        