In [1]:
import random

import numpy as np
import pandas as pd
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
%pip install ml_metrics
from ml_metrics import mapk
from scipy.sparse import csr_matrix
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
%pip install implicit
from implicit.als import AlternatingLeastSquares

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
print(torch.__version__)

%pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.11.0+cu113.html
from torch_geometric.data import Data
from torch_geometric.nn.models import LightGCN

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ml_metrics
  Downloading ml_metrics-0.1.4.tar.gz (5.0 kB)
Building wheels for collected packages: ml-metrics
  Building wheel for ml-metrics (setup.py) ... [?25l[?25hdone
  Created wheel for ml-metrics: filename=ml_metrics-0.1.4-py3-none-any.whl size=7845 sha256=283dd497a7443d0042c7bc5dee073af80d2bf601e63985c828f0a5b146e6502b
  Stored in directory: /root/.cache/pip/wheels/56/41/5b/0c6d42b3604a5c823d8922564c4708f84962fa7f2f4facfa6d
Successfully built ml-metrics
Installing collected packages: ml-metrics
Successfully installed ml-metrics-0.1.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.5.2-cp37-cp37m-manylinux2014_x86_64.whl (18.5 MB)
[K     |████████████████████████████████| 18.5 MB 2.0 MB/s 
Installing collected packages: implicit
Successfully installed implicit-0.5.2


  f"CUDA extension is built, but disabling GPU support because of '{e}'",


1.11.0+cu113
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.11.0+cu113.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 3.0 MB/s 
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_sparse-0.6.13-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 35.1 MB/s 
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_cluster-1.6.0-cp37-cp37m-linux_x86_64.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 57.6 MB/s 
[?25hCollecting torch-spline-conv
  Downloading https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_spline_conv-1.2.1-cp37-cp37m-linux_x86_64.whl (750 kB)
[K     |████████████████████████████████| 750

In [2]:
def preprocess (df_for_cb, df_for_cf):
    df_for_cb.drop(columns=['res_name'], inplace=True)

    # address 정보 - one hot encoding
    address_one_hot = pd.get_dummies(df_for_cb['addr'])
    df_for_cb.drop(columns=['addr'], inplace=True)
    df_for_cb = pd.concat([df_for_cb, address_one_hot], axis=1)

    # category, main_menu 정보는 어떻게 벡터화할 지 나중에 생각 - 일단 삭제 
    df_for_cb.drop(columns=['category'], inplace=True)

    # 각 attribute별 scale 차이가 너무 심함 - normalize
    scaler = MinMaxScaler()
    data_for_cb = scaler.fit_transform(df_for_cb)
    
    ######################################################
    
    user_list = list(set(df_for_cf['user_name'].tolist()))
    item_list = list(set(df_for_cf['res_name'].tolist()))

    user2idx = {k: v for v, k in enumerate(user_list)}
    item2idx = {k: v for v, k in enumerate(item_list)}

    idx2user = {k: v for k, v in enumerate(user_list)}
    idx2item = {k: v for k, v in enumerate(item_list)}

    # rating scale 조정: 1 ~ 5 -> -1 ~ 1
    # data = df_for_cf["rating"].apply(lambda x: (float(x[0]) - 3) / 2).tolist()
    data = df_for_cf["rating"].apply(lambda x: 1.0 if x == "5점" else 0.0).tolist()
    row = df_for_cf["user_name"].apply(lambda x: user2idx.get(x))
    col = df_for_cf["res_name"].apply(lambda x: item2idx.get(x))
    data_for_cf = csr_matrix((data, (row, col)))

    return data_for_cb, data_for_cf

In [3]:
def train_test_split(data_for_cf, test_size):
    data_splitter = np.random.binomial(n=1, p=1-test_size, size=data_for_cf.shape)
    train_data_for_cf = np.multiply(data_for_cf, data_splitter)
    test_data_for_cf = np.multiply(data_for_cf, data_splitter^1)

    return csr_matrix(train_data_for_cf), csr_matrix(test_data_for_cf)

In [4]:
class ContentBasedFiltering:
    def __init__(self, item_feature, user_item_matrix):
        self.item_feature = item_feature
        self.ui_matrix = np.array(user_item_matrix.todense())
        self.similarity_matrix = cosine_similarity(self.item_feature)
        
    def recommend(self, n_recommendations):
        # User가 Positive Feedback을 표시한 아이템과 유사한 아이템 추천 (Feedback이 없는 경우 - Random 추천)
        n_users = self.ui_matrix.shape[0]
        n_items = self.ui_matrix.shape[1]
        recommendations = []
        for i in range(n_users):
            user_feedback = np.argwhere(self.ui_matrix[i] > 0).flatten().tolist()
            
            if len(user_feedback) == 0:
                recommendations.append([random.randint(0, n_items) for _ in range(n_recommendations)])
                continue
            similarities = np.mean((self.ui_matrix[i] > 0) * self.similarity_matrix, axis=1)
            
            recommendation = []
            recommendation_candidates = np.argsort(similarities)
            idx = 0
            while len(recommendation) < n_recommendations:
                candidate = recommendation_candidates[idx]
                if (candidate not in user_feedback):
                    recommendation.append(candidate)
                idx += 1
            recommendations.append(recommendation)
        return recommendations

In [5]:
class CollaborativeFiltering:
    def __init__(self, user_item_matrix):
        self.ui_matrix = user_item_matrix
        self.model = AlternatingLeastSquares(factors=10)
        self.model.fit(self.ui_matrix)
        
    def recommend(self, n_recommendations):
        n_users = self.ui_matrix.shape[0]
        recommendations = []
        for i in range(n_users):
            recommendations.append(self.model.recommend(i, self.ui_matrix[i])[0][:n_recommendations].tolist())
        return recommendations

In [6]:
def negative_sampling (matrix, pos_edge_index):
    matrix = matrix.todense()
    n_users = matrix.shape[0]
    n_items = matrix.shape[1]

    edge_index = []
    for user in pos_edge_index[0]:
        item = random.randint(n_users, n_users + n_items - 1)
        while (matrix[user, item - n_users] == 1.0):
            item = random.randint(n_users, n_users + n_items - 1)
        edge_index.append((user, item))
    return torch.LongTensor(edge_index).T

class GraphBasedRecommendation:
    def __init__(self, item_feature, user_item_matrix):
        self.item_feature = item_feature
        self.ui_matrix = user_item_matrix
        self.lightgcn = LightGCN(self.ui_matrix.shape[0] + self.ui_matrix.shape[1], embedding_dim=item_feature.shape[1], num_layers=3)
        self.loss_list = []

    def recommend(self, n_recommendations):
        n_users = self.ui_matrix.shape[0]
        n_items = self.ui_matrix.shape[1]
        # Graph 초기 임베딩으로 Content-Based Filtering에서 사용하던 Feature 사용
        # with torch.no_grad():
            # self.lightgcn.embedding.weight[n_users:] = torch.FloatTensor(self.item_feature)

        # User-Item Matrix를 가지고 User, Item 임베딩 학습
        row, col = self.ui_matrix.nonzero()
        pos_edge_index = []
        pos_edge_index.append(row)
        pos_edge_index.append([i + n_users for i in col])
        pos_edge_index = torch.LongTensor(pos_edge_index)

        optimizer = optim.Adam(self.lightgcn.parameters(), lr=1e-3)
        for _ in tqdm(range(1000)):
            pos_rank = self.lightgcn(pos_edge_index)
            neg_edge_index = negative_sampling(self.ui_matrix, pos_edge_index)
            neg_rank = self.lightgcn(neg_edge_index)

            # print(pos_edge_index[:10])
            # print(neg_edge_index[:10])
            # print(kyle)

            optimizer.zero_grad()
            loss = self.lightgcn.recommendation_loss(pos_rank, neg_rank)
            loss.backward()
            optimizer.step()
            self.loss_list.append(loss.item())

        src_index = torch.from_numpy(np.arange(n_users))
        dst_index = torch.from_numpy(np.arange(n_users, n_users + n_items))
        recommendations = (self.lightgcn.recommend(pos_edge_index, src_index, dst_index, k=n_recommendations) - n_users).tolist()
        return recommendations

In [7]:
def evaluate(test_data, recommend, method):
    actual_list = []
    predict_list = []
    for i in range(len(recommend)):
        actual = np.argwhere(np.array(test_data.todense())[i] == 1.0).flatten().tolist()
        if len(actual) == 0:
            continue
        else:
            actual_list.append(actual)
            predict_list.append(recommend[i])

    if method == "mapk":
        return mapk(actual_list, predict_list, k=10)
    
    if method == "ndcg":
        return ndcg(actual_list, predict_list, k=10)

def ndcg(actual_list, predict_list, k):
    ndcg_list = []
    for i in range(len(predict_list)):
        dcg = 0  
        for j, pred in enumerate(predict_list[i]):
            if pred in actual_list[i]:
                dcg += (1) / math.log2(j + 1 + 1)
        idcg = 0
        for j in range(k):
            idcg += (1) / math.log2(j + 1 + 1)
        ndcg = dcg / idcg
        ndcg_list.append(ndcg)

    return sum(ndcg_list) / len(ndcg_list)

        

In [9]:
random.seed(42)
np.random.seed(42)
torch.random.seed(42)
# 1. Get Raw Data
df_for_cb = pd.read_csv('./Kakaomap_CB_final.csv', encoding="cp949")
df_for_cf = pd.read_csv('./Kakaomap_CF_final.csv', encoding="cp949")

# 2. Preprocessing Data
data_for_cb, data_for_cf = preprocess(df_for_cb, df_for_cf)

# 3. Train-Test Data Split
train_data_for_cf, test_data_for_cf = train_test_split(data_for_cf.todense(), test_size=0.2)
print(data_for_cb.shape, data_for_cf.shape)
# 4. Build Models
content_based = ContentBasedFiltering(data_for_cb, train_data_for_cf)
collaborative = CollaborativeFiltering(train_data_for_cf)
graph_based = GraphBasedRecommendation(data_for_cb, train_data_for_cf)

# 5. Recommendation
cb_recommend = content_based.recommend(n_recommendations=10)
cf_recommend = collaborative.recommend(n_recommendations=10)
gb_recommend = graph_based.recommend(n_recommendations=10)

# 6. Evaluation
cb_mapk = evaluate(test_data_for_cf, cb_recommend, method='mapk')
cf_mapk = evaluate(test_data_for_cf, cf_recommend, method="mapk")
gb_mapk = evaluate(test_data_for_cf, gb_recommend, method="mapk")
print(f"MAP@K of Content-Based Filtering: \t {cb_mapk}")
print(f"MAP@K of Collaborative Filtering: \t {cf_mapk}")
print(f"MAP@K of Graph-Based Recommendation: \t {gb_mapk}")

(485, 7) (2317, 485)


  "OpenBLAS detected. Its highly recommend to set the environment variable "


  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:35<00:00, 28.06it/s]


MAP@K of Content-Based Filtering: 	 0.011843209265219663
MAP@K of Collaborative Filtering: 	 0.012292188202066885
MAP@K of Graph-Based Recommendation: 	 0.012047123875546751


In [10]:
# 6. Evaluation - ndcg
cb_ndcg = evaluate(test_data_for_cf, cb_recommend, method='ndcg')
cf_ndcg = evaluate(test_data_for_cf, cf_recommend, method="ndcg")
gb_ndcg = evaluate(test_data_for_cf, gb_recommend, method="ndcg")
print(f"NDCG of Content-Based Filtering: \t {cb_ndcg}")
print(f"NDCG of Collaborative Filtering: \t {cf_ndcg}")
print(f"NDCG of Graph-Based Recommendation: \t {gb_ndcg}")

NDCG of Content-Based Filtering: 	 0.0039052568700199426
NDCG of Collaborative Filtering: 	 0.004361909260586901
NDCG of Graph-Based Recommendation: 	 0.004199696966448696


In [None]:
plt.plot(graph_based.loss_list)

In [13]:
class RandomRecommendation:
    def __init__(self, user_item_matrix):
        self.ui_matrix = user_item_matrix
        
    def recommend(self, n_recommendations):
        n_users = self.ui_matrix.shape[0]
        n_items = self.ui_matrix.shape[1]
        recommendations = []
        for i in range(n_users):
            recommendations.append([random.randint(0, n_items-1) for _ in range(n_recommendations)])
        return recommendations

In [14]:
rd = RandomRecommendation(train_data_for_cf)
rd_recommend = rd.recommend(n_recommendations=10)
rd_mapk = evaluate(test_data_for_cf, rd_recommend, method="mapk")
print(f"MAP@K of Random Recommendation: \t {rd_mapk}")
rd_ndcg = evaluate(test_data_for_cf, rd_recommend, method="ndcg")
print(f"NDCG of Random Recommendation: \t {rd_ndcg}")

MAP@K of Random Recommendation: 	 0.0037751368600588706
NDCG of Random Recommendation: 	 0.001892792547772481


In [44]:
# Qualitive Analysis
for user, user_feedback in enumerate(test_data_for_cf):
    item_list = []
    for itemid, feedback in enumerate(user_feedback):
        if feedback == 1:
            item_list.append(itemid)
            if len(item_list) >= 10 :
                print(user, item_list)
                break

1392 [2, 30, 50, 69, 121, 170, 195, 339, 351, 471]


In [27]:
user_list = list(set(df_for_cf['user_name'].tolist()))
item_list = list(set(df_for_cf['res_name'].tolist()))

user2idx = {k: v for v, k in enumerate(user_list)}
item2idx = {k: v for v, k in enumerate(item_list)}

idx2user = {k: v for k, v in enumerate(user_list)}
idx2item = {k: v for k, v in enumerate(item_list)}

In [45]:
print("What User Really Likes")
for itemid in [2, 30, 50, 69, 121, 170, 195, 339, 351, 471]:
    print(idx2item[itemid])
print("---------------------------------")
print("Recommendation of CB Algorithm")
for itemid in cb_recommend[1392]:
    print(idx2item[itemid])
print("---------------------------------")
print("Recommendation of CF Algorithm")
for itemid in cf_recommend[1392]:
    print(idx2item[itemid])
print("---------------------------------")
print("Recommendation of Graph-Based Algorithm")
for itemid in gb_recommend[1392]:
    print(idx2item[itemid])

What User Really Likes
파운드
무이
그린베이커리
동방명주 동천홍2호점
예담추어정
설천순대국밥 유성직영점
수정삼겹살
성심당 DCC점
카페시은우
무공돈까스 대전둔산점
---------------------------------
Recommendation of CB Algorithm
화이트무스
텀즈업브로
착한참치 본점
연탄구이
대선칼국수
놀부네집
맥도날드 대전유성DT점
수통골장수오리
드르쿰다 나인스테이
설해돈 둔산본점
---------------------------------
Recommendation of CF Algorithm
복수분식 본점 (대흥동)
구들마루
개천식당
복사꽃피는집 대전점
스바라시라멘 본점
수통골감나무집 본점
화이트무스
수통골장수오리
하레하레 도안점
더함뜰
---------------------------------
Recommendation of Graph-Based Algorithm
오씨칼국수 도룡점
디블루메
도레미아구찜
성심당 대전역점
사리원 본점
오씨칼국수 (원동)
구름식당
우사미 대전본점
꽁뚜식당
임프레션커피컴퍼니
