In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

import warnings

# Suppress every warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Directory for saved model checkpoints and directory holding the input data
# (presumably a mounted Google Drive in Colab -- confirm for other runtimes).
MODEL_DIR = '/content/drive/MyDrive/투빅스 컨퍼런스/Model/'
DATA_DIR = '/content/drive/MyDrive/투빅스 컨퍼런스/Data/'

# 데이터 불러오기

In [None]:
import pickle

# Image + order + genre (text)
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open(DATA_DIR + 'norm_game2vec.pickle', mode='rb') as pickle_file:
  norm_game2vec = pickle.load(pickle_file)

In [None]:
# Read the full review table followed by its train / validation / test splits.
all_df, train_df, val_df, test_df = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        'steam_reviews_clean_result_v2.csv',
        'steam_reviews_clean_result_train_v2.csv',
        'steam_reviews_clean_result_val_v2.csv',
        'steam_reviews_clean_result_test_v2.csv',
    )
)

# 데이터 전처리

In [None]:
# Sanity check: the three splits together contain exactly as many rows as the full table.
len(all_df) == sum(len(split_df) for split_df in (train_df, val_df, test_df))

True

In [None]:
# Distinct user ids (as strings) in the full table and in the training split.
all_user_ids = set(all_df['user_id'].astype(str).unique())
train_user_ids = set(train_df['user_id'].astype(str).unique())

# Ids that occur in the training split but not in the full table
# (shown as the cell output; an empty set means there are none).
train_user_ids.difference(all_user_ids)

set()

In [None]:
# Fraction of rows per `recommended` value in the full table.
all_df['recommended'].value_counts().pipe(lambda counts: counts / counts.sum())

Recommended        0.762219
Not Recommended    0.237781
Name: recommended, dtype: float64

In [None]:
# Fraction of rows per `recommended` value in the training split.
train_df['recommended'].value_counts().pipe(lambda counts: counts / counts.sum())

Recommended        0.764251
Not Recommended    0.235749
Name: recommended, dtype: float64

In [None]:
# Fraction of rows per `recommended` value in the validation split.
val_df['recommended'].value_counts().pipe(lambda counts: counts / counts.sum())

Recommended        0.751932
Not Recommended    0.248068
Name: recommended, dtype: float64

In [None]:
# Fraction of rows per `recommended` value in the test split.
test_df['recommended'].value_counts().pipe(lambda counts: counts / counts.sum())

Recommended        0.763636
Not Recommended    0.236364
Name: recommended, dtype: float64

In [None]:
# Binary target column: 0 for 'Not Recommended', 1 for every other value.
# A single vectorised comparison replaces the per-row Python lambda, and the
# loop guarantees that all four tables are encoded by exactly the same rule.
for _split_df in (all_df, train_df, val_df, test_df):
  _split_df['label'] = (_split_df['recommended'] != 'Not Recommended').astype('int64')

In [None]:
# Preview the first five rows of the training split, including the new `label` column.
train_df.head(n=5)

Unnamed: 0,content,recommended,user_id,game_id,timestamp,play_time_minute,review_time_minute,game_content_link,review_helpful_count,label_encode_user_id,label_encode_game_id,idx,label
0,"Game is dead, installed, can play training. Bu...",Not Recommended,atgbui,728540,2021-05-19,18.0,18.0,https://store.steampowered.com/app/728540,0,19157,14662,6,0
1,Adjust settings- mouse disappears (invisible) ...,Not Recommended,atgbui,999270,2021-05-19,6.0,6.0,https://store.steampowered.com/app/999270,1,19157,18652,7,0
2,I wish there was a neutral review button.The g...,Not Recommended,atgbui,920470,2021-05-16,216.0,216.0,https://store.steampowered.com/app/920470,0,19157,17716,8,0
3,"The game brings something new to the genre, an...",Not Recommended,atgbui,1183940,2021-05-09,738.0,738.0,https://store.steampowered.com/app/1183940,0,19157,20641,9,0
4,Update: 06/11/2021: Game seems to be abandoned...,Not Recommended,atgbui,1254400,2021-05-09,438.0,438.0,https://store.steampowered.com/app/1254400,1,19157,21185,10,0


In [None]:
def _split_to_lists(split_df):
  """Return (user index list, game index list, float label list) for one split."""
  return (
      split_df['label_encode_user_id'].tolist(),
      split_df['label_encode_game_id'].tolist(),
      split_df['label'].astype(float).tolist(),
  )


# Plain Python lists per split; these are what the dataset class below consumes.
train_user_id_idx_li, train_game_id_idx_li, train_label_li = _split_to_lists(train_df)
val_user_id_idx_li, val_game_id_idx_li, val_label_li = _split_to_lists(val_df)
test_user_id_idx_li, test_game_id_idx_li, test_label_li = _split_to_lists(test_df)

In [None]:
# Convert the unpickled game vectors into a 32-bit float torch tensor; the
# training and evaluation loops index it with the batch of game indices.
# NOTE(review): assumes norm_game2vec is a 2-D numeric array whose row i is the
# vector of label-encoded game id i -- confirm against how the pickle was built.
torch_norm_game2vec = torch.FloatTensor(norm_game2vec)

# 모델

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class NCFData(Dataset):
    """Map-style dataset yielding ``(user, game, label)`` triples.

    The three input sequences are parallel: position ``i`` of each one
    describes the same sample.

    Args:
        user_id_idx_li: sequence of user indices.
        game_id_idx_li: sequence of game indices.
        label_li: sequence of labels.

    Raises:
        ValueError: if the three sequences do not have the same length.
    """

    def __init__(self, user_id_idx_li, game_id_idx_li, label_li):
        super().__init__()
        # Fail fast on misaligned inputs. __len__ only looks at the labels, so
        # a shorter/longer user or game list would otherwise either raise an
        # IndexError in the middle of an epoch or be silently truncated.
        if not (len(user_id_idx_li) == len(game_id_idx_li) == len(label_li)):
            raise ValueError(
                'user, game and label sequences must have the same length, got '
                f'{len(user_id_idx_li)}, {len(game_id_idx_li)} and {len(label_li)}'
            )
        self._user_id_idx_li = user_id_idx_li
        self._game_id_idx_li = game_id_idx_li
        self._label_li = label_li

    def __len__(self):
        """Return the number of samples."""
        return len(self._label_li)

    def __getitem__(self, idx):
        """Return the ``(user, game, label)`` triple stored at position ``idx``."""
        return (
            self._user_id_idx_li[idx],
            self._game_id_idx_li[idx],
            self._label_li[idx],
        )

In [None]:
# Second dimension of the game-vector table; passed to the model as `factor_num`.
factor_num = norm_game2vec.shape[1]
# Largest label-encoded user id plus one; passed to the model as `user_num`.
user_num = 1 + all_df['label_encode_user_id'].max()

In [None]:
# Environment setup: use the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


In [None]:
# Training hyper-parameters: samples per mini-batch and number of passes over the training set.
BATCH_SIZE, EPOCHS = 512, 30

In [None]:
class GMF(nn.Module):
  """Scores a (user, item) pair from a learned user vector and a given item vector.

  Only the user side is learned: the item vector is supplied by the caller on
  every forward pass and must have ``factor_num`` features.

  Args:
      user_num: number of rows in the user embedding table.
      factor_num: width of the user embedding (and of the item vectors).
  """

  def __init__(self, user_num, factor_num):
      super().__init__()

      # Learned per-user vector.
      self.user_embedding = nn.Embedding(num_embeddings=user_num, embedding_dim=factor_num)

      # Single linear unit followed by a sigmoid, so the output lies in (0, 1).
      self.FC_layer = nn.Sequential(
          nn.Linear(in_features=factor_num, out_features=1),
          nn.Sigmoid(),
      )
      self._init_weight_()

  def _init_weight_(self):
      """Initialise the embedding with N(0, 0.01) and linear weights with Xavier-uniform."""
      nn.init.normal_(self.user_embedding.weight, std=0.01)
      linear_layers = (layer for layer in self.FC_layer if isinstance(layer, nn.Linear))
      for layer in linear_layers:
          nn.init.xavier_uniform_(layer.weight)

  def forward(self, user_idx, item_embedding):
      """Return one score per sample as a flat 1-D tensor.

      Args:
          user_idx: integer tensor of user indices.
          item_embedding: tensor of item vectors, multiplied element-wise with
              the looked-up user vectors.
      """
      user_vec = self.user_embedding(user_idx)
      interaction = torch.mul(user_vec, item_embedding)
      return self.FC_layer(interaction).view(-1)

In [None]:
from sklearn.metrics import roc_auc_score

def roc_auc_compute_fn(y_pred, y_true):
  """Return the ROC AUC of the scores ``y_pred`` against the labels ``y_true``.

  Note the argument order: predictions first, ground truth second, which is
  the reverse of the underlying ``roc_auc_score`` call.
  """
  return roc_auc_score(y_true=y_true, y_score=y_pred)

from sklearn.metrics import f1_score

def f1_score_compute_fn(y_pred, y_true):
  """Return the F1 score of the hard predictions ``y_pred`` against ``y_true``.

  Note the argument order: predictions first, ground truth second, which is
  the reverse of the underlying ``f1_score`` call.
  """
  return f1_score(y_true=y_true, y_pred=y_pred)

def train(model, train_loader):
  """Run one training epoch and return its loss, accuracy, AUC and F1.

  Relies on the notebook-level globals ``DEVICE``, ``torch_norm_game2vec``,
  ``criterion`` and ``optimizer``.

  Args:
    model: module called as ``model(user, item_embedding)``; expected to
      return one score per sample that ``criterion`` accepts.
    train_loader: iterable of ``(user, game, label)`` batches that also
      exposes ``.dataset`` (its length is the accuracy denominator).

  Returns:
    Tuple ``(train_loss, train_accuracy, train_auc, train_f1)``: the mean of
    the per-batch losses, the accuracy in percent (0-dim CPU tensor), the ROC
    AUC of the raw outputs, and the F1 score of the outputs thresholded at 0.5.
  """
  model.train()
  train_loss = 0
  correct = 0

  auc_output_li = []  # raw model outputs, one array per batch
  acc_output_li = []  # outputs thresholded at 0.5, one array per batch
  label_li = []       # ground-truth labels, one array per batch

  for user, game, label in train_loader:
    user = user.to(DEVICE)
    # Look up the fixed item vectors for this batch, then move them to the device.
    item_embedding = torch_norm_game2vec[game].to(DEVICE)
    # .double() only changes the dtype. The former .type(torch.DoubleTensor)
    # casts to a CPU tensor type, which undid the .to(DEVICE) transfer and made
    # the loss run on the CPU even when a GPU was selected.
    label = label.to(DEVICE).double()

    optimizer.zero_grad()

    output = model(user, item_embedding).double()

    loss = criterion(output, label)

    loss.backward()
    optimizer.step()

    train_loss += loss.item()

    # Metrics are accumulated on the CPU, detached from the autograd graph.
    output_cpu = output.detach().cpu()
    label_cpu = label.detach().cpu()

    auc_output_li.append(output_cpu.numpy())

    pred_cpu = (output_cpu > 0.5).float()
    correct += (pred_cpu == label_cpu).float().sum()

    acc_output_li.append(pred_cpu.numpy())
    label_li.append(label_cpu.numpy())

  auc_output_li = np.concatenate(auc_output_li)
  acc_output_li = np.concatenate(acc_output_li)
  label_li = np.concatenate(label_li)

  train_loss /= len(train_loader)
  train_accuracy = 100. * correct / len(train_loader.dataset)
  train_auc = roc_auc_compute_fn(auc_output_li, label_li)
  train_f1 = f1_score_compute_fn(acc_output_li, label_li)

  return train_loss, train_accuracy, train_auc, train_f1

In [None]:
def evaluate(model, test_loader):
  """Evaluate ``model`` on ``test_loader`` without updating its weights.

  Relies on the notebook-level globals ``DEVICE``, ``torch_norm_game2vec`` and
  ``criterion``.

  Args:
    model: module called as ``model(user, item_embedding)``; expected to
      return one score per sample that ``criterion`` accepts.
    test_loader: iterable of ``(user, game, label)`` batches that also
      exposes ``.dataset`` (its length is the accuracy denominator).

  Returns:
    Tuple ``(test_loss, test_accuracy, test_auc, test_f1)``: the mean of the
    per-batch losses, the accuracy in percent (0-dim CPU tensor), the ROC AUC
    of the raw outputs, and the F1 score of the outputs thresholded at 0.5.
  """
  model.eval()
  test_loss = 0
  correct = 0

  auc_output_li = []  # raw model outputs, one array per batch
  acc_output_li = []  # outputs thresholded at 0.5, one array per batch
  label_li = []       # ground-truth labels, one array per batch

  with torch.no_grad():
    for user, game, label in test_loader:

      user = user.to(DEVICE)
      # Look up the fixed item vectors for this batch, then move them to the device.
      item_embedding = torch_norm_game2vec[game].to(DEVICE)
      # .double() only changes the dtype. The former .type(torch.DoubleTensor)
      # casts to a CPU tensor type, which undid the .to(DEVICE) transfer and
      # made the loss run on the CPU even when a GPU was selected.
      label = label.to(DEVICE).double()

      output = model(user, item_embedding).double()

      loss = criterion(output, label)

      test_loss += loss.item()

      # Metrics are accumulated on the CPU.
      output_cpu = output.detach().cpu()
      label_cpu = label.detach().cpu()

      auc_output_li.append(output_cpu.numpy())

      pred_cpu = (output_cpu > 0.5).float()
      correct += (pred_cpu == label_cpu).float().sum()

      acc_output_li.append(pred_cpu.numpy())
      label_li.append(label_cpu.numpy())

  auc_output_li = np.concatenate(auc_output_li)
  acc_output_li = np.concatenate(acc_output_li)
  label_li = np.concatenate(label_li)

  test_loss /= len(test_loader)
  test_accuracy = 100. * correct / len(test_loader.dataset)
  test_auc = roc_auc_compute_fn(auc_output_li, label_li)
  test_f1 = f1_score_compute_fn(acc_output_li, label_li)

  return test_loss, test_accuracy, test_auc, test_f1

In [None]:
# Model, loss and optimizer. `criterion` and `optimizer` are read as globals
# by train() / evaluate().
model = GMF(user_num = user_num, factor_num = factor_num).to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

train_dataset = NCFData(train_user_id_idx_li, train_game_id_idx_li, train_label_li)

train_loader = DataLoader(train_dataset,
  batch_size = BATCH_SIZE,
  shuffle = True,
  drop_last = False)

val_dataset = NCFData(val_user_id_idx_li, val_game_id_idx_li, val_label_li)

val_loader = DataLoader(val_dataset,
  batch_size = BATCH_SIZE,
  shuffle = False,
  drop_last = False)

# Checkpoint location, built once from the MODEL_DIR configured at the top of
# the notebook (the loop used to overwrite MODEL_DIR with a hard-coded copy).
BEST_MODEL_PATH = MODEL_DIR + 'GMF_Best_model_state_dict.pt'

best_metric = 0
best_epoch = 0

for epoch in range(1, EPOCHS + 1):
  train_loss, train_accuracy, train_auc, train_f1 = train(model, train_loader)
  # These are validation metrics, so they are named val_* rather than test_*.
  val_loss, val_accuracy, val_auc, val_f1 = evaluate(model, val_loader)
  print(
    f"[EPOCH: {epoch}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f} %, "
    f"Train F1-Score: {train_f1:.4f}, Train AUC: {train_auc:.4f}, "
    f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f} %, "
    f"Val F1-Score: {val_f1:.4f}, Val AUC: {val_auc:.4f} \n"
  )

  # Keep only the weights of the epoch with the best validation AUC so far.
  if best_metric < val_auc:
    best_metric = val_auc
    best_epoch = epoch
    torch.save(model.state_dict(), BEST_MODEL_PATH)

# It might be better to save the model with the smallest
# (Train Accuracy - Val Accuracy) gap.
# Why? Val contains only label 1, so predicting the preferred games in Val
# after the model has already fit the users' preferences in Train to some
# extent may be a more reasonable criterion.
# Hence the model with the smallest Train/Val accuracy gap would be considered
# the best-performing one.
# NOTE(review): the class-ratio output earlier in this notebook shows roughly
# 25% 'Not Recommended' rows in the validation split, so the "only label 1"
# premise looks outdated -- confirm.

# For now, using AUC as the selection metric seems fine.

[EPOCH: 1], Train Loss: 0.6688, Train Accuracy: 75.25 %, Train F!-Score: 0.8576, Train AUC: 0.5531,   Val Loss: 0.6429, Val Accuracy: 75.69 %, Val F!-Score: 0.8608, Val AUC: 0.7691 

[EPOCH: 2], Train Loss: 0.6073, Train Accuracy: 77.17 %, Train F!-Score: 0.8700, Train AUC: 0.7455,   Val Loss: 0.5869, Val Accuracy: 76.26 %, Val F!-Score: 0.8635, Val AUC: 0.7818 

[EPOCH: 3], Train Loss: 0.5460, Train Accuracy: 77.69 %, Train F!-Score: 0.8725, Train AUC: 0.8399,   Val Loss: 0.5408, Val Accuracy: 76.67 %, Val F!-Score: 0.8655, Val AUC: 0.7913 

[EPOCH: 4], Train Loss: 0.4946, Train Accuracy: 78.13 %, Train F!-Score: 0.8746, Train AUC: 0.8739,   Val Loss: 0.5065, Val Accuracy: 77.03 %, Val F!-Score: 0.8672, Val AUC: 0.8004 

[EPOCH: 5], Train Loss: 0.4536, Train Accuracy: 78.63 %, Train F!-Score: 0.8770, Train AUC: 0.8945,   Val Loss: 0.4819, Val Accuracy: 77.45 %, Val F!-Score: 0.8692, Val AUC: 0.8087 

[EPOCH: 6], Train Loss: 0.4202, Train Accuracy: 79.39 %, Train F!-Score: 0.8808, Trai

In [None]:
# Report the epoch that achieved the highest validation AUC and that AUC value.
print(f'Best_Epoch : {best_epoch}, Best_Metric : {best_metric}')

Best_Epoch : 28, Best_Metric : 0.8661161769662891


In [None]:
test_dataset = NCFData(test_user_id_idx_li, test_game_id_idx_li, test_label_li)

test_loader = DataLoader(test_dataset,
  batch_size = BATCH_SIZE,
  shuffle = False,
  drop_last = False)

# Restore the best checkpoint written by the training loop. map_location remaps
# the stored tensors onto the current DEVICE, so a checkpoint saved on a GPU
# runtime can still be loaded when this cell runs on a CPU-only runtime.
model.load_state_dict(
  torch.load(MODEL_DIR + 'GMF_Best_model_state_dict.pt', map_location=DEVICE)
)

test_loss, test_accuracy, test_auc, test_f1 = evaluate(model, test_loader)

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f} %, Test F1-Score: {test_f1:.4f}, Test AUC: {test_auc:.4f}")

Test Loss: 0.3998, Test Accuracy: 83.62 %, Test F!-Score: 0.8986, Test AUC: 0.8526
