In [77]:
import pandas as pd
import numpy as np
import os
from surprise import Dataset
from surprise import Reader
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.backends.cudnn as cudnn
import wandb



In [78]:
# dataset 로드

movie_dataset = Dataset.load_builtin('ml-100k')

df = pd.DataFrame(movie_dataset.raw_ratings)
df.columns = ['user', 'item', 'rating', 'timestamp']

In [79]:
# 데이터 확인
print(df.head())
print(df.info())
print(df.describe())

  user item  rating  timestamp
0  196  242     3.0  881250949
1  186  302     3.0  891717742
2   22  377     1.0  878887116
3  244   51     2.0  880606923
4  166  346     1.0  886397596
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user       100000 non-null  object 
 1   item       100000 non-null  object 
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 3.1+ MB
None
              rating
count  100000.000000
mean        3.529860
std         1.125674
min         1.000000
25%         3.000000
50%         4.000000
75%         4.000000
max         5.000000


In [80]:
# import zipfile

# zipfileName = 'ml-latest-small.zip'

# with zipfile.ZipFile(zipfileName, 'r') as zip_ref:
#     zip_ref.extractall()


In [81]:
# 데이터 로드

movie_df = pd.read_csv(os.path.join('ml-latest-small','movies.csv'))
rating_df = pd.read_csv(os.path.join('ml-latest-small','ratings.csv'))
tag_df = pd.read_csv(os.path.join('ml-latest-small','tags.csv'))


## 데이터 확인

In [82]:
# 데이터 확인
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [83]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [84]:
tag_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


### Item-based CF

In [85]:
# config

config = {

}

args = {
    "batch_size": 128,
    "epochs": 80,
    "lr": 0.0001,
    "weight_decay": 0.0001,
    "dropout": 0.1,
    "num_layers": 3,
    "hidden_size": 256,
    "topk": 10,
    "gpu": "0",
    "step_size": 10,
    "gamma": 0.5,
}

os.environ["CUDA_VISIBLE_DEVICES"] = args["gpu"]
cudnn.benchmark = True

In [86]:
# 데이터셋 정의

class MovieDataset(data.Dataset):
    def __init__(self, df, is_training=True):
        super(MovieDataset, self).__init__()
        self.dataset = np.array(df)
        self.is_training = is_training

    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        # user의 경우 1부터 시작하므로 1을 빼준다.
        user = (self.dataset[idx][0] - 1).astype(np.int64)
        item = (self.dataset[idx][1] - 1).astype(np.int64)
        rating = self.dataset[idx][2].astype(np.float32)

        return user, item, rating
    
    def collate_fn(self, data):
        users, items, ratings = zip(*data)
        users = torch.LongTensor(users)
        items = torch.LongTensor(items)
        ratings = torch.FloatTensor(ratings)
        return users, items, ratings.dtype(torch.float32)
        
    
    def get_loader(self, batch_size, shuffle=True, num_workers=4):
        if self.is_training:
            return data.DataLoader(self, batch_size=batch_size, shuffle=False, num_workers=num_workers)
        else:
            return data.DataLoader(self, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    
    

In [87]:
# train, test split

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(rating_df, test_size=0.1, random_state=42)
traind_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

train_df.head()


Unnamed: 0,userId,movieId,rating,timestamp
14430,91,2431,2.0,1112716724
43498,292,111,2.0,1265680159
73590,474,1784,3.5,1053021455
19181,124,110,3.5,1336584326
97254,605,4899,2.5,1277176881


In [88]:
train_X = MovieDataset(train_df)
val_X = MovieDataset(val_df, is_training=False)
test_X = MovieDataset(test_df, is_training=False)

train_loader = train_X.get_loader(args["batch_size"])
val_loader = val_X.get_loader(args["batch_size"])
test_loader = test_X.get_loader(args["batch_size"])

for i in range(3):
    for users, items, ratings in train_loader:
        print(users[0], items[0], ratings[0])
        break

for i in range(3):
    for users, items, ratings in test_loader:
        print(users[0], items[0], ratings[0])
        break

tensor(90) tensor(2430) tensor(2.)
tensor(90) tensor(2430) tensor(2.)
tensor(90) tensor(2430) tensor(2.)
tensor(81) tensor(1543) tensor(3.5000)
tensor(109) tensor(2467) tensor(3.)
tensor(317) tensor(5217) tensor(3.5000)


In [89]:
# 모델 정의
from ncfModel import MovieModel
        

In [90]:
num_users = len(rating_df["userId"].unique())
num_items = rating_df["movieId"].max()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# 모델 생성
model = MovieModel(num_users, num_items, dropout=args["dropout"], num_layers=args["num_layers"], hidden_size=args["hidden_size"])
model = model.to(device)

# 모델 확인
print(model)



MovieModel(
  (user_embedding): Embedding(610, 40)
  (item_embedding): Embedding(193609, 40)
  (layers): ModuleList(
    (0): Linear(in_features=80, out_features=256, bias=True)
    (1-2): 2 x Linear(in_features=256, out_features=256, bias=True)
    (3): Linear(in_features=256, out_features=1, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
)


In [91]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mgmk0904[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [92]:
# 모델 학습

def train(model, train_loader, optimizer, epoch):
    model.train()
    train_loss = 0
    for users, items, ratings in train_loader:
        users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
        
        optimizer.zero_grad()
        loss = model.get_loss(users, items, ratings)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        
    train_loss /= len(train_loader)
    print("Epoch : {}, Train Loss : {:.4f}".format(epoch, train_loss))
    wandb.log({"train_loss": train_loss})

# 모델 평가

def evaluate(model, val_loader, epoch):
    model.eval()
    test_loss = 0
    
    with torch.no_grad():
        for users, items, ratings in val_loader:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
            
            loss = model.get_loss(users, items, ratings)
            test_loss += loss.item()

    test_loss /= len(val_loader)
    print("Epoch : {}, Test Loss : {:.4f}".format(epoch, test_loss))
    wandb.log({"test_loss": test_loss})


In [93]:
print(len(train_loader))
print(len(val_loader))
print(len(test_loader))

709
71
79


In [94]:
# wandb 설정
wandb.init(
    project="cocktailRecSys",
    name="ncf-adam-SLR-drop0.2",
    config=args
)

wandb.watch(model)
wandb.config.update(args)

# 모델 학습 ReLU를 적용하면 성능 하락
optimizer = torch.optim.Adam(model.parameters(), lr=args["lr"], weight_decay=args["weight_decay"])
#optimizer = torch.optim.SGD(model.parameters(), lr=args["lr"], weight_decay=args["weight_decay"])
#optimizer = torch.optim.Adagrad(model.parameters(), lr=args["lr"], weight_decay=args["weight_decay"])
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args["step_size"], gamma=args["gamma"])
#scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0, last_epoch=-1)

for epoch in range(args["epochs"]):
    train(model, train_loader, optimizer, epoch)
    evaluate(model, val_loader, epoch)
    scheduler.step()

wandb.finish()


Epoch : 0, Train Loss : 2.0895
Epoch : 0, Test Loss : 1.0472
Epoch : 1, Train Loss : 1.2011
Epoch : 1, Test Loss : 1.0331
Epoch : 2, Train Loss : 1.1411
Epoch : 2, Test Loss : 1.0167
Epoch : 3, Train Loss : 1.1061
Epoch : 3, Test Loss : 1.0060
Epoch : 4, Train Loss : 1.0784
Epoch : 4, Test Loss : 0.9969
Epoch : 5, Train Loss : 1.0573
Epoch : 5, Test Loss : 0.9813
Epoch : 6, Train Loss : 1.0387
Epoch : 6, Test Loss : 0.9687
Epoch : 7, Train Loss : 1.0191
Epoch : 7, Test Loss : 0.9555
Epoch : 8, Train Loss : 1.0021
Epoch : 8, Test Loss : 0.9439
Epoch : 9, Train Loss : 0.9889
Epoch : 9, Test Loss : 0.9305
Epoch : 10, Train Loss : 0.9711
Epoch : 10, Test Loss : 0.9189
Epoch : 11, Train Loss : 0.9683
Epoch : 11, Test Loss : 0.9133
Epoch : 12, Train Loss : 0.9579
Epoch : 12, Test Loss : 0.9078
Epoch : 13, Train Loss : 0.9511
Epoch : 13, Test Loss : 0.9035
Epoch : 14, Train Loss : 0.9494
Epoch : 14, Test Loss : 0.8959
Epoch : 15, Train Loss : 0.9395
Epoch : 15, Test Loss : 0.8899
Epoch : 16, 

0,1
test_loss,█▇▆▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
test_loss,0.81786
train_loss,0.85903


In [72]:
# 모델 테스트
def test(model, test_loader):
    model.eval()
    test_loss = 0
    total = 0
    
    with torch.no_grad():
        for users, items, ratings in test_loader:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
            
            loss = model.get_loss(users, items, ratings)
            test_loss += loss.item()
        
    test_loss /= len(test_loader)

    return test_loss

In [73]:
test_loss = test(model, test_loader)

test_loss

1.0345077350735665

In [74]:
for users, items, ratings in test_loader:
    users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
    
    predict = model.predict(users, items)
    rmse = torch.sqrt(torch.mean((ratings - predict)**2))
    print(rmse)

tensor(1.0324, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9380, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(1.0238, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9732, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(1.0709, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9973, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9951, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(1.0049, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(1.0535, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9505, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(1.0322, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9809, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9857, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(1.0569, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9326, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(0.9978, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(1.0824, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(1.1094, device='cuda:0',

In [23]:
# 모델 저장
torch.save(model.state_dict(), "model.pth")

In [75]:
print(ratings, predict)


tensor([2.0000, 2.5000, 3.0000, 3.0000, 4.5000, 5.0000, 4.0000, 3.0000, 5.0000,
        5.0000, 1.5000, 4.5000, 3.0000, 3.5000, 4.0000, 2.0000, 4.0000, 3.0000,
        4.0000, 5.0000, 2.0000, 3.5000, 3.5000, 5.0000, 4.0000, 4.0000, 3.0000,
        3.0000, 3.0000, 3.0000, 3.5000, 1.5000, 2.0000, 4.5000, 1.0000, 3.0000,
        5.0000, 0.5000, 4.0000, 4.0000, 3.5000, 4.0000, 3.5000, 4.0000, 4.0000,
        2.0000, 2.0000, 5.0000, 5.0000, 3.5000, 5.0000, 3.5000, 2.0000, 3.5000,
        3.0000, 3.0000, 5.0000, 3.5000, 3.5000, 3.5000, 1.5000, 3.0000, 1.0000,
        5.0000, 3.5000, 2.0000, 4.0000, 2.0000, 4.0000, 3.0000, 4.0000, 5.0000,
        4.0000, 3.5000, 3.0000, 2.0000, 4.5000, 3.5000, 4.0000, 4.0000, 3.0000,
        2.5000, 2.5000, 4.0000, 4.5000, 5.0000, 5.0000, 4.0000, 3.0000, 2.0000,
        4.0000, 5.0000, 4.0000, 4.5000, 3.0000, 4.0000, 3.0000, 4.5000, 4.5000,
        4.0000], device='cuda:0') tensor([3.1868, 3.5313, 3.4193, 3.7204, 3.5739, 3.6514, 3.4059, 3.2025, 3.4583,
      

In [76]:
# 모델 summary
from torchsummary import summary

summary(model, input_size=(100, 80), batch_size=256)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Dropout-1             [256, 100, 80]               0
            Linear-2            [256, 100, 256]          20,736
           Dropout-3            [256, 100, 256]               0
            Linear-4            [256, 100, 256]          65,792
           Dropout-5            [256, 100, 256]               0
            Linear-6            [256, 100, 256]          65,792
           Dropout-7            [256, 100, 256]               0
            Linear-8              [256, 100, 1]             257
Total params: 152,577
Trainable params: 152,577
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 7.81
Forward/backward pass size (MB): 315.82
Params size (MB): 0.58
Estimated Total Size (MB): 324.21
----------------------------------------------------------------
