In [1]:
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, SubsetRandomSampler

In [3]:
import numpy as np
import os
import random
import pandas as pd

In [4]:
from Dataset import Data
from bprModel import BPR

In [5]:
def load_data(path='train.csv'):
    """Read the interaction CSV and build the per-user item lists.

    The file is read with its first row as the header. In every following
    row the first column holds an integer user id and the second column a
    whitespace-separated list of integer item ids. A blank item cell is
    treated as "no items" instead of raising.

    Args:
        path: Location of the CSV file. Defaults to ``'train.csv'``.

    Returns:
        tuple: ``(num_users, num_items, user_items)`` where ``num_users`` and
        ``num_items`` are the largest user / item id plus one (0 when there
        are none) and ``user_items`` maps each user id to its list of item
        ids.

    Raises:
        ValueError: If a user id or an item id is not an integer literal.
    """
    # Read every cell as text: the id tokens are parsed explicitly below, so
    # the result does not depend on the dtype pandas would otherwise infer
    # (e.g. a float column as soon as one cell is blank).
    df = pd.read_csv(path, dtype=str)

    user_items = {}
    max_item_id = -1
    # Positional column access via .iloc; integer keys on a label-indexed
    # row (row[0]) rely on a deprecated pandas fallback.
    for raw_user, raw_items in zip(df.iloc[:, 0], df.iloc[:, 1]):
        user = int(raw_user)
        # A blank cell comes back as NaN, which has no .split().
        items = [] if pd.isna(raw_items) else [int(x) for x in raw_items.split()]
        user_items[user] = items
        if items:
            max_item_id = max(max_item_id, max(items))

    # default=-1 keeps an empty file from raising on max() of nothing.
    num_users = max(user_items, default=-1) + 1
    num_items = max_item_id + 1
    return num_users, num_items, user_items

In [6]:
def train(loader, model, optimizer, epochs, batch_size, device):
    """Train the model, running a validation pass after every training pass.

    Args:
        loader: Mapping with a ``'train'`` and a ``'val'`` entry. Each entry
            is iterable, yields ``(batch_u, batch_i, batch_j)`` tensor
            triples and exposes ``.dataset.get_neg()``, which is called once
            per phase and epoch before the batches are drawn.
        model: Module whose call ``model(batch_u, batch_i, batch_j)`` returns
            a scalar loss tensor.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over both loaders.
        batch_size: Only used to scale the losses in the printed summary.
        device: Device the model and every batch are moved to.

    Returns:
        None. One summary line is printed per epoch.
    """
    # Moving the model is loop-invariant, so do it once up front.
    model.to(device)

    for epoch in range(epochs):
        epoch_losses = {'train': [], 'val': []}

        for phase in ('train', 'val'):
            is_train = phase == 'train'
            model.train(is_train)

            # Use the loaders that were passed in rather than a notebook
            # global, so the function works with any pair of loaders.
            phase_loader = loader[phase]
            phase_loader.dataset.get_neg()

            # No autograd graph is needed for validation batches.
            with torch.set_grad_enabled(is_train):
                for batch_u, batch_i, batch_j in phase_loader:
                    batch_u = batch_u.to(device)
                    batch_i = batch_i.to(device)
                    batch_j = batch_j.to(device)

                    loss = model(batch_u, batch_i, batch_j)

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    # .item() yields a plain Python float, so the averaging
                    # below also works when the loss lives on a GPU.
                    epoch_losses[phase].append(loss.item())

        train_loss = epoch_losses['train']
        val_loss = epoch_losses['val']
        # An empty phase reports nan instead of triggering a numpy warning.
        mean_train = np.mean(train_loss) if train_loss else float('nan')
        mean_val = np.mean(val_loss) if val_loss else float('nan')
        print(f"Training Epoch : {epoch} | Train Loss = {mean_train/batch_size:.4f} | Val Loss = {mean_val/batch_size:.4f}\n")
                       

In [7]:
# Load the training interactions: the user / item id-space sizes and the
# user -> item-list mapping returned by load_data().
user_size, item_size,user_items = load_data()

In [8]:
# Training hyperparameters.
batch_size = 3000  # used by both DataLoaders, passed to BPR(...) and to train()
epochs = 3  # number of train + validation passes run by train()
embedding_size = 128  # passed to BPR(...); presumably the latent vector width — confirm in bprModel

In [9]:
# Wrap the interactions in the project Dataset and hold out a fraction of its
# indices for validation.
dataset = Data(user_size, item_size, user_items)
validation_split = 0.1
shuffle_dataset = True

dataset_size = len(dataset)
split = int(np.floor(validation_split * dataset_size))
indices = list(range(dataset_size))
if shuffle_dataset:
    # Fixed seed so the train / validation partition is the same on every run.
    np.random.seed(233)
    np.random.shuffle(indices)
val_indices = indices[:split]
train_indices = indices[split:]

# Both loaders read the same dataset object; each sampler restricts its
# loader to one of the two index subsets.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
validation_loader = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)

data_loaders = {"train": train_loader, "val": validation_loader}
data_lengths = {"train": len(train_indices), "val": len(val_indices)}

In [10]:
# Prefer the first CUDA device when one is available, otherwise use the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [11]:
# Build the BPR model (from bprModel) for the id-space sizes computed by
# load_data().
model = BPR(user_size, item_size, embedding_size, batch_size, device)

In [12]:
# Adam over all model parameters, with weight decay as regularisation.
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

In [13]:
# Run training; train() prints one loss summary line per epoch.
train(data_loaders, model, optimizer, epochs, batch_size, device)

Training Epoch : 0 | Train Loss = 0.6862 | Val Loss = 0.6164

Training Epoch : 1 | Train Loss = 0.5024 | Val Loss = 0.3529

Training Epoch : 2 | Train Loss = 0.3383 | Val Loss = 0.2988



In [14]:
# Materialise the model's parameter tensors in the order model.parameters()
# yields them, so they can be picked out by position.
w = list(model.parameters())

In [15]:
# Copy to host memory before converting: Tensor.numpy() only works on CPU
# tensors, and the model sits on the GPU whenever CUDA is available.
# NOTE(review): assumes the first parameter is the user embedding matrix —
# confirm against bprModel.BPR.
user = w[0].detach().cpu().numpy()

In [16]:
# Copy to host memory before converting: Tensor.numpy() only works on CPU
# tensors, and the model sits on the GPU whenever CUDA is available.
# NOTE(review): assumes the second parameter is the item embedding matrix —
# confirm against bprModel.BPR.
item = w[1].detach().cpu().numpy()

In [17]:
# Score matrix: dot product of every row of `user` with every row of `item`.
# Presumably shaped (num_users, num_items) — confirm against the BPR
# parameter shapes.
interaction = np.dot(user,item.T)

In [18]:
# Submission table: one row per user, with the recommended item ids stored as
# a space-separated string.
predict = pd.DataFrame(columns=['UserId','ItemId'])

In [19]:
# Number of recommendations written per user.
TOP_K = 50

prediction_rows = []
for uid, scores in enumerate(interaction):
    # Rank on a copy so `interaction` keeps the raw scores and this cell can
    # be re-run without side effects.
    ranked = scores.copy()
    # Items the user already interacted with must not be recommended again.
    # -inf sorts below any real score. Ids that never appeared in the
    # training file have nothing to mask.
    seen = user_items.get(uid, [])
    if seen:
        ranked[seen] = -np.inf
    topk = np.argsort(-ranked)[:TOP_K]
    prediction_rows.append({
        'UserId': uid,
        'ItemId': ' '.join(str(x) for x in topk),
    })

# Build the frame once instead of growing it row by row with .loc.
predict = pd.DataFrame(prediction_rows, columns=['UserId', 'ItemId'])

In [20]:
# Sanity check: show the first row of the submission table.
predict.iloc[0]

UserId                                                    0
ItemId    1575 2022 2885 865 2320 2470 465 1770 2726 118...
Name: 0, dtype: object

In [21]:
# Sanity check: number of rows and columns in the submission table.
predict.shape

(4454, 2)

In [22]:
# Write the submission file without the DataFrame index column.
predict.to_csv('submit_612.csv', index=False)