# Setup

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4"
os.environ["CUDA_LAUNCH_BLOCKING"]="1, 2, 3, 4"

In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

# Import Data

In [3]:
items = pd.read_csv("../Datasets/ml-100k/Text/items.csv")

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('../Datasets/ml-100k/Text/u.data', sep='\t', names=r_cols,encoding='latin-1')

In [4]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [5]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [6]:
data_matrix_emp = data_matrix.copy()
data_matrix_emp[data_matrix < 4] = 0
data_matrix_emp[data_matrix >= 4]= 1 

In [7]:
indices = list(zip(*(np.where(data_matrix != 0))))

In [8]:
np.unique(data_matrix, return_counts=True)

(array([0., 1., 2., 3., 4., 5.]),
 array([1486126,    6110,   11370,   27145,   34174,   21201]))

In [9]:
items_csv = "../Datasets/ml-100k/Text/items.csv"
train_ratings = "../Datasets/ml-100k/Text/u1.base"
test_ratings = "../Datasets/ml-100k/Text/u1.test"
embeddings = "../Datasets/ml-100k/Video/embeddings.csv"

In [10]:
embeddings_ = pd.read_csv(embeddings).to_numpy()

In [11]:
embeddings_.shape

(1682, 2048)

# Encoder network

## Data loader

In [12]:
import torch
from sklearn.preprocessing import normalize
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score

In [13]:
torch.cuda.is_available()

True

In [14]:
torch.__version__

'1.10.1'

In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [16]:
class MovielensDataset(Dataset):
    def __init__(self, items_csv = items_csv, ratings = train_ratings, 
                 embeddings = embeddings, indices = indices, device = device):
        self.items = pd.read_csv(items_csv).to_numpy()
        self.item_embeddings = pd.read_csv(embeddings).to_numpy()
        self.ratings = pd.read_csv(ratings, sep='\t', 
                                   names=['user_id', 'movie_id', 'rating', 'unix_timestamp'],encoding='latin-1')
        self.indices = indices
        self.device = device
        self.data = None
        self.data_emp = None
        self.n_users = None
        self.n_items = None
        self.fill_ratings()
        self.embeddings()
    
    def fill_ratings(self, threshold=4):
        self.n_users = self.ratings.user_id.unique().shape[0]
        self.n_items = self.ratings.movie_id.unique().shape[0]
        
        self.data = np.zeros((943, 1682))
        for line in self.ratings.itertuples():
            self.data[line[1]-1, line[2]-1] = line[3]
        
    def embeddings(self):
        self.item_embeddings = np.nan_to_num(self.item_embeddings)
        self.item_embeddings = normalize(self.item_embeddings, axis=1, norm='l2')
        self.user_embeddings = self.data
#         self.user_embeddings = normalize(self.user_embeddings, axis=1, norm='l1')
        self.item_embedding_size = self.item_embeddings.shape[1]
        self.user_embedding_size = self.data.shape[1]
        
    def __len__(self):
        return(len(self.indices))
    
    def __getitem__(self, idx):
        user = self.indices[idx][0]
        item = self.indices[idx][1]

        x1 = torch.from_numpy(self.user_embeddings[user]).to(self.device)
        x2 = torch.from_numpy(self.item_embeddings[item]).to(self.device)
        
        y = self.data[user][item]
        return(x1.float(), x2.float(), int(y))

In [17]:
train_dataset = MovielensDataset(ratings = train_ratings)
test_dataset = MovielensDataset(ratings = test_ratings)

In [18]:
trainloader = DataLoader(train_dataset, batch_size = 30000, shuffle = True)
validloader = DataLoader(test_dataset, batch_size = 10000, shuffle = False)

## Architecture

In [19]:
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import OrderedDict

In [20]:
from ml_metrics import mapk
from recmetrics import mark

In [21]:
class SiameseNet(nn.Module):
    def __init__(self, device = device):
        super(SiameseNet, self).__init__()
        self.encoder_user = nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(train_dataset.user_embedding_size, 512)),
            ('relu1', nn.Sigmoid()),
            ('linr2', nn.Linear(512, 300)),
            ('relu2', nn.Sigmoid()),
            ('linr3', nn.Linear(300, 256)),
            ('relu3', nn.LeakyReLU()),
            ('linr4', nn.Linear(256, 500)),
            ('relu4', nn.LeakyReLU()),
#             ('linr5', nn.Linear(300, dataset.item_embedding_size)),
#             ('relu5', nn.LeakyReLU()),
        ]))
        self.decoder_user = nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(500, 256)),
            ('relu1', nn.Sigmoid()),
            ('linr2', nn.Linear(256, 300)),
            ('relu2', nn.Sigmoid()),
            ('linr3', nn.Linear(300, 512)),
            ('relu3', nn.LeakyReLU()),
            ('linr4', nn.Linear(512, train_dataset.user_embedding_size)),
#             ('relu4', nn.LeakyReLU()),
        ]))
        self.encoder_item = nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(train_dataset.item_embedding_size, 1024)),
            ('relu1', nn.Sigmoid()),
            ('linr2', nn.Linear(1024, 512)),
            ('relu2', nn.Sigmoid()),
            ('linr3', nn.Linear(512, 500)),
#             ('btch3', nn.BatchNorm1d(200)),
#             ('relu3', nn.LeakyReLU()),
        ]))
        self.decoder_item = nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(500, 512)),
            ('btch1', nn.BatchNorm1d(512)),
            ('relu1', nn.Sigmoid()),
            ('linr2', nn.Linear(512, 1024)),
            ('relu2', nn.Sigmoid()),
            ('linr3', nn.Linear(1024, train_dataset.item_embedding_size)),
# #             ('drop3', nn.Dropout(p=0.1)),
            ('btch3', nn.BatchNorm1d(train_dataset.item_embedding_size)),
            ('relu3', nn.Sigmoid()),
        ]))
        
        self.device = device
        self.encoder_user.apply(self.init_weights)
        self.encoder_item.apply(self.init_weights)
        self.decoder_user.apply(self.init_weights)
        self.decoder_item.apply(self.init_weights)
        
    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)
        
    def forward(self, x1, x2):
        emb1 = self.encoder_user(x1)
        emb2 = self.encoder_item(x2)
        out1 = self.decoder_user(emb1)
        out2 = self.decoder_item(emb2)
        
        return(emb1, emb2, out1, out2)
    
    def fit(self, trainloader = trainloader, 
            validloader = validloader, epochs = 100):
        self.criterion_recon = nn.MSELoss()
        self.criterion_embed = nn.CosineEmbeddingLoss()
        self.optimizer = optim.Adam(self.parameters(), lr = 0.005, weight_decay = 1e-4)
        
        train_loss = []
        test_loss = []
        for epoch in range(epochs):
            running_loss = 0.0
            early_stop = 0
            
            for i, data in tqdm(enumerate(trainloader)):
                self.train()
                x1, x2, y = data
                y_flt = y.type(torch.LongTensor).to(device)
                y_lng = torch.div(y, 4, rounding_mode="floor").to(device)
                self.optimizer.zero_grad()
                emb1, emb2, out1, out2 = self.forward(x1, x2)
                loss_1 = self.criterion_recon(x1, out1)
                loss_2 = self.criterion_recon(x2, out2)
                loss_3 = self.criterion_embed(emb1, emb2, y_lng * 2 - 1)
                loss = loss_1 + loss_3 + loss_2
                loss.backward()
                self.optimizer.step()

                running_loss += loss
            val_loss = self.evaluate()
            
            print('Epoch-%d: Train Loss = %.3f Test Loss = %.3f'%
                  (epoch + 1, running_loss / i, val_loss))
            train_loss.append((running_loss / i).cpu().detach().numpy())
            test_loss.append(val_loss.cpu().detach().numpy())
        return(train_loss, test_loss)
            
    def evaluate(self, k = 3):
        self.eval()
        with torch.no_grad():
            valdata = next(iter(validloader))
            x1, x2, y = valdata
            y_flt = y.type(torch.FloatTensor).to(device)
            y_lng = torch.div(y, 4, rounding_mode="floor").to(device)
            emb1, emb2, out1, out2 = self.forward(x1, x2)
            loss_1 = self.criterion_recon(x1, out1)
            loss_2 = self.criterion_recon(x2, out2)
            loss_3 = self.criterion_embed(emb1, emb2, y_lng * 2 - 1)
            print(loss_1, loss_2, loss_3)
            loss = (loss_1 + loss_3 + loss_2) 
        return(loss)
    
    def predict(self, data):
        return(self.encoder_user(torch.Tensor(data).to(device)).cpu().detach().numpy())

In [22]:
sm_net = SiameseNet()
sm_net.to(device)

SiameseNet(
  (encoder_user): Sequential(
    (linr1): Linear(in_features=1682, out_features=512, bias=True)
    (relu1): Sigmoid()
    (linr2): Linear(in_features=512, out_features=300, bias=True)
    (relu2): Sigmoid()
    (linr3): Linear(in_features=300, out_features=256, bias=True)
    (relu3): LeakyReLU(negative_slope=0.01)
    (linr4): Linear(in_features=256, out_features=500, bias=True)
    (relu4): LeakyReLU(negative_slope=0.01)
  )
  (decoder_user): Sequential(
    (linr1): Linear(in_features=500, out_features=256, bias=True)
    (relu1): Sigmoid()
    (linr2): Linear(in_features=256, out_features=300, bias=True)
    (relu2): Sigmoid()
    (linr3): Linear(in_features=300, out_features=512, bias=True)
    (relu3): LeakyReLU(negative_slope=0.01)
    (linr4): Linear(in_features=512, out_features=1682, bias=True)
  )
  (encoder_item): Sequential(
    (linr1): Linear(in_features=2048, out_features=1024, bias=True)
    (relu1): Sigmoid()
    (linr2): Linear(in_features=1024, out_fea

In [None]:
train_loss, test_loss = sm_net.fit()

4it [01:50, 27.71s/it]


tensor(0.7173, device='cuda:0') tensor(0.0840, device='cuda:0') tensor(0.3671, device='cuda:0')
Epoch-1: Train Loss = 2.580 Test Loss = 1.168


4it [01:49, 27.26s/it]


tensor(0.7617, device='cuda:0') tensor(0.0761, device='cuda:0') tensor(0.3468, device='cuda:0')
Epoch-2: Train Loss = 2.275 Test Loss = 1.185


1it [00:33, 33.35s/it]

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_loss, label = "Train Loss")
plt.plot(test_loss, label = "Test Loss")
plt.legend()
plt.show()

In [None]:
# output_user = sm_net.encoder_user(torch.Tensor(data_matrix).to(device)).cpu().detach().numpy()

In [None]:
# pd.DataFrame(output_user).to_csv("user.csv", index = False)

In [None]:
# sm_net.decoder_user(torch.Tensor(output_user).to(device)).cpu().detach().numpy()

In [None]:
# data_matrix

In [None]:
output_item = sm_net.encoder_item(torch.Tensor(embeddings_).to(device)).cpu().detach().numpy()

In [None]:
pd.DataFrame(output_item).to_csv("../Datasets/ml-100k/Compressed/video.csv", index = False)

In [None]:
sm_net.decoder_item(torch.Tensor(output_item).to(device)).cpu().detach().numpy()

In [None]:
embeddings_

##### 