# Setup

In [1]:
import os
# Expose only physical GPU 3 to this process; inside the notebook it appears as "cuda:0".
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# CUDA_LAUNCH_BLOCKING is an on/off flag (0 or 1), not a device index: "1" makes kernel
# launches synchronous so CUDA errors surface at the offending call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
import torch
import random
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from numpy.ma import masked_where

In [3]:
from sklearn.preprocessing import normalize
from torch.utils.data import Dataset, DataLoader
from torchsampler import ImbalancedDatasetSampler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score

# Import Data

In [4]:
# Column names for the ratings file; they are supplied explicitly via `names=` below.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

# Item table and the u1 training split (tab-separated).
items = pd.read_csv("../Datasets/ml-100k/Text/items.csv")
ratings = pd.read_csv(
    '../Datasets/ml-100k/Text/u1.base',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [5]:
# Dimensions (rows = users, columns = items) of the dense rating matrices built below.
n_users, n_items = 943, 1682

In [6]:
# Dense user x item rating matrix; cells without a rating stay 0.
# The "- 1" shifts ids to 0-based row/column indices (assumes ids in the file start at 1).
data_matrix = np.zeros((n_users, n_items))
for user_id, movie_id, rating, _timestamp in ratings.itertuples(index=False, name=None):
    data_matrix[user_id - 1, movie_id - 1] = rating

In [7]:
# Binarised copy of the rating matrix: 1.0 where the rating is 4 or higher, 0.0 elsewhere
# (unrated cells are 0 in data_matrix and therefore map to 0.0 as well).
data_matrix_emp = np.where(data_matrix >= 4, 1.0, 0.0)

In [8]:
# (row, column) index pairs of every non-zero (i.e. rated) cell in the training matrix.
train_indices = list(zip(*np.nonzero(data_matrix)))

In [9]:
# Number of rated (user, item) cells found in the training matrix.
len(train_indices)

80000

# Siamese network

## Data loader

In [10]:
# Sanity check: is a CUDA device visible to PyTorch in this process?
torch.cuda.is_available()

True

In [11]:
# Display the installed PyTorch version for reproducibility.
torch.__version__

'1.10.1'

In [12]:
# Rows kept from each per-item audio/video array, and the DataLoader batch size.
ROW = 30
BATCH = 20000

# Root directory of the dataset; embedding files are resolved relative to it.
item_path = "../Datasets/ml-100k/"

# Item table and the u1 train/test rating splits.
items_csv = "../Datasets/ml-100k/Text/items.csv"
train_ratings = "../Datasets/ml-100k/Text/u1.base"
test_ratings = "../Datasets/ml-100k/Text/u1.test"

In [13]:
# Use the first visible GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [14]:
class MovielensDataset(Dataset):
    """Per-rating samples pairing a user vector with multi-modal item features.

    Each sample corresponds to one non-zero cell of the rating matrix built from
    ``ratings`` and yields ``(user_vec, [video, audio, text, item, meta], rating)``.

    Module-level names this class relies on (they must exist before instantiation):
      * ``n_users`` / ``n_items`` -- shape of the dense rating matrix.
      * ``data_matrix``           -- its transpose is used as the item vectors,
                                     whichever ratings file this instance loads.
      * ``ROW``                   -- number of leading rows kept from every
                                     per-item audio/video array.

    Args:
        ratings: path to a tab-separated ratings file with the columns
            user_id, movie_id, rating, unix_timestamp (no header).
        item_path: dataset root containing ``Meta/embeddings.csv``,
            ``Text/embeddings.csv``, ``audioEmbed/`` and ``videoEmbed/``.
        device: torch device every returned tensor is moved to.
    """

    def __init__(self, ratings = train_ratings, item_path = item_path, device = device):
        self.item_path = item_path
        self.meta_embeddings = pd.read_csv(item_path + "Meta/embeddings.csv").to_numpy()
        self.text_embeddings = pd.read_csv(item_path + "Text/embeddings.csv").to_numpy()
        self.ratings = pd.read_csv(ratings, sep='\t',
                                   names=['user_id', 'movie_id', 'rating', 'unix_timestamp'],
                                   encoding='latin-1')
        self.indices = None
        self.device = device
        self.data = None
        self.data_emp = None
        self.n_users = None
        self.n_items = None
        self.fill_ratings()
        self.embeddings()

    def fill_ratings(self, threshold=4, keep_prob=0.2):
        """Build the dense rating matrix, the sampled "liked" mask and the sample index.

        Args:
            threshold: minimum rating for an item to count as liked.
            keep_prob: probability with which each liked cell is kept in
                ``self.data_emp``. The draw uses NumPy's global RNG, so it is
                only reproducible if the caller seeds ``np.random``.
        """
        # Distinct ids present in THIS ratings file (informational only).
        self.n_users = self.ratings.user_id.unique().shape[0]
        self.n_items = self.ratings.movie_id.unique().shape[0]

        # The matrix is sized with the module-level n_users / n_items, not the
        # per-file counts above, and is addressed with (id - 1).
        self.data = np.zeros((n_users, n_items))
        for line in self.ratings.itertuples():
            self.data[line[1]-1, line[2]-1] = line[3]

        # Random subset of the liked cells; used to build the user vectors.
        keep = np.random.random_sample(self.data.shape) <= keep_prob
        self.data_emp = np.where(np.logical_and(self.data >= threshold, keep), 1, 0)
        # One sample per rated (user, item) cell.
        self.indices = list(zip(*(np.where(self.data != 0))))

    def fill_med_embeddings(self, EXT = "*.pkl", type_ = "audioEmbed/"):
        """Load one ``<index>.pkl`` array per item column and stack them.

        Args:
            EXT: unused; kept only so existing callers do not break.
            type_: sub-directory (with trailing slash) under ``self.item_path``.

        Returns:
            Array of shape (number of item columns, ROW, features), assuming
            every file holds a 2-D array with at least ROW rows -- TODO confirm;
            ``normalize`` rejects inputs with more than two dimensions and
            ``np.stack`` requires equal shapes.
        """
        # One file per column of the rating matrix, so item indices returned by
        # __getitem__ always address a loaded array.
        n_files = self.data.shape[1]
        paths = [self.item_path + type_ + str(i) + ".pkl" for i in range(n_files)]

        # NOTE(security): allow_pickle=True executes arbitrary pickled code;
        # only load files from a trusted source.
        k = [normalize(np.load(file, allow_pickle = True)[:ROW], axis = 0) for file in tqdm(paths)]
        return np.stack(k)

    def embeddings(self):
        """Load/derive every feature table and record its feature width."""
        self.audio_embeddings = self.fill_med_embeddings(type_ = "audioEmbed/")
        self.video_embeddings = self.fill_med_embeddings(type_ = "videoEmbed/")
        # User vector = sum of the meta embeddings of the sampled liked items
        # divided by their count; the 0.001 avoids dividing by zero for users
        # with no sampled item.
        self.user_embeddings = np.divide(np.dot(self.data_emp, self.meta_embeddings),
                                         self.data_emp.sum(axis = 1)[:, None] + 0.001)
        # Item vector = that item's column of the module-level data_matrix.
        self.item_embeddings = data_matrix.T
        self.video_embedding_size = self.video_embeddings.shape[2]
        self.audio_embedding_size = self.audio_embeddings.shape[2]
        self.text_embedding_size = self.text_embeddings.shape[1]
        self.user_embedding_size = self.user_embeddings.shape[1]
        self.item_embedding_size = self.item_embeddings.shape[1]
        self.meta_embedding_size = self.meta_embeddings.shape[1]

    def __len__(self):
        """Number of rated (user, item) cells."""
        return len(self.indices)

    def _as_float_tensor(self, array):
        """Convert a NumPy array to a float32 tensor on ``self.device``."""
        # Cast before the transfer so only float32 data is copied to the device.
        return torch.from_numpy(array).float().to(self.device)

    def __getitem__(self, idx):
        """Return ``(user_vec, [video, audio, text, item, meta], rating)`` for sample ``idx``."""
        user, item = self.indices[idx]

        xu = self._as_float_tensor(self.user_embeddings[user])
        xa = self._as_float_tensor(self.audio_embeddings[item])
        xv = self._as_float_tensor(self.video_embeddings[item])
        xt = self._as_float_tensor(self.text_embeddings[item])
        xi = self._as_float_tensor(self.item_embeddings[item])
        xm = self._as_float_tensor(self.meta_embeddings[item])

        y = self.data[user, item]
        return (xu, [xv, xa, xt, xi, xm], int(y))

In [15]:
# Build the train split first, then the test split; each construction loads the
# audio and video files and therefore shows two progress bars.
train_dataset, test_dataset = (
    MovielensDataset(ratings = split_path)
    for split_path in (train_ratings, test_ratings)
)

100%|██████████| 1682/1682 [00:09<00:00, 179.58it/s]
100%|██████████| 1682/1682 [00:00<00:00, 4891.98it/s]


ValueError: Found array with dim 3. the normalize function expected <= 2.

In [None]:
# Training batches are reshuffled on every pass; test batches keep dataset order.
trainloader = DataLoader(
    dataset = train_dataset,
    batch_size = BATCH,
    shuffle = True,
)
testloader = DataLoader(
    dataset = test_dataset,
    batch_size = BATCH,
    shuffle = False,
)

## Architecture

In [None]:
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import OrderedDict

In [None]:
from ml_metrics import mapk
from recmetrics import mark

In [None]:
# Inverse-frequency weight per rating value 1..5: the most frequent rating gets
# weight 1 and rarer ratings get proportionally larger weights.
rating_counts = np.array(
    [np.count_nonzero(train_dataset.data == rating) for rating in range(1, 6)]
)
weight = torch.Tensor(rating_counts.max() / rating_counts).to(device)

In [None]:
def weighted_mse_loss(pred, target, weight=weight):
    """Squared-error loss with a per-rating weight.

    Args:
        pred: predicted ratings.
        target: true ratings; truncated to integers, and ``target - 1`` is used
            to index ``weight``.
        weight: 1-D tensor of per-rating weights (index 0 belongs to rating 1).

    Returns:
        Tuple ``(weighted_mean, plain_mean)`` of the squared errors.
    """
    labels = target.long()
    per_sample_weight = weight[labels - 1].to(pred.dtype)
    squared_error = torch.pow(pred - labels.to(pred.dtype), 2)
    weighted_mean = torch.mean(per_sample_weight * squared_error)
    return (weighted_mean, torch.mean(squared_error))

In [None]:
class SiameseNet(nn.Module):
    """Rating predictor that embeds a user vector and fused multi-modal item features.

    The user vector and the fused item representation go through the same
    ``siamese`` MLP; their two 100-d outputs plus the last 100 meta-encoder
    features are concatenated and regressed to a single rating by ``ffn``.

    Module-level names this class relies on: ``train_dataset`` (input widths),
    ``ROW`` (rows per audio/video sample), ``weighted_mse_loss``, and
    ``trainloader`` / ``testloader`` as defaults for ``fit`` / ``evaluate``.

    Args:
        device: device stored on the instance (the caller still has to move the
            module with ``.to(device)``).
        channel: width of the user encoding and of the flattened ``fusion``
            output; both are fed to ``siamese``. With ROW = 30 and the 200-wide
            concatenated item map the two convolutions yield 2 x 42 = 84 values,
            which is why the default is 84.
    """

    def __init__(self, device = device, channel = 84):
        super(SiameseNet, self).__init__()
        self.encoder_user = self._encoder(train_dataset.user_embedding_size, 1024, channel)
        # NOTE(review): encoder_item is kept so the module's parameters and
        # state_dict keys are unchanged, but forward() never uses its output.
        self.encoder_item = self._encoder(train_dataset.item_embedding_size, 256, 300)
        self.encoder_video = self._encoder(train_dataset.video_embedding_size, 1600, 300)
        # The audio encoder's second stage is named linr4/relu4 (kept for
        # state_dict compatibility).
        self.encoder_audio = self._encoder(train_dataset.audio_embedding_size, 600, 300, second = "4")
        self.encoder_text = self._encoder(train_dataset.text_embedding_size, 256, 300)
        self.encoder_meta = self._encoder(train_dataset.meta_embedding_size, 1600, 300)

        self.fusion = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv2d(1, 1, (20, 20), stride=(2, 2))),
            ('relu1', nn.LeakyReLU()),
            ('conv2', nn.Conv2d(1, 1, (5, 50), stride=(1, 1))),
            ('relu2', nn.LeakyReLU()),
        ]))

        self.siamese = nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(channel, 200)),
            ('relu1', nn.LeakyReLU()),
            ('linr2', nn.Linear(200, 256)),
            ('relu2', nn.LeakyReLU()),
            ('linr3', nn.Linear(256, 100)),
            ('relu3', nn.LeakyReLU()),
        ]))

        self.ffn = nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(300, 164)),
            ('actv1', nn.ReLU()),
            ('linr2', nn.Linear(164, 1)),
        ]))

        self.device = device
        # init_weights only touches nn.Linear layers; fusion holds none.
        for block in (self.encoder_user, self.encoder_item, self.encoder_video,
                      self.encoder_audio, self.encoder_text, self.encoder_meta,
                      self.siamese, self.ffn):
            block.apply(self.init_weights)

    @staticmethod
    def _encoder(in_features, hidden, out_features, second = "2"):
        """Two Linear + LeakyReLU stages named linr1/relu1 and linr<second>/relu<second>."""
        return nn.Sequential(OrderedDict([
            ('linr1', nn.Linear(in_features, hidden)),
            ('relu1', nn.LeakyReLU()),
            ('linr' + second, nn.Linear(hidden, out_features)),
            ('relu' + second, nn.LeakyReLU()),
        ]))

    def init_weights(self, m):
        """Kaiming-normal weights and a constant 0.01 bias for every nn.Linear."""
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
            m.bias.data.fill_(0.01)

    def exp(self, q, k, v):
        """Attention-style mixing: softmax(normalize(q @ k^T)) @ v.

        The score matrix is L10-normalised and soft-maxed along dim 1.
        """
        z = torch.bmm(q, k.permute(0, 2, 1))
        z = F.normalize(z, p = 10, dim = 1)
        z = torch.softmax(z, 1)
        z = torch.bmm(z, v)
        return(z)

    def forward(self, x1, x2):
        """Predict ratings for a batch.

        Args:
            x1: user vectors.
            x2: list ``[video, audio, text, item, meta]`` as produced by the
                dataset; ``x2[3]`` (item) is currently not used.

        Returns:
            ``(rating, user_embedding, item_embedding)``.
        """
        # Modality encoders; each 300-wide output is split into three 100-wide
        # parts used as query / key / value.
        outu = self.encoder_user(x1)
        q_v, k_v, v_v = torch.split(self.encoder_video(x2[0]), [100, 100, 100], 2)
        q_a, k_a, v_a = torch.split(self.encoder_audio(x2[1]), [100, 100, 100], 2)
        outt = torch.split(self.encoder_text(x2[2]), 100, 1)
        outm = torch.split(self.encoder_meta(x2[4]), [100, 100, 100], 1)

        # Text and meta are single vectors per item: repeat them ROW times so
        # they line up with the ROW-row audio/video sequences. Only the text
        # key is consumed below.
        k_t = outt[1].unsqueeze(1).repeat(1, ROW, 1)
        q_m, k_m, v_m = (part.unsqueeze(1).repeat(1, ROW, 1) for part in outm)

        # Self-attention (audio, video)
        sa = self.exp(q_a, k_a, v_a)
        sv = self.exp(q_v, k_v, v_v)

        # Inter-modal attention
        ita = self.exp(q_a, k_t, v_a)
        imv = self.exp(q_v, k_m, v_v)
        itm = self.exp(q_m, k_t, v_m)

        # Fuse: average self/inter-modal maps per modality, multiply audio by
        # video, and concatenate with the meta-text map.
        ma = torch.mean(torch.stack([sa, ita]), 0)
        mv = torch.mean(torch.stack([sv, imv]), 0)
        se = torch.mul(ma, mv)
        outi = torch.cat((itm, se), axis = 2)
        outi = self.fusion(outi.unsqueeze(1))
        out1 = self.siamese(outu)
        # Flatten per sample using the actual batch size so that batches
        # smaller than BATCH (e.g. the last one) also work.
        out2 = self.siamese(outi.reshape(outi.shape[0], -1))
        diff = torch.cat((out1, out2, outm[2]), axis=1)
        out = self.ffn(diff)
        return(out, out1, out2)

    def fit(self, trainloader = trainloader,
            testloader = testloader, epochs = 100):
        """Train with Adam (lr 1e-4) on weighted MSE plus a cosine-embedding loss.

        Args:
            trainloader: loader yielding ``(x1, x2, y)`` training batches.
            testloader: loader used for the per-epoch evaluation.
            epochs: number of passes over ``trainloader``.

        Returns:
            ``(train_loss, test_loss, train_f1, test_f1)``: per-epoch lists.
            The loss entries are RMSE values stored as 0-d NumPy arrays.

        Raises:
            ValueError: if ``trainloader`` yields no batches.
        """
        self.criterion_rate = weighted_mse_loss
        self.criterion_embd = nn.CosineEmbeddingLoss()
        self.optimizer = optim.Adam(self.parameters(), lr = 1e-4)

        train_loss = []
        train_f1 = []
        test_loss = []
        test_f1 = []
        for epoch in range(epochs):
            running_loss = 0.0
            running_rmse = 0.0
            n_batches = 0

            # evaluate() leaves the module in eval mode, so switch back here.
            self.train()
            for x1, x2, y in tqdm(trainloader):
                y_flt = y.float().to(device)
                # Ratings 4-5 -> 1, ratings 1-3 -> 0; mapped to +1 / -1 below.
                y_lng = torch.div(y, 4, rounding_mode="floor").to(device)
                self.optimizer.zero_grad()
                reg, outu, outi = self.forward(x1, x2)
                loss_1, loss_ = self.criterion_rate(reg.squeeze(-1), y_flt)
                loss_2 = self.criterion_embd(outu, outi, y_lng * 2 - 1)
                loss = loss_1 + loss_2
                loss.backward()
                self.optimizer.step()

                # Detach before accumulating so the autograd graph of every
                # batch is not kept alive for the whole epoch.
                running_rmse += torch.sqrt(loss_).detach()
                running_loss += loss.detach()
                n_batches += 1

            if n_batches == 0:
                raise ValueError("trainloader yielded no batches")

            # Average over the number of batches actually processed.
            epoch_loss = running_loss / n_batches
            epoch_rmse = running_rmse / n_batches
            vl, vp, vr, vf, tp, tr, tf = self.evaluate(train_loader = trainloader,
                                                       test_loader = testloader)
            print('Epoch-%d: Loss = %.3f\nTrain RMSE = %.3f||Train Precision = %.3f||Train Recall = %.3f\nTest RMSE = %.3f || Test Precision = %.3f|| Test Recall = %.3f'%
                  (epoch + 1, float(epoch_loss), float(epoch_rmse),
                   tp, tr, float(vl), vp, vr))
            train_loss.append(epoch_rmse.cpu().numpy())
            test_loss.append(vl.cpu().detach().numpy())
            train_f1.append(tf)
            test_f1.append(vf)
        return(train_loss, test_loss, train_f1, test_f1)

    def _score_first_batch(self, loader, k, report = False):
        """RMSE, precision, recall and F1 on the first batch drawn from ``loader``."""
        # Fall back to the module-level loss when fit() has not set it yet.
        criterion = getattr(self, "criterion_rate", weighted_mse_loss)
        x1, x2, y = next(iter(loader))
        y_flt = y.float().to(device)
        # Binary relevance labels: ratings 4-5 -> 1, ratings 1-3 -> 0.
        y_bin = torch.div(y, 4, rounding_mode="floor").cpu().numpy()
        scores = self.forward(x1, x2)[0]
        pred = (scores > k).long().cpu().numpy().ravel()
        rmse = torch.sqrt(criterion(scores.squeeze(-1), y_flt)[1])
        precision = precision_score(y_bin, pred, zero_division = 0)
        recall = recall_score(y_bin, pred, zero_division = 0)
        f1 = f1_score(y_bin, pred, zero_division = 0)
        if report:
            print(classification_report(y_bin, pred,
                    target_names = ["0", "1"], zero_division = 0))
        return rmse, precision, recall, f1

    def evaluate(self, k = 3.5, train_loader = None, test_loader = None):
        """Score the first batch of the test and of the train loader.

        Args:
            k: predicted ratings above this value count as the positive class.
            train_loader: loader for the train metrics; defaults to the
                module-level ``trainloader``.
            test_loader: loader for the test metrics; defaults to the
                module-level ``testloader``.

        Returns:
            ``(test_rmse, test_precision, test_recall, test_f1,
            train_precision, train_recall, train_f1)``: the RMSE is a tensor,
            the remaining values are percentages.
        """
        if test_loader is None:
            test_loader = testloader
        if train_loader is None:
            train_loader = trainloader

        self.eval()
        with torch.no_grad():
            vl, vp, vr, vf = self._score_first_batch(test_loader, k, report = True)
            _, tp, tr, tf = self._score_first_batch(train_loader, k)
        return(vl, vp*100, vr*100, vf*100, tp*100, tr*100, tf*100)

In [None]:
# Build the network (its input widths are read from the module-level train_dataset)
# and move its parameters to `device`; the last line also displays the module layout.
sm_net = SiameseNet()
sm_net.to(device)

In [None]:
# Train with the default loaders and epoch count; fit() returns the per-epoch
# train/test RMSE and F1 histories.
train_loss, test_loss, train_f1, test_f1 = sm_net.fit()

In [None]:
# Create the target directory first so a missing folder cannot fail the save
# after a long training run.
os.makedirs("./pretrained", exist_ok = True)
# NOTE: this pickles the whole module object, so loading it later requires the
# SiameseNet class definition to be available under the same name.
torch.save(sm_net, "./pretrained/attention.pth")

In [None]:
# Final metrics: predictions above 3.5 are counted as the positive class.
sm_net.evaluate(k=3.5)

In [None]:
import matplotlib.pyplot as plt

# Train and test RMSE per epoch on twin y-axes; the first 20 epochs are skipped.
fig, ax = plt.subplots()
ax2 = ax.twinx()
(train_line,) = ax.plot(train_loss[20:], label = "Train Loss", color = "orange")
(test_line,) = ax2.plot(test_loss[20:], label = "Test Loss")
ax.set_xlabel("Epoch (offset by 20)")
ax.set_ylabel("Train RMSE")
ax2.set_ylabel("Test RMSE")
# Legend handles must be the plotted line artists, not the Axes objects.
fig.legend(handles = [train_line, test_line], labels = ["Train Loss", "Test Loss"], loc = "upper right")
plt.show()

# Test

In [None]:
# import matplotlib.pyplot as plt
# import torch

In [None]:
# sm_net = torch.load("./pretrained/rating.pth")

In [None]:
# prs = []
# rec = []
# f1 = []
# loss = []

# for i in np.arange(3, 4, 0.1):
#     l, p, r, f, _, _, _ = sm_net.evaluate(i)
#     prs.append(p)
#     rec.append(r)
#     f1.append(f)
#     loss.append(l)
    
# plt.plot(prs, label = "Test Precision")
# plt.plot(rec, label = "Test Recall")
# plt.plot(f1, label = "Test F1")

# plt.legend()
# plt.show()

In [None]:
# prs

In [None]:
# rec

In [None]:
# loss