In [1]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
import plotly.express as px
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset
from torch import nn
import heapq
import math
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import multiprocessing
import plotly.io as pio

pio.renderers.default = "notebook+pdf"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [28]:
learning_rate = 0.01
factors = [8,16,32,64]
epochs = 1
batch_size = 1024

all_out=[]

In [5]:
# Negative Testing Samples oversampled using neg_num
neg_num = 99
neg_samples = {}

# Pre-Processing

In [6]:
df = pd.read_csv('ratings.dat',sep="::",names = ["users","items","rating","timestamp"])
n = len(df)

  return func(*args, **kwargs)


In [7]:
df['rank_latest'] = df.groupby(['users'])['timestamp'].rank(method='first', ascending=False)

In [8]:
df['rating']=1
train_df, test_df = df[df['rank_latest']>1],df[df['rank_latest']==1]

In [9]:
# Final training samples with negatives 
neg_per_pos = 4    # Negative interactions to be sampled per positive interaction
users_all = df['users'].unique()
items_all = set(df['items'].unique())

In [10]:
def prep_data(df,users_all,items_all,neg_train=100):
    users_fin = df['users'].values.copy() 
    items_fin = df['items'].values.copy()
    ratings_fin = df['rating'].values.copy().astype(float)

    for u in tqdm(users_all):
        items_per_user = set(df[df.users==u]['items'].unique()) 

        neg_item = np.random.choice(list(items_all-items_per_user),len(items_per_user)*neg_per_pos)   

        users_fin=np.append(users_fin,np.repeat(u,len(items_per_user)*neg_per_pos))
        items_fin=np.append(items_fin,neg_item)
        ratings_fin=np.append(ratings_fin,np.repeat(0,len(items_per_user)*neg_per_pos))

    return [users_fin, items_fin, ratings_fin]

In [11]:
class MovieUserItemRatingDataset(Dataset):
    def __init__(self, users,items,ratings):
        self.users = torch.from_numpy(users)
        self.items = torch.from_numpy(items)
        self.ratings = torch.from_numpy(ratings)
        
    def __getitem__(self,index):
        return [self.users[index],self.items[index],self.ratings[index].float()]

    def __len__(self):
        return len(self.users)  

In [12]:
%%time
train_users, train_items, train_ratings = prep_data(train_df,users_all,items_all,neg_per_pos)
training_data = MovieUserItemRatingDataset(train_users,train_items,train_ratings)
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True,num_workers= 32, pin_memory=True)

100%|██████████| 6040/6040 [03:04<00:00, 32.76it/s] 

CPU times: user 1min 46s, sys: 1min 18s, total: 3min 5s
Wall time: 3min 4s





In [13]:
n_users = df['users'].nunique()+1
n_items = np.max(train_df['items'])+1

# Training Utils

In [16]:

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    tot_loss = []
    for batch, (X_users, X_items, y) in enumerate(dataloader):
        X_users, X_items, y = X_users.to(device), X_items.to(device), y.to(device)

        # Compute prediction error
        pred = model(X_users,X_items)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 1000 == 0:
            loss, current = loss.item(), batch * len(X_users)
            tot_loss.append(loss)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
        
    print(f"Mean Training Loss: {np.mean(tot_loss)} ")        
    return np.mean(tot_loss)

In [17]:

def evaluate_test(test_df, model, k=10, neg_num=100):
    users_test_all = test_df['users'].unique()
    ndcg,hit,total = 0, 0, len(users_test_all)
    tot_loss=[]
    
    for i in range(len(test_df)):
        user,item = test_df.iloc[i]['users'],test_df.iloc[i]['items']
        if user not in neg_samples:     
            neg_item = np.random.choice(list(items_all-set([int(item)])),neg_num)   
            neg_samples[user]=neg_item

        users_test=torch.LongTensor(np.repeat(user,neg_num+1)).to(device)
        items_test=torch.LongTensor(np.append(item,neg_samples[user])).to(device)
        ratings_test = torch.FloatTensor(np.append(1,np.repeat(0,neg_num))).to(device)
        
        model.eval()
        with torch.no_grad():
            pred = model(users_test,items_test)
            loss = loss_fn(pred, ratings_test)
            tot_loss.append(loss.item())

        hp = []
        for curr_item,curr_rating in zip(items_test.cpu().numpy(),pred.cpu().numpy()):
            heapq.heappush(hp,(curr_rating,curr_item))

        topK = heapq.nlargest(k,hp,key=lambda x: x[0])
        topKitems = set([x[1] for x in topK])
        if item in topKitems:
            hit+=1

        for j,itemi in enumerate(topKitems):
            if itemi==item:
                ndcg+= math.log(2)/math.log(j+2)

    print(f"Mean Test Loss: {np.mean(tot_loss)}")
    print(f"\n Hit Ratio | HR@{k} : {hit/total}")
    print(f" NDCG@{k} : {ndcg/total}")    
    

    return [hit/total , ndcg/total, np.mean(tot_loss)]

# GMF

In [18]:
class GMF(nn.Module):
    def __init__(self, n_users, n_items, n_factors=20, MF = False):
        super().__init__()
        # create user embeddings
        self.user_embeddings_GMF = torch.nn.Embedding(n_users, n_factors,
                                                sparse=False)
        # create item embeddings
        self.item_embeddings_GMF = torch.nn.Embedding(n_items, n_factors,
                                                sparse=False)
        self.h_output = torch.nn.Linear(in_features=n_factors,out_features=1)
        self.activation = torch.nn.Sigmoid()
        self.MF = MF

    def forward(self, user, item):
        # element-wise multiplication
        x = self.user_embeddings_GMF(user)*self.item_embeddings_GMF(item)

        if self.MF:
            return self.activation(x.sum(1))

        x = self.h_output(x)
        x = self.activation(x)

        return torch.squeeze(x)

In [None]:
# Training model for different factors.

for f in factors:
    gmf = GMF(n_users = n_users, n_items = n_items, n_factors = f).to(device)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(gmf.parameters(), lr=learning_rate)
    
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        loss = train(train_dataloader, gmf, loss_fn, optimizer)
        hr, ndcg, test_loss = evaluate_test(test_df, gmf, k=10,neg_num=neg_num)
        all_out.append(["GMF",t,f,loss,test_loss,hr,ndcg])
        print("-------------------------------")
        
    torch.save(gmf.state_dict(), f"{f}_GMF.pt")
    print(f"Factor {f} | HR@10: {hr} | NDCG@10: {ndcg}")
    


# MLP

In [25]:
class MLP(nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_embeddings_MLP = torch.nn.Embedding(n_users, 2*n_factors,
                                                sparse=False)
        # create item embeddings
        self.item_embeddings_MLP = torch.nn.Embedding(n_items, 2*n_factors,
                                                sparse=False)
        
        # Neural CF layers
        self.MLP = nn.Sequential(
            nn.Linear(4*n_factors,2*n_factors),
            nn.ReLU(),
            nn.Linear(2*n_factors,n_factors),
            nn.ReLU(),
            nn.Linear(n_factors,1),
            nn.Sigmoid(),   
        )

    def forward(self, user, item):
        # concat user + item embeddings
        x = torch.cat((self.user_embeddings_MLP(user),self.item_embeddings_MLP(item)),1)
        x = self.MLP(x)

        return torch.squeeze(x)

In [None]:
# Training model for different factors.

for f in factors:
    mlp = MLP(n_users = n_users, n_items = n_items, n_factors = f).to(device)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=learning_rate)
    
    losses_mlp,hrs_mlp,ndcgs_mlp = [],[],[]
    
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        loss = train(train_dataloader, mlp, loss_fn, optimizer)
        hr, ndcg,test_loss = evaluate_test(test_df, mlp, k=10,neg_num=neg_num)
        all_out.append(["MLP",t,f,loss,test_loss,hr,ndcg])
        print("-------------------------------")
    
    torch.save(mlp.state_dict(),f"{f}_MLP.pt")    
    print(f"Factor {f} | HR@10: {hr} | NDCG@10: {ndcg}")

# NeuMF

In [32]:
class NeuMF(nn.Module):
    def __init__(self, n_users, n_items, n_factors=20,alpha=0.5):
        super().__init__()
        # MLP: create user embeddings
        self.user_embeddings_MLP = torch.nn.Embedding(n_users, 2*n_factors,
                                                sparse=False)
        # MLP: create item embeddings
        self.item_embeddings_MLP = torch.nn.Embedding(n_items, 2*n_factors,
                                                sparse=False)

        # GMF: create user embeddings
        self.user_embeddings_GMF = torch.nn.Embedding(n_users, n_factors,
                                                sparse=False)
        # GMF: create item embeddings
        self.item_embeddings_GMF = torch.nn.Embedding(n_items, n_factors,
                                                sparse=False)
        # Neural CF layers for MLP
        self.MLP_mod = nn.Sequential(
            nn.Linear(4*n_factors,2*n_factors),
            nn.ReLU(), 
            nn.Dropout(0.75),
            nn.Linear(2*n_factors,n_factors),
            nn.ReLU(), 
            nn.Dropout(0.5)
        )
        
        self.NMF_Linear = nn.Linear(2*n_factors,1)
        self.activation = nn.Sigmoid()
        self.alpha=alpha

    def forward(self, user, item):
        # concat user + item embeddings
        mlp_x = torch.cat((self.user_embeddings_MLP(user),self.item_embeddings_MLP(item)),1)
        mlp_x = self.MLP_mod(mlp_x)
        
        mf_x = self.user_embeddings_GMF(user)*self.item_embeddings_GMF(item)
        
        # Merge MLP and GMP (alpha: weightage of each model)
        x = torch.cat((self.alpha*mlp_x,(1-self.alpha)*mf_x),1)
        x = self.NMF_Linear(x)
        x = self.activation(x)

        return torch.squeeze(x)

In [None]:
# Training model for different factors.

for f in factors:
    learning_rate = .001
    if f==32 or f==64:
        learning_rate = 0.0001
    
    nmf = NeuMF(n_users = n_users, n_items = n_items, n_factors = f,alpha=0.5).to(device)
    nmf.load_state_dict(torch.load(f"{f}_GMF.pt"),strict=False)
    nmf.load_state_dict(torch.load(f"{f}_MLP.pt"),strict=False)
    
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(nmf.parameters(), lr=learning_rate)
    
    for t in range(epochs):        
        print(f"Epoch {t+1}\n-------------------------------")            
        loss = train(train_dataloader, nmf, loss_fn, optimizer)
        hr, ndcg,test_loss = evaluate_test(test_df, nmf, k=10,neg_num=neg_num)
        all_out.append(["NeuMF",t,f,loss,test_loss,hr,ndcg])
        
        print("-------------------------------")
        
    torch.save(mlp.state_dict(),f"{f}_NMF.pt")  
    fin_hrs_nmf.append(hr_max)
    fin_ndcgs_nmf.append(ndcg_max)
    
    print(f"Factor {f} | HR@10: {hr} | NDCG@10: {ndcg}")

# Evaluation

In [45]:
plot_df = pd.DataFrame(all_out, columns = ["model","epoch","factor","train_loss","test_loss","HR@10","NDCG@10"])

In [32]:
plot_melt = pd.melt(plot_df,id_vars=['model','epoch','factor'],value_vars=['train_loss','HR@10','NDCG@10'],var_name="metric")
plot_melt.head(5)

Unnamed: 0,model,epoch,factor,metric,value
0,GMF,0,8,train_loss,0.556411
1,GMF,1,8,train_loss,0.358291
2,GMF,2,8,train_loss,0.31551
3,GMF,3,8,train_loss,0.294344
4,GMF,4,8,train_loss,0.277073


In [36]:
# Epoch-wise Loss, HR@10, NDCG@10 for each model

fig=px.line(plot_melt,x="epoch",y="value",color="model",facet_col="metric",facet_row="factor",height=800)
fig.update_yaxes(matches=None)
fig.update_yaxes(type="log")
fig.update_traces(mode='markers+lines')
fig.write_image("plot1.png")
fig.show()


In [35]:
# Factor-wise Metric evaluatio for each model

fig=px.line(plot_melt[(plot_melt['epoch']==9) & ( (plot_melt['metric']=="HR@10") | (plot_melt['metric']=="NDCG@10")  )],x="factor",y="value",color="model",facet_col="metric")
fig.update_yaxes(matches=None)
fig.update_yaxes(type="log")
fig.update_traces(mode='markers+lines')
fig.write_image("plot2.png")
fig.show()

