## Import Dependencies

In [1]:
import os
import requests
import sklearn
import random
import zipfile
import torch
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.optim as optim
from zipfile import ZipFile
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
# Shell escape: run `nvidia-smi` to list the GPUs visible to this kernel along with
# driver/CUDA versions and current memory usage.
!nvidia-smi

Thu Apr 25 14:21:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-12GB           Off | 00000000:04:00.0 Off |                    0 |
| N/A   37C    P0              29W / 250W |      0MiB / 12288MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
class MovieUserRatingDataset(Dataset):
    """Implicit-feedback dataset of (user, movie, label) triples with sampled negatives.

    Every unique (userId, movieId) pair in ``rated_users`` yields one positive
    sample (label 1.0) immediately followed by ``unrated_user_ratio`` negative
    samples (label 0.0) for the same user. Negatives are movies drawn at random
    from ``total_samples['movieId']`` that the user has no row for in
    ``total_samples`` and that were not already used for this positive.

    Args:
        rated_users: frame with ``userId`` and ``movieId`` columns holding the
            positive interactions to expand.
        total_samples: frame with ``userId`` and ``movieId`` columns describing
            every known interaction; defines both the candidate movie pool and
            the pairs that must never be sampled as negatives.
        unrated_user_ratio: number of negatives generated per positive.

    Raises:
        ValueError: if some user has fewer eligible negative movies than
            ``unrated_user_ratio`` (the rejection sampler could never finish).
    """

    def __init__(self,
                 rated_users: pd.DataFrame,
                 total_samples: pd.DataFrame,
                 unrated_user_ratio: int
                 ) -> None:
        super(MovieUserRatingDataset, self).__init__()

        self.rated_users = rated_users
        self.total_samples = total_samples
        self.unrated_user_ratio = unrated_user_ratio

        self.users, self.items, self.ratings = self.unrated_user_sampling()

    def unrated_user_sampling(self):
        """Build the user, item and label tensors (positives plus sampled negatives).

        Returns:
            Tuple ``(users, items, labels)`` of equally long 1-D tensors.
        """
        rated_users = self.rated_users
        total_samples = self.total_samples
        unrated_user_ratio = self.unrated_user_ratio

        users_list, items_list, labels_list = [], [], []
        user_item_set = set(zip(rated_users['userId'], rated_users['movieId']))
        total_user_item_set = set(zip(total_samples['userId'], total_samples['movieId']))
        all_movie_ids = total_samples['movieId'].unique()
        movie_id_pool = set(all_movie_ids.tolist())

        # Movies each user already interacted with anywhere in the full dataset;
        # only used to check that enough negatives exist before sampling.
        seen_by_user = {}
        for seen_user, seen_item in total_user_item_set:
            seen_by_user.setdefault(seen_user, set()).add(seen_item)

        for user, item in user_item_set:
            # Without this check the rejection loop below would spin forever
            # when the user has too few unseen movies left.
            seen_items = seen_by_user.get(user, ())
            available = len(movie_id_pool) - len(seen_items)
            if item in movie_id_pool and item not in seen_items:
                available -= 1
            if available < unrated_user_ratio:
                raise ValueError(
                    f"User {user} has only {available} candidate negative movies, "
                    f"but unrated_user_ratio={unrated_user_ratio} were requested."
                )

            # Add positive instance
            users_list.append(user)
            items_list.append(item)
            labels_list.append(1.0)

            # Items already emitted for this positive (set gives O(1) membership)
            rated = {item}

            # Add negative instances
            for _ in range(unrated_user_ratio):
                # Randomly select a negative item
                negative_item = np.random.choice(all_movie_ids)

                # Re-draw until the item is neither a known interaction nor already used
                while (user, negative_item) in total_user_item_set or negative_item in rated:
                    negative_item = np.random.choice(all_movie_ids)

                users_list.append(user)
                items_list.append(negative_item)
                rated.add(negative_item)
                labels_list.append(0.0)

        print(f"Not rated user sampled data size: {len(labels_list)}")
        return torch.tensor(users_list), torch.tensor(items_list), torch.tensor(labels_list)

    def __len__(self) -> int:
        """Total number of samples (positives and negatives)."""
        return len(self.users)

    def __getitem__(self, index):
        """Return the ``(user, item, label)`` tensors stored at ``index``."""
        return self.users[index], self.items[index], self.ratings[index]



## Multi-Layer Perceptron (A Custom Model for Training)

In [4]:
class PretrainedModelLoader(nn.Module):
    """Helper that copies embedding and linear-layer weights between two models."""

    def __init__(self):
        super().__init__()

    def load_pretrained_model(self, model, pretrained_model):
        """Overwrite ``model``'s weights in place with those of ``pretrained_model``.

        Both objects must expose ``user_embedding``, ``movie_item_embedding`` and
        an iterable ``multilayer_model``. Layers are paired positionally and only
        pairs where both sides are ``nn.Linear`` are copied.
        """
        # Embedding tables first
        for table_name in ("user_embedding", "movie_item_embedding"):
            destination = getattr(model, table_name)
            source = getattr(pretrained_model, table_name)
            destination.weight.data.copy_(source.weight)

        # Then every positionally matching pair of linear layers
        layer_pairs = zip(model.multilayer_model, pretrained_model.multilayer_model)
        for target_layer, source_layer in layer_pairs:
            both_linear = isinstance(target_layer, nn.Linear) and isinstance(source_layer, nn.Linear)
            if not both_linear:
                continue
            target_layer.weight.data.copy_(source_layer.weight)
            target_layer.bias.data.copy_(source_layer.bias)

class UserRatingsMultiLayerPerceptron(nn.Module):
    """MLP that scores (user, movie) pairs from concatenated embeddings.

    The user and movie embeddings each have ``total_hidden_layers[0] // 2``
    dimensions; their concatenation feeds a stack of ``Linear(size, size // 2)``
    + ReLU blocks, one per entry of ``total_hidden_layers``. Unless ``NeuralMF``
    is set, a final linear layer followed by a sigmoid produces one score per pair.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the movie embedding table.
        num_factor: accepted for interface compatibility; not used by this class.
        total_hidden_layers: input size of every hidden block. Defaults to
            ``[128, 64, 32, 16, 8]``. The first size must be even and each
            following size must equal the previous one floor-divided by 2,
            because every block halves its input.
        use_pretrained: copy weights from ``pretrained_model`` instead of
            running the random initialisation.
        NeuralMF: when True, ``forward`` returns the output of the hidden stack
            without the prediction layer and sigmoid.
        pretrained_model: source model for ``use_pretrained``.

    Raises:
        ValueError: if the layer sizes cannot be chained, or if
            ``use_pretrained`` is set without a ``pretrained_model``.
    """

    def __init__(self, num_users: int, num_items: int, num_factor: int = 8, total_hidden_layers=None,
                 use_pretrained: bool = False, NeuralMF: bool = False, pretrained_model=None):
        super(UserRatingsMultiLayerPerceptron, self).__init__()

        if total_hidden_layers is None:
            total_hidden_layers = [128, 64, 32, 16, 8]

        # Fail fast on configurations that would otherwise only blow up with a
        # shape mismatch inside forward().
        if total_hidden_layers[0] % 2 != 0:
            raise ValueError(
                f"total_hidden_layers[0] must be even (got {total_hidden_layers[0]}): "
                "it is split evenly between the user and movie embeddings."
            )
        for previous_size, layer_size in zip(total_hidden_layers, total_hidden_layers[1:]):
            if layer_size != previous_size // 2:
                raise ValueError(
                    f"Each hidden layer halves its input, so size {layer_size} cannot follow "
                    f"{previous_size}; expected {previous_size // 2}."
                )
        if use_pretrained and pretrained_model is None:
            raise ValueError("use_pretrained=True requires a pretrained_model.")

        # NOTE: when pretrained_model is an nn.Module, this assignment registers it
        # as a submodule, so its parameters show up in parameters()/state_dict().
        self.pretrained_model = pretrained_model
        self.num_users = num_users
        self.num_items = num_items
        self.use_pretrained = use_pretrained
        self.NeuralMF = NeuralMF

        embedding_dim = total_hidden_layers[0] // 2
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_item_embedding = nn.Embedding(num_items, embedding_dim)

        # One Linear + ReLU block per configured size
        multiple_hidden_layers = []
        for layer_size in total_hidden_layers:
            multiple_hidden_layers.append(nn.Linear(layer_size, layer_size // 2))
            multiple_hidden_layers.append(nn.ReLU())

        # stack the layers
        self.multilayer_model = nn.Sequential(*multiple_hidden_layers)

        # initialize prediction layer
        self.predict_layer = nn.Linear(total_hidden_layers[-1] // 2, 1)
        self.sigmoid = nn.Sigmoid()

        if self.use_pretrained:
            PretrainedModelLoader().load_pretrained_model(self, self.pretrained_model)
        else:
            self._init_weights()

    # Initialize weights
    def _init_weights(self):
        """Randomly initialise embeddings, hidden layers and (optionally) the prediction layer."""
        if not self.use_pretrained:
            nn.init.normal_(self.user_embedding.weight, std=1e-2)
            nn.init.normal_(self.movie_item_embedding.weight, std=1e-2)
            for layer in self.multilayer_model:
                if isinstance(layer, nn.Linear):
                    nn.init.xavier_uniform_(layer.weight)
        if not self.NeuralMF:
            nn.init.normal_(self.predict_layer.weight, std=1e-2)

    def forward(self, user, item):
        """Score user/movie index tensors.

        Args:
            user: tensor of user indices (cast to long).
            item: tensor of movie indices (cast to long).

        Returns:
            With ``NeuralMF=False``: sigmoid scores flattened to 1-D.
            With ``NeuralMF=True``: the raw output of the hidden stack.
        """
        # Embedding lookups require integer (long) indices
        user = user.long()
        item = item.long()

        embedding_user = self.user_embedding(user)
        embedding_item = self.movie_item_embedding(item)

        # Concatenate user and movie embeddings along the feature dimension
        concatinating_embed_input = torch.cat((embedding_user, embedding_item), dim=-1)

        # Forward pass through MLP layers
        aggregated_signal = self.multilayer_model(concatinating_embed_input)

        if not self.NeuralMF:
            # Prediction head: linear layer + sigmoid, flattened to one score per pair
            aggregated_signal = self.predict_layer(aggregated_signal)
            aggregated_signal = self.sigmoid(aggregated_signal)
            aggregated_signal = aggregated_signal.view(-1)

        return aggregated_signal


## Evaluation


In [5]:
import numpy as np
import torch

def compute_hit_rate(gt_item, pred_items):
    """Return 1 if the ground-truth item is among the predicted items, otherwise 0."""
    return 1 if gt_item in pred_items else 0

def compute_ndcg(gt_item, pred_items):
    """Return the discounted gain of the ground-truth item within ``pred_items``.

    The gain is ``1 / log2(position + 2)`` where ``position`` is the 0-based index
    of ``gt_item`` in ``pred_items``; 0 is returned when the item is absent.
    """
    # Guard clause: a miss contributes no gain
    if gt_item not in pred_items:
        return 0

    position = pred_items.index(gt_item)
    discount = np.log2(position + 2)
    return np.reciprocal(discount)

def compute_metrics(model, test_loader, top_k, device):
    """Compute mean hit rate and NDCG over the batches of ``test_loader``.

    Each batch is treated as one ranking problem: the first item of the batch
    (``item[0]``) is taken as the ground truth and the remaining items as its
    competitors, so the loader must be built accordingly.

    Args:
        model: callable taking ``(user, item)`` tensors and returning one score per pair.
        test_loader: iterable of ``(user, item, label)`` batches.
        top_k: number of highest-scoring items kept as recommendations; clamped
            to the batch size when a batch is smaller.
        device: device the batch tensors are moved to before scoring.

    Returns:
        Tuple ``(mean_hit_rate, mean_ndcg)``.
    """
    hit_rate, ndcg_score = [], []

    # Ranking metrics never need gradients; disabling autograd avoids building
    # a graph for every evaluation batch.
    with torch.no_grad():
        for user, item, _ in test_loader:
            user = user.to(device)
            item = item.to(device)

            # Get model predictions
            predictions = model(user, item)

            # topk raises if k exceeds the number of scores, so clamp it
            k = min(top_k, predictions.shape[-1])
            _, indices = torch.topk(predictions, k)

            # Map the top-scoring positions back to item ids
            recommends = torch.take(item, indices).cpu().numpy().tolist()

            gt_item = item[0].item()
            hit_rate.append(compute_hit_rate(gt_item, recommends))
            ndcg_score.append(compute_ndcg(gt_item, recommends))

    # Compute mean HR and NDCG
    hit_rate_mean = np.mean(hit_rate)
    mean_ndcg = np.mean(ndcg_score)

    return hit_rate_mean, mean_ndcg


## Training Class

In [6]:
import torch
import numpy as np

class MLP_Trainer():
    """Minimal training loop that optionally reports HR/NDCG after every epoch.

    Args:
        model: module called as ``model(user, item)``.
        optimizer: optimizer over the model's parameters.
        epochs: number of passes over ``dataloader``.
        dataloader: iterable of ``(user, item, target)`` training batches.
        criterion: loss called as ``criterion(pred, target)``.
        test_obj: evaluation loader handed to ``compute_metrics`` when
            ``print_cost`` is True.
        device: device used for the model and the batches.
        print_cost: print per-epoch metrics and a completion banner.
    """

    def __init__(self, model, optimizer, epochs, dataloader, criterion, test_obj, device='cuda', print_cost=True):
        self.model = model
        self.optimizer = optimizer
        self.epochs = epochs
        self.dataloader = dataloader
        self.criterion = criterion
        self.device = device
        self.print_cost = print_cost
        self.test_obj = test_obj

    def train(self):
        """Run the training loop.

        Returns:
            List with one entry per epoch: the mean batch loss of that epoch,
            or ``nan`` when the dataloader produced no batches.
        """
        model = self.model
        optimizer = self.optimizer
        total_epochs = self.epochs
        dataloader = self.dataloader
        criterion = self.criterion
        device = self.device
        test_obj = self.test_obj

        # Batches are moved to `device` below, so the model has to live there too
        model.to(device)

        losses_list = []
        # iterating over epochs
        for epoch in range(total_epochs):
            model.train()
            epoch_loss, batches_seen = 0.0, 0

            for user, item, target in dataloader:
                user, item, target = user.to(device), item.to(device), target.float().to(device)
                optimizer.zero_grad()
                pred = model(user, item)
                loss = criterion(pred, target)
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                batches_seen += 1

            if self.print_cost:
                hit_rate, NDCG_score = compute_metrics(model, test_obj, 10, device)
                print("Epoch: {} Hit_Rate: {:.3f}\tNDCG: {:.3f}".format(epoch, hit_rate, NDCG_score))

            # Record the epoch average rather than only the final batch's loss;
            # an empty loader yields nan instead of referencing an unbound `loss`.
            losses_list.append(epoch_loss / batches_seen if batches_seen else float('nan'))

        if self.print_cost:
            print('--------------Training Completed-----------')
        # returning list of training losses
        return losses_list


## Checking GPU/CPU 

In [7]:
# Select the compute device: CUDA when a GPU is usable, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Device: {device}')

# Report the active GPU index and the number of visible GPUs
if device.type == 'cuda':
    print('CUDA device:', torch.cuda.current_device())
    print('No of GPUs in use', torch.cuda.device_count())

Device: cuda
CUDA device: 0
No of GPUs in use 1


## Model training

In [8]:
# Directory where trained model checkpoints are saved (created if missing)
pretrain_dir = 'models_trained'
os.makedirs(pretrain_dir, exist_ok=True)

# Location of the interim CSV files. The default is the original author's path;
# set the REC_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get("REC_DATA_DIR", "/home/sikhakolli.v/rec/data/interim")

# Load train, test, and total datasets
train_dataframe = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
total_dataframe = pd.read_csv(os.path.join(DATA_DIR, "entire_dataset.csv"))
test_dataframe = pd.read_csv(os.path.join(DATA_DIR, "evaluation.csv"))


## DataLoaders

In [9]:

# Seed every RNG involved so negative sampling, batch shuffling and weight
# initialisation give the same result on each Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Negatives sampled per positive interaction
TRAIN_NEGATIVES_PER_POSITIVE = 4
TEST_NEGATIVES_PER_POSITIVE = 99

# Create MovieLens datasets
train_set = MovieUserRatingDataset(rated_users=train_dataframe, total_samples=total_dataframe,
                                   unrated_user_ratio=TRAIN_NEGATIVES_PER_POSITIVE)
test_set = MovieUserRatingDataset(rated_users=test_dataframe, total_samples=total_dataframe,
                                  unrated_user_ratio=TEST_NEGATIVES_PER_POSITIVE)

print(f"{type(train_set)}:{train_set}")
# Embedding tables are indexed by raw id, so they need max id + 1 rows
max_num_users, max_num_items = total_dataframe['userId'].max() + 1, total_dataframe['movieId'].max() + 1
print('Data loaded successfully!')

# Data Loaders for efficient loading
dataloader_train = DataLoader(dataset=train_set, batch_size=32, shuffle=True, num_workers=0)
# The evaluation code treats each batch as one ranking problem whose first item is
# the ground truth, so a test batch must hold exactly one positive plus its negatives
# and must not be shuffled.
dataloader_test = DataLoader(dataset=test_set, batch_size=TEST_NEGATIVES_PER_POSITIVE + 1,
                             shuffle=False, num_workers=0, drop_last=True)

# Initializing model
model = UserRatingsMultiLayerPerceptron(num_users=max_num_users, num_items=max_num_items, NeuralMF=False)

# Print model summary
print(model)

# optimizer
optimizer = optim.Adam(model.parameters())
model.to(device)

# Using Binary cross entropy loss
criterion = torch.nn.BCELoss()
save_model = True

Not rated user sampled data size: 501130
Not rated user sampled data size: 61000
<class '__main__.MovieUserRatingDataset'>:<__main__.MovieUserRatingDataset object at 0x2afb618bfdf0>
Data loaded successfully!
UserRatingsMultiLayerPerceptron(
  (user_embedding): Embedding(611, 64)
  (movie_item_embedding): Embedding(193610, 64)
  (multilayer_model): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=16, bias=True)
    (5): ReLU()
    (6): Linear(in_features=16, out_features=8, bias=True)
    (7): ReLU()
    (8): Linear(in_features=8, out_features=4, bias=True)
    (9): ReLU()
  )
  (predict_layer): Linear(in_features=4, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [10]:
# Peek at the first five (user, item, label) triples of the training set
for sample_number in range(1, 6):
    sample = train_set[sample_number - 1]
    print(f"Sample {sample_number}: {sample}")


Sample 1: (tensor(89), tensor(3604), tensor(1.))
Sample 2: (tensor(89), tensor(1621), tensor(0.))
Sample 3: (tensor(89), tensor(6058), tensor(0.))
Sample 4: (tensor(89), tensor(45), tensor(0.))
Sample 5: (tensor(89), tensor(4433), tensor(0.))


## Initializing Model Trainer

In [None]:
import torch
import os
import time
import numpy as np

# Training / evaluation driver
class Trainer:
    """Bundle a model with its optimizer, loss and data loaders for training and ranking evaluation."""

    def __init__(self, model, optimizer, criterion, epochs, train_loader, test_loader, device='cuda', print_cost=True):
        self.model, self.optimizer, self.criterion = model, optimizer, criterion
        self.epochs = epochs
        self.train_loader, self.test_loader = train_loader, test_loader
        self.device = device
        self.print_cost = print_cost

    def train(self):
        """Train for ``self.epochs`` epochs, printing the mean batch loss per epoch when ``print_cost`` is set."""
        self.model.to(self.device)
        self.model.train()

        for epoch_number in range(1, self.epochs + 1):
            loss_sum = 0.0

            for user, item, target in self.train_loader:
                user = user.to(self.device)
                item = item.to(self.device)
                target = target.float().to(self.device)

                self.optimizer.zero_grad()
                batch_loss = self.criterion(self.model(user, item), target)
                batch_loss.backward()
                self.optimizer.step()

                loss_sum += batch_loss.item()

            if not self.print_cost:
                continue
            mean_loss = loss_sum / len(self.train_loader)
            print(f'Epoch [{epoch_number}/{self.epochs}], Loss: {mean_loss:.4f}')

    def model_evaluate(self, top_k=10):
        """Return ``(mean hit rate, mean NDCG)`` over the test loader.

        Each test batch is ranked as a whole; its first item is used as the ground truth.
        """
        self.model.eval()
        hits, gains = [], []

        with torch.no_grad():
            for user, item, _ in self.test_loader:
                user = user.to(self.device)
                item = item.to(self.device)

                scores = self.model(user, item)
                top_positions = torch.topk(scores, top_k).indices
                ranked_items = torch.take(item, top_positions).cpu().numpy().tolist()

                target_item = item[0].item()
                hits.append(compute_hit_rate(target_item, ranked_items))
                gains.append(compute_ndcg(target_item, ranked_items))

        return np.mean(hits), np.mean(gains)

print("Defined Trainer Class")

# Run configuration
epochs = 10
print_cost = True

# Fresh model, optimizer and loss for this training run
model = UserRatingsMultiLayerPerceptron(num_users=max_num_users, num_items=max_num_items, NeuralMF=False)
optimizer = optim.Adam(model.parameters())
criterion = torch.nn.BCELoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Wire everything into the Trainer
trainer = Trainer(model, optimizer, criterion, epochs, dataloader_train, dataloader_test,
                  device=device, print_cost=print_cost)

print("starting to train")
# Train the model and time the run
start_time = time.time()
trainer.train()
end_time = time.time()
print(f'Training time: {end_time - start_time:.5f} seconds')

# Persist the whole model object when checkpointing is enabled
if save_model:
    pretrain_model_dir = os.path.join(pretrain_dir, "MLP.pth")
    torch.save(model, pretrain_model_dir)

# Evaluate the trained model on the test data using the top 10 results
HR, NDCG = trainer.model_evaluate(top_k=10)
print(f'HR: {HR:.3f}, NDCG: {NDCG:.3f}')


Defined Trainer Class
starting to train
Epoch [1/10], Loss: 0.3549
Epoch [2/10], Loss: 0.3181
Epoch [3/10], Loss: 0.2794
Epoch [4/10], Loss: 0.2523
Epoch [5/10], Loss: 0.2342
Epoch [6/10], Loss: 0.2180
Epoch [7/10], Loss: 0.2037


## Running Inference with the Trained Model on Ad Hoc Data

In [None]:
def get_rec_for_single_user(model, user_id, top_k, total_dataframe, test_loader, device):
    """Return the top-k recommended movie titles for one user found in ``test_loader``.

    The first batch whose leading user id equals ``user_id`` is scored and its
    highest-scoring items are mapped to titles.

    NOTE: a later cell of this notebook redefines this function name with a
    popularity re-ranking variant.

    Args:
        model: callable taking ``(user, item)`` tensors and returning one score per pair.
        user_id: id of the user to look up.
        top_k: number of recommendations; clamped to the batch size.
        total_dataframe: frame with ``movieId`` and ``title`` columns used for title lookup.
        test_loader: iterable of ``(user, item, label)`` batches.
        device: device the batch is moved to before scoring.

    Returns:
        Dict with keys ``'user'`` (scalar if the batch holds one element, else a
        list of the batch's user ids) and ``'movie_recommendations'`` (titles in
        descending score order), or ``None`` when no batch starts with ``user_id``.
    """
    for user, item, _ in test_loader:
        if user[0] != user_id:
            continue

        # moving data to GPU if available
        user = user.to(device)
        item = item.to(device)

        print(f"Given User {user_id} is found in the test dataset.")

        # Inference only: no autograd graph needed
        with torch.no_grad():
            predictions = model(user, item)

        # topk raises if k exceeds the number of scores, so clamp it
        k = min(top_k, predictions.shape[-1])
        _, indices = torch.topk(predictions, k)

        recommends = torch.take(item, indices).cpu().numpy().tolist()

        # extract titles of the movie recommendations (first matching row per movie)
        recommended_titles = [total_dataframe[total_dataframe['movieId'] == rec]['title'].values[0] for rec in recommends]

        return {'user': user.item() if user.numel() == 1 else user.tolist(),
                'movie_recommendations': recommended_titles}

    # No batch in the loader starts with the requested user
    return None


In [None]:
user_id_to_infer = 453
recommendations = get_rec_for_single_user(model, user_id_to_infer, top_k=10, total_dataframe=total_dataframe, test_loader=dataloader_test, device=device)

# get_rec_for_single_user yields None when no test batch starts with this user,
# so check before indexing into the result.
if recommendations is None:
    print(f"User {user_id_to_infer} was not found in the test dataset.")
else:
    print(f"User: {recommendations['user'][0]}, Recommendations: {recommendations['movie_recommendations']}")

## Recommendation with score

In [None]:
def get_recommendation_score(model, test_loader, top_k, total_dataframe, device):
    """Evaluate HR/NDCG over ``test_loader`` and collect titled recommendations per batch.

    Each batch is ranked as a whole and its first item (``item[0]``) is used as
    the ground truth.

    Args:
        model: callable taking ``(user, item)`` tensors and returning one score per pair.
        test_loader: iterable of ``(user, item, label)`` batches.
        top_k: number of recommendations per batch; clamped to the batch size.
        total_dataframe: frame with ``movieId`` and ``title`` columns used for title lookup.
        device: device the batches are moved to before scoring.

    Returns:
        Tuple ``(mean_hit_rate, mean_ndcg, rec_movies)`` where ``rec_movies`` is a
        list of dicts with keys ``'user'``, ``'ground_truth'`` and ``'recommendations'``.
    """
    HR, NDCG, rec_movies = [], [], []

    for user, item, _ in test_loader:
        user = user.to(device)
        item = item.to(device)

        # Inference only: no autograd graph needed
        with torch.no_grad():
            predictions = model(user, item)

        # topk raises if k exceeds the number of scores, so clamp it
        k = min(top_k, predictions.shape[-1])
        _, indices = torch.topk(predictions, k)

        recommends = torch.take(item, indices).cpu().numpy().tolist()

        gt_item = item[0].item()
        HR.append(compute_hit_rate(gt_item, recommends))
        NDCG.append(compute_ndcg(gt_item, recommends))

        # Get movie titles for the recommended movies (first matching row per movie)
        recommended_titles = [total_dataframe[total_dataframe['movieId'] == rec]['title'].values[0] for rec in recommends]

        rec_movies.append({
            'user': user.item() if user.numel() == 1 else user.tolist(),
            'ground_truth': total_dataframe[total_dataframe['movieId'] == gt_item]['title'].values[0],
            'recommendations': recommended_titles
        })

    return np.mean(HR), np.mean(NDCG), rec_movies

# Score the whole test set and keep the per-batch recommendations
HR, NDCG, rec_movies = get_recommendation_score(model, dataloader_test, 10, total_dataframe, device)

print("Hit Rate:", HR)
print("NDCG_score:", NDCG)

# Show only the first entry as an example of the titled recommendations
for rec in rec_movies[:1]:
    print(f"User: {rec['user'][0]}, Recommendations: {rec['recommendations']}")

In [None]:
# Preview the first five rows of the full interaction table
total_dataframe.head(n=5)

## Re-Ranking Recommendations to Get Best Performance

In [None]:
def get_rec_for_single_user(model, user_id, top_k, total_dataframe, test_loader, device):
    """Recommend movies for one user and re-rank them by popularity.

    The first batch whose leading user id equals ``user_id`` is scored, the
    ``top_k`` highest-scoring items are selected, and that selection is then
    re-ordered by popularity, i.e. the number of rows each movie has in
    ``total_dataframe``. Equally popular movies keep the model's order.

    Args:
        model: callable taking ``(user, item)`` tensors and returning one score per pair.
        user_id: id of the user to look up.
        top_k: number of recommendations; clamped to the batch size.
        total_dataframe: frame with ``movieId`` and ``title`` columns.
        test_loader: iterable of ``(user, item, label)`` batches.
        device: device the batch is moved to before scoring.

    Returns:
        Dict with keys ``'user'``, ``'movie_recommendations'`` (re-ranked titles),
        ``'scores'`` (matching popularity counts), ``'ground_truth'`` (one-element
        list holding the batch's first item id) and ``'recommended_items'``
        (one-element list holding the re-ranked item ids), or ``None`` when no
        batch starts with ``user_id``.
    """
    for user, item, _ in test_loader:
        if user[0] != user_id:
            continue

        # Move data to GPU if available
        user = user.to(device)
        item = item.to(device)

        print(f"Given User {user_id} is found in the test dataset.")

        # Inference only: no autograd graph needed
        with torch.no_grad():
            predictions = model(user, item)

        # topk raises if k exceeds the number of scores, so clamp it
        k = min(top_k, predictions.shape[-1])
        _, indices = torch.topk(predictions, k)

        recommends = torch.take(item, indices).cpu().numpy().tolist()

        # Popularity score = number of rows the movie has in the full dataset
        popularity_scores = [total_dataframe[total_dataframe['movieId'] == rec].shape[0] for rec in recommends]

        # Descending by popularity. sorted() is stable, so ties keep the model's
        # ranking; reversing an ascending argsort would flip the order of ties.
        sorted_indices = sorted(range(len(recommends)), key=lambda position: -popularity_scores[position])
        re_ranked_recommendations = [recommends[i] for i in sorted_indices]
        re_ranked_scores = [popularity_scores[i] for i in sorted_indices]

        # Extract titles of the re-ranked movie recommendations
        re_ranked_titles = [total_dataframe[total_dataframe['movieId'] == rec]['title'].values[0] for rec in re_ranked_recommendations]

        return {'user': user.item() if user.numel() == 1 else user.tolist(),
                'movie_recommendations': re_ranked_titles,
                'scores': re_ranked_scores,
                'ground_truth': [item[0].item()],
                'recommended_items': [re_ranked_recommendations]}

    # No batch in the loader starts with the requested user
    return None

        
def compute_reranked_metrics(ground_truth, recommended_items, top_k):
    """Compute mean hit rate and NDCG at ``top_k`` for re-ranked recommendation lists.

    Args:
        ground_truth: sequence of ground-truth item ids, one per user.
        recommended_items: sequence of ranked recommendation lists, aligned with
            ``ground_truth``.
        top_k: only the first ``top_k`` entries of each list are considered.

    Returns:
        Tuple ``(mean_hit_rate, mean_ndcg)``. A hit contributes
        ``1 / log2(position + 2)`` to NDCG, with ``position`` 0-based.
    """
    hr = np.zeros(len(ground_truth))
    ndcg = np.zeros(len(ground_truth))

    for i, gt_item in enumerate(ground_truth):
        # Honour the cutoff: items ranked beyond top_k do not count as hits
        rec_items = list(recommended_items[i])[:top_k]

        if gt_item in rec_items:
            hr[i] = 1
            ndcg[i] = 1 / np.log2(rec_items.index(gt_item) + 2)

    return np.mean(hr), np.mean(ndcg)

# Example usage
user_id_to_infer = 453
recommendations = get_rec_for_single_user(model, user_id_to_infer, top_k=10, total_dataframe=total_dataframe, test_loader=dataloader_test, device=device)

# get_rec_for_single_user yields None when no test batch starts with this user,
# so check before indexing into the result.
if recommendations is None:
    print(f"User {user_id_to_infer} was not found in the test dataset.")
else:
    print(f"User: {recommendations['user']}, Recommendations: {recommendations['movie_recommendations']}, Score: {recommendations['scores']}")

    # Calculate HR and NDCG
    hr_score, ndcg_score = compute_reranked_metrics(recommendations['ground_truth'], recommendations['recommended_items'], top_k=10)
    print("Hit Rate:", hr_score)
    print("NDCG Score:", ndcg_score)


In [None]:
# Example usage
user_id_to_infer = 572
recommendations = get_rec_for_single_user(model, user_id_to_infer, top_k=10, total_dataframe=total_dataframe, test_loader=dataloader_test, device=device)

# get_rec_for_single_user yields None when no test batch starts with this user,
# so check before indexing into the result.
if recommendations is None:
    print(f"User {user_id_to_infer} was not found in the test dataset.")
else:
    print(f"User: {recommendations['user']}, Recommendations: {recommendations['movie_recommendations']}, Score: {recommendations['scores']}")

    # Calculate HR and NDCG
    hr_score, ndcg_score = compute_reranked_metrics(recommendations['ground_truth'], recommendations['recommended_items'], top_k=10)
    print("Hit Rate:", hr_score)
    print("NDCG Score:", ndcg_score)