In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
np.random.seed(123)

ModuleNotFoundError: ignored

In [None]:
pip install pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Importing Dataset

In [None]:
# Load the Book-Crossing ratings file (semicolon-separated, Latin-1 encoded).
# - The encoding literal previously contained a stray double quote ('latin-1"').
# - `error_bad_lines=False` is deprecated; `on_bad_lines='skip'` is the supported
#   way to drop malformed rows instead of raising.
ratings = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/BX-Book-Ratings.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Spot-check five randomly chosen rows of the loaded ratings table.
ratings.sample(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
229535,52917,1558534202,6
817972,198711,394426290,0
101577,23768,375412530,0
1123282,269566,446605484,0
339655,81050,1586480375,7


# Label Encoding ISBN

In [None]:
# Replace the raw ISBN strings with integer codes so they can be used as
# indices into the item embedding table later in the notebook.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# NOTE: this overwrites the 'ISBN' column in place; `le` keeps the mapping
# back to the original ISBN values (via `le.inverse_transform`).
ratings['ISBN']=le.fit_transform(ratings['ISBN'])

# Spot-check five random rows after encoding.
ratings.sample(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
1117414,268110,43848,0
654121,158295,219487,3
1073456,256575,108354,9
826687,199827,301778,6
364938,87712,135743,0


## Sampling 70% of the unique users

In [None]:
# Keep a random 70% of the distinct users (sampled without replacement) to
# reduce the amount of data the rest of the notebook has to process.
unique_user_ids = ratings['User-ID'].unique()
rand_userIds = np.random.choice(unique_user_ids,
                                size=int(len(unique_user_ids)*0.7),
                                replace=False)

# `.copy()` detaches the filtered frame from the original so that adding
# columns to it later does not write to a slice (SettingWithCopyWarning).
ratings = ratings.loc[ratings['User-ID'].isin(rand_userIds)].copy()

print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

There are 785247 rows of data from 73698 users


In [None]:
# Spot-check five random rows of the user-filtered table.
# NOTE(review): the saved output of this cell already shows a `rank_latest`
# column, which is only created in a later cell -- it looks like the cell was
# last executed out of order. Re-run the notebook top to bottom to refresh it.
ratings.sample(5)

Unnamed: 0,User-ID,ISBN,Book-Rating,rank_latest
49392,11676,95350,10,9539.0
1115283,267511,139507,0,17.0
1122819,269566,73882,0,1408.0
926730,225293,197302,4,2.0
1051936,251422,91368,0,452.0


# Splitting Train & Test

In [None]:
# Rank each user's rows by their (encoded) ISBN value, largest first; ties are
# broken by row order (method='first'). The row with rank 1 per user becomes
# that user's held-out test interaction in the split below.
# NOTE(review): despite the column name, this is not a recency rank -- the
# ordering key is the encoded ISBN, and no timestamp column is visible in the
# displayed data. Confirm this is the intended leave-one-out criterion.
ratings['rank_latest'] = ratings.groupby(['User-ID'])['ISBN'] \
                                .rank(method='first', ascending=False)


In [None]:
# Inspect the per-user rank column that drives the train/test split.
ratings['rank_latest']

1          1.0
2          1.0
3          2.0
4          1.0
6          1.0
          ... 
1149773    3.0
1149774    2.0
1149775    1.0
1149777    1.0
1149778    1.0
Name: rank_latest, Length: 785247, dtype: float64

In [None]:
# Leave-one-out split: the rank-1 row of every user is held out for testing,
# every other row is used for training.
is_holdout_row = ratings['rank_latest'] == 1
train_ratings = ratings[~is_holdout_row]
test_ratings = ratings[is_holdout_row]

In [None]:
# Drop the helper rank column; only the interaction columns are needed from here on.
# `.copy()` makes both frames independent of `ratings`, so the in-place label
# assignment on `train_ratings` further down does not write to a slice.
interaction_columns = ['User-ID', 'ISBN', 'Book-Rating']
train_ratings = train_ratings[interaction_columns].copy()
test_ratings = test_ratings[interaction_columns].copy()

In [None]:
# Show the training rows while 'Book-Rating' still holds the original ratings.
train_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
3,276729,127253,3
10,276746,94061,0
11,276746,110080,0
12,276746,138308,0
13,276746,138447,0
...,...,...,...
1149770,276704,166238,0
1149771,276704,180973,7
1149772,276704,202229,0
1149773,276704,204799,5


In [None]:
# Treat every training row as a positive interaction: the rating value itself
# is discarded and replaced by the label 1 (implicit feedback).
train_ratings.loc[:,'Book-Rating'] = 1

In [None]:
# Show the training rows again to confirm every 'Book-Rating' is now 1.
train_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
3,276729,127253,1
10,276746,94061,1
11,276746,110080,1
12,276746,138308,1
13,276746,138447,1
...,...,...,...
1149770,276704,166238,1
1149771,276704,180973,1
1149772,276704,202229,1
1149773,276704,204799,1


Inserting Negative Samples

In [None]:
# Pool of candidate item ids to draw negatives from (all encoded ISBNs).
# The variable name is kept as-is because the evaluation cell refers to it.
all_restaurentIds = ratings['ISBN'].unique()

# Flat, parallel lists that hold the training triples (user, item, label)
users, items, labels = [], [], []

# Every (user, item) pair observed in the training data
user_item_set = set(zip(train_ratings['User-ID'], train_ratings['ISBN']))

# Number of negatives generated per positive (4:1 ratio)
num_negatives = 4

for (u, i) in tqdm(user_item_set):
    # Draw random items until enough unseen ones were collected for this
    # user; any draw the user has already interacted with is discarded.
    sampled_negatives = []
    while len(sampled_negatives) < num_negatives:
        negative_item = np.random.choice(all_restaurentIds)
        if (u, negative_item) not in user_item_set:
            sampled_negatives.append(negative_item)

    # One positive (label 1) followed by its negatives (label 0)
    users.extend([u] * (num_negatives + 1))
    items.extend([i] + sampled_negatives)
    labels.extend([1] + [0] * num_negatives)

  0%|          | 0/711549 [00:00<?, ?it/s]

# Dataset

In [None]:
class BookTrainDataset(Dataset):
    """PyTorch training dataset of (user, book, label) triples with negative sampling.

    Every distinct (User-ID, ISBN) pair in ``ratings`` becomes a positive
    example (label 1). For each positive, ``num_negatives`` books the user has
    not interacted with are drawn at random from ``all_bookIds`` and added as
    negative examples (label 0).

    Args:
        ratings (pd.DataFrame): Dataframe with 'User-ID' and 'ISBN' columns
            holding the observed interactions
        all_bookIds (list): Array-like of every book id negatives may be drawn from
        num_negatives (int): Negatives generated per positive. Defaults to 4.

    Raises:
        ValueError: If negatives are requested but some user has already
            interacted with every id in ``all_bookIds`` (sampling could never
            terminate for that user).
    """

    def __init__(self, ratings, all_bookIds, num_negatives=4):
        self.users, self.items, self.labels = self.get_dataset(
            ratings, all_bookIds, num_negatives)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    @staticmethod
    def _check_negatives_available(user_item_set, all_bookIds):
        """Raise ValueError if any user has no un-interacted book left to sample."""
        candidate_ids = set(all_bookIds)
        seen_by_user = {}
        for u, i in user_item_set:
            seen_by_user.setdefault(u, set()).add(i)
        for u, seen in seen_by_user.items():
            # The cheap size comparison filters out almost every user before
            # the more expensive subset test runs.
            if len(seen) >= len(candidate_ids) and candidate_ids <= seen:
                raise ValueError(
                    "user {} has interacted with every candidate item; "
                    "cannot sample negatives".format(u))

    def get_dataset(self, ratings, all_bookIds, num_negatives=4):
        """Build the user, item and label tensors.

        Returns:
            tuple: three 1-D int64 tensors (users, items, labels) of equal length,
            ``(num_negatives + 1)`` entries per distinct positive pair.
        """
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['User-ID'], ratings['ISBN']))

        if num_negatives > 0:
            self._check_negatives_available(user_item_set, all_bookIds)

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                # Rejection sampling: redraw until the book is unseen by this user.
                negative_item = np.random.choice(all_bookIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_bookIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        # Explicit int64 keeps the ids usable as embedding indices even when
        # the lists are empty (an empty list would otherwise become float32).
        return (torch.tensor(users, dtype=torch.long),
                torch.tensor(items, dtype=torch.long),
                torch.tensor(labels, dtype=torch.long))

NCF model with PyTorch Lightning (provided)

In [None]:
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering model (baseline configuration).

    User and item ids are embedded into 8-dimensional vectors, concatenated
    and passed through a 16 -> 64 -> 32 -> 1 MLP with ReLU hidden activations
    and a sigmoid output.

        Args:
            num_users (int): Number of rows of the user embedding table
            num_items (int): Number of rows of the item embedding table
            ratings (pd.DataFrame): Dataframe containing the book ratings for training
            all_bookIds (list): List containing all bookIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_bookIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_bookIds = all_bookIds

    def forward(self, user_input, item_input):
        """Return the sigmoid interaction score for each (user, item) pair."""
        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)

        # Join both embeddings along the feature axis, then run the MLP
        hidden = torch.cat([user_vec, item_vec], dim=-1)
        hidden = torch.relu(self.fc1(hidden))
        hidden = torch.relu(self.fc2(hidden))

        return torch.sigmoid(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy between predicted scores and the 0/1 labels."""
        user_input, item_input, labels = batch
        targets = labels.view(-1, 1).float()
        scores = self(user_input, item_input)
        return nn.functional.binary_cross_entropy(scores, targets)

    def configure_optimizers(self):
        """Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new BookTrainDataset on each call and wrap it in a DataLoader."""
        train_dataset = BookTrainDataset(self.ratings, self.all_bookIds)
        return DataLoader(train_dataset, batch_size=512, num_workers=4)

NCF model with PyTorch Lightning (custom variants)

In [None]:
class NCF1(pl.LightningModule):
    """Neural Collaborative Filtering variant with sigmoid hidden layers.

    8-dimensional user and item embeddings are concatenated and fed through a
    16 -> 128 -> 64 -> 1 MLP; both hidden layers and the output use a sigmoid.

        Args:
            num_users (int): Number of rows of the user embedding table
            num_items (int): Number of rows of the item embedding table
            ratings (pd.DataFrame): Dataframe containing the book ratings for training
            all_bookIds (list): List containing all bookIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_bookIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.output = nn.Linear(in_features=64, out_features=1)
        self.ratings = ratings
        self.all_bookIds = all_bookIds

    def forward(self, user_input, item_input):
        """Score each (user, item) pair with a value in (0, 1)."""
        features = torch.cat(
            [self.user_embedding(user_input), self.item_embedding(item_input)],
            dim=-1,
        )

        # Two sigmoid-activated dense layers followed by the sigmoid output unit
        for dense in (self.fc1, self.fc2, self.output):
            features = torch.sigmoid(dense(features))

        return features

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy loss for one batch of (user, item, label)."""
        user_input, item_input, labels = batch
        criterion = nn.BCELoss()
        return criterion(self(user_input, item_input), labels.view(-1, 1).float())

    def configure_optimizers(self):
        """Adam with an explicit learning rate of 0.001."""
        return torch.optim.Adam(self.parameters(),lr=0.001)

    def train_dataloader(self):
        """Build a new BookTrainDataset on each call and wrap it in a DataLoader."""
        train_dataset = BookTrainDataset(self.ratings, self.all_bookIds)
        return DataLoader(train_dataset, batch_size=256, num_workers=8)

In [None]:
class NCF2(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF) with 32-dimensional embeddings

        The user and item embeddings (32 features each) are concatenated into
        a 64-feature vector and passed through a 64 -> 256 -> 128 -> 1 MLP with
        ReLU hidden activations and a sigmoid output, trained with binary
        cross-entropy.

        Args:
            num_users (int): Number of rows of the user embedding table
            num_items (int): Number of rows of the item embedding table
            ratings (pd.DataFrame): Dataframe containing the book ratings for training
            all_bookIds (list): List containing all bookIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_bookIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=32)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=32)
        # fc1 consumes the concatenated user + item embedding: 32 + 32 = 64 features
        # (it was previously declared with in_features=32).
        self.fc1 = nn.Linear(in_features=64, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)

        # A single output unit: BCELoss in training_step compares the prediction
        # against labels reshaped to (batch, 1), so the shapes must match.
        self.output = nn.Linear(in_features=128, out_features=1)
        self.ratings = ratings
        self.all_bookIds = all_bookIds

    def forward(self, user_input, item_input):
        """Return one sigmoid score per (user, item) pair, shape (..., 1)."""

        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers along the feature axis. Concatenating
        # on dim=-2 stacked users and items along the batch axis instead, which
        # doubled the number of rows and mixed the two kinds of vectors.
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))

        # Output layer: sigmoid yields an independent probability per pair,
        # which is what binary cross-entropy expects.
        pred = torch.sigmoid(self.output(vector))

        return pred

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy between predictions and the 0/1 labels."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        """Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new BookTrainDataset on each call and wrap it in a DataLoader."""
        return DataLoader(BookTrainDataset(self.ratings, self.all_bookIds),
                          batch_size=256, num_workers=8)

In [None]:
class NCF3(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF) with 32-dimensional embeddings

        32-feature user and item embeddings are concatenated and passed through
        a 64 -> 256 -> 128 -> 1 MLP with ReLU hidden activations and a sigmoid
        output, trained with binary cross-entropy in batches of 128.

        Args:
            num_users (int): Number of rows of the user embedding table
            num_items (int): Number of rows of the item embedding table
            ratings (pd.DataFrame): Dataframe containing the book ratings for training
            all_bookIds (list): List containing all bookIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_bookIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=32)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=32)
        self.fc1 = nn.Linear(in_features=64, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)

        self.output = nn.Linear(in_features=128, out_features=1)
        self.ratings = ratings
        self.all_bookIds = all_bookIds

    def forward(self, user_input, item_input):
        """Return one sigmoid score per (user, item) pair, shape (..., 1)."""

        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))

        # Output layer: sigmoid, not softmax. Softmax normalises across the
        # output units, and with a single unit it returns exactly 1.0 for every
        # input, so the model could neither learn nor rank items.
        pred = torch.sigmoid(self.output(vector))

        return pred

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy between predictions and the 0/1 labels."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        """Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new BookTrainDataset on each call and wrap it in a DataLoader."""
        return DataLoader(BookTrainDataset(self.ratings, self.all_bookIds),
                          batch_size=128, num_workers=8)

In [None]:
# The embedding tables are indexed directly by User-ID and by encoded ISBN,
# so each needs (largest id + 1) rows.
num_users = int(ratings['User-ID'].max()) + 1
num_items = int(ratings['ISBN'].max()) + 1

# Negatives may be drawn from every known book (train + test).
all_bookIds = ratings['ISBN'].unique()

# Train on `train_ratings` only. Passing the full `ratings` frame would feed
# each user's held-out test interaction to the model as a training positive,
# which leaks the test set into training and inflates the hit ratio.
model = NCF1(num_users, num_items, train_ratings, all_bookIds)

# Training Model

In [None]:
# Train for 5 epochs on a single GPU.
# - `accelerator`/`devices` replace the deprecated `gpus=1` argument.
# - `reload_dataloaders_every_n_epochs` expects an integer number of epochs;
#   1 rebuilds the dataloader (and so re-samples the negatives) every epoch.
trainer = pl.Trainer(max_epochs=5, accelerator='gpu', devices=1,
                     logger=False, reload_dataloaders_every_n_epochs=1,
                     enable_progress_bar=True, enable_checkpointing=True)

trainer.fit(model)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 2.2 M 
1 | item_embedding | Embedding | 2.7 M 
2 | fc1            | Linear    | 2.2 K 
3 | fc2            | Linear    | 8.3 K 
4 | output         | Linear    | 65    
---------------------------------------------
5.0 M     Trainable params
0         Non-trainable param

Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


# Checking Hit score

In [None]:
# Hit Ratio @ 10: for every held-out (user, item) pair, mix the held-out item
# with 99 randomly drawn items the user never interacted with, score all 100
# with the model, and count a hit when the held-out item is among the 10
# highest-scored candidates.

# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['User-ID'], test_ratings['ISBN']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('User-ID')['ISBN'].apply(list).to_dict()

# Candidate pool, derived here so this cell does not depend on a variable
# created by the negative-sampling cell. Built once: it is loop-invariant.
all_item_ids = set(ratings['ISBN'].unique())

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()

hits = []
with torch.no_grad():
    for (u, i) in tqdm(test_user_item_set):
        not_interacted_items = all_item_ids - set(user_interacted_items[u])
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
        test_items = selected_not_interacted + [i]

        # Create the inputs on the model's device and bring the scores back to
        # the CPU, so the cell works wherever the trained model currently lives.
        user_tensor = torch.tensor([u] * len(test_items), device=model.device)
        item_tensor = torch.tensor(test_items, device=model.device)
        predicted_labels = np.squeeze(model(user_tensor, item_tensor).cpu().numpy())

        # Indices of the 10 highest scores, mapped back to item ids
        top10_positions = np.argsort(predicted_labels)[::-1][0:10].tolist()
        top10_items = [test_items[pos] for pos in top10_positions]

        hits.append(1 if i in top10_items else 0)

print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

NameError: ignored