In [1]:
!pip install pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.8.5.post0-py3-none-any.whl (800 kB)
[K     |████████████████████████████████| 800 kB 5.0 MB/s 
Collecting tensorboardX>=2.2
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 113.5 MB/s 
[?25hCollecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[K     |████████████████████████████████| 512 kB 69.8 MB/s 
Collecting lightning-utilities!=0.4.0,>=0.3.0
  Downloading lightning_utilities-0.4.2-py3-none-any.whl (16 kB)
Installing collected packages: torchmetrics, tensorboardX, lightning-utilities, pytorch-lightning
Successfully installed lightning-utilities-0.4.2 pytorch-lightning-1.8.5.post0 tensorboardX-2.5.1 torchmetrics-0.11.0


# New Section

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

np.random.seed(13)

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [4]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [5]:
import pandas as pd

urlfile="https://raw.githubusercontent.com/AALAM98mod100/steam-recommendation-system/main/final_dataset1.csv"
df = pd.read_csv(urlfile)
df.sample(5)

Unnamed: 0,user_id,name,rating,appid
18921,122798021,terraria,1,105600
18871,11403772,terraria,1,105600
20964,39037163,borderlands 2,5,49520
9217,67492611,grand theft auto v,3,271590
10959,94851051,deus ex human revolution,5,238010


In [6]:
df["user_id"]

0        151603712
1         59945701
2         92107940
3        250006052
4         11373749
           ...    
30988     51822361
30989     38317154
30990     36404933
30991     87201181
30992     34901647
Name: user_id, Length: 30993, dtype: int64

In [7]:
user_dict = dict(zip(df["user_id"].unique(), range(len(df["user_id"].unique()))))
game_dict = dict(zip(df["appid"].unique(), range(len(df["appid"].unique()))))
len(user_dict)

7865

In [8]:
len(game_dict)

327

In [9]:
df['user_id'] = df["user_id"].map(lambda idx: user_dict[idx])
df['appid'] = df["appid"].map(lambda idx: game_dict[idx])

In [10]:
df

Unnamed: 0,user_id,name,rating,appid
0,0,the elder scrolls v skyrim,5,0
1,1,the elder scrolls v skyrim,3,0
2,2,the elder scrolls v skyrim,5,0
3,3,the elder scrolls v skyrim,5,0
4,4,the elder scrolls v skyrim,5,0
...,...,...,...,...
30988,518,warhammer 40000 dawn of war soulstorm,5,326
30989,531,warhammer 40000 dawn of war soulstorm,2,326
30990,678,warhammer 40000 dawn of war soulstorm,3,326
30991,611,warhammer 40000 dawn of war soulstorm,5,326


In [11]:
# remove the name column since it will only be used later in the ranking
df = df.drop(columns = 'name')

In [12]:
from sklearn.model_selection import train_test_split
x = df.drop(columns = ['rating']).copy()
y = df['rating']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(27893, 2) (27893,)
(3100, 2) (3100,)


In [13]:
x_train['interaction'] = 1
x_train.sample(5)

Unnamed: 0,user_id,appid,interaction
21689,689,138,1
17270,5342,90,1
7179,5172,12,1
3477,1225,7,1
23054,485,149,1


In [14]:
# Get a list of all game IDs
game_id = df["appid"].unique()
# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(x_train['user_id'], x_train['appid']))

# 4:1 ratio of negative to positive samples
num_negatives = 9

for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):

        negative_item = np.random.choice(game_id) 

        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(game_id)
        users.append(u)
        items.append(negative_item)
        # items not interacted with are negative
        labels.append(0) 

  0%|          | 0/27892 [00:00<?, ?it/s]

In [15]:
len(labels) # 123965
len(users) 
len(items)

278920

In [16]:
class SteamTrainDataset(Dataset):
    """
    Args:
        df (pd.DataFrame): Dataframe containing ratings from steam
        game_id (list): List containing all game ids
    
    """

    def __init__(self, df, game_id):
        self.users, self.items, self.labels = self.get_dataset(df, game_id)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, df, game_id):
        users, items, labels = [], [], []
        user_item_set = set(zip(df['user_id'], df['appid']))

        num_negatives = 9
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(game_id)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(game_id)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        #return torch.tensor(users).to(device), torch.tensor(items).to(device), torch.tensor(labels).to(device)
        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [17]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the steam ratings for training !!!!!!!!!!!!!!!!!!!!!!!!!
            game_id (list): List containing all game ids
    """
    
    def __init__(self, num_users, num_items, df, game_id):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=8)
        self.fc3 = nn.Linear(in_features=8, out_features=16)
        self.fc4 = nn.Linear(in_features=16, out_features=64)
        self.fc5 = nn.Linear(in_features=64, out_features=8)
        self.fc6 = nn.Linear(in_features=8, out_features=16)
        self.fc7 = nn.Linear(in_features=16, out_features=64)
        self.fc8 = nn.Linear(in_features=64, out_features=8)
        self.fc9 = nn.Linear(in_features=8, out_features=16)
        self.fc10 = nn.Linear(in_features=16, out_features=64)
        self.fc11 = nn.Linear(in_features=64, out_features=8)
        self.fc12 = nn.Linear(in_features=8, out_features=16)
        self.fc13 = nn.Linear(in_features=16, out_features=32)
        self.fc14 = nn.Linear(in_features=32, out_features=32)
        self.fc15 = nn.Linear(in_features=32, out_features=8)
        self.fc16 = nn.Linear(in_features=8, out_features=16)
        self.output = nn.Linear(in_features=16, out_features=1)  
        self.df = df
        self.game_id = game_id
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))
        vector = nn.ReLU()(self.fc3(vector))
        vector = nn.ReLU()(self.fc4(vector))
        vector = nn.ReLU()(self.fc5(vector))
        vector = nn.ReLU()(self.fc6(vector))
        vector = nn.ReLU()(self.fc7(vector))
        vector = nn.ReLU()(self.fc8(vector))
        vector = nn.ReLU()(self.fc9(vector))
        vector = nn.ReLU()(self.fc10(vector))
        vector = nn.ReLU()(self.fc11(vector))
        vector = nn.ReLU()(self.fc12(vector))
        vector = nn.ReLU()(self.fc13(vector))
        vector = nn.ReLU()(self.fc14(vector))
        vector = nn.ReLU()(self.fc15(vector))
        vector = nn.ReLU()(self.fc16(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(SteamTrainDataset(self.df, self.game_id),
                          batch_size=1024, num_workers=4)

In [18]:
num_users = len(df['user_id'].unique()) + 2000
num_items = len(df['appid'].unique()) + 300

In [19]:
game_id = df['appid'].unique()
model = NCF(num_users, num_items, x_train, game_id)
model.to(device)

NCF(
  (user_embedding): Embedding(9865, 8)
  (item_embedding): Embedding(627, 8)
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=8, bias=True)
  (fc6): Linear(in_features=8, out_features=16, bias=True)
  (fc7): Linear(in_features=16, out_features=64, bias=True)
  (fc8): Linear(in_features=64, out_features=8, bias=True)
  (fc9): Linear(in_features=8, out_features=16, bias=True)
  (fc10): Linear(in_features=16, out_features=64, bias=True)
  (fc11): Linear(in_features=64, out_features=8, bias=True)
  (fc12): Linear(in_features=8, out_features=16, bias=True)
  (fc13): Linear(in_features=16, out_features=32, bias=True)
  (fc14): Linear(in_features=32, out_features=32, bias=True)
  (fc15): Linear(in_features=32, out_features=8, bias=True)
  (fc16): Linear(in_f

In [20]:
trainer = pl.Trainer(max_epochs=5, accelerator='gpu', devices=1)

trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
   | Name           | Type      | Params
----------------------------------------------
0  | user_embedding | Embedding | 78.9 K
1  | item_embedding | Embedding | 5.0 K 
2  | fc1            | Linear    | 1.1 K 
3  | fc2            | Linear    | 520   
4  | fc3            | Linear    | 144   
5  | fc4            | Linear    | 1.1 K 
6  | fc5            | Linear    | 520   
7  | fc6            | Linear    | 144   
8  | fc7            | Linear    | 1.1 K 
9  | fc8            | Linear    | 520   
10 | fc9            | Linear    | 144   
11 | f

Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [21]:
# User-item pairs for testing
test_user_item_set = set(zip(x_test['user_id'], x_test['appid']))

# Dict of all items that are interacted with by each user
user_interacted_items = df.groupby('user_id')['appid'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(game_id) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/3100 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.51


In [22]:
#user_interacted_items