# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pytorch_lightning as pl

np.random.seed(123)

In [2]:
ratings = pd.read_csv('../Data/rating.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


# Data Preprocessing

## Cleaning

We can see that the dataset has 'timestamp' variable, which is not really a usefull feature when doing recomendations, thus we can drop this column enterly.

After doing this we will be left with userId, movieId, and ratings, these are the only 3 features we will need for our Collaborative Filtering model.

In [4]:
ratings.drop('timestamp', axis=1, inplace=True)

In [5]:
## Let's see if there are any missing values in the dataset
ratings.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64

## Preprocessing for Collaborative Filtering

Since there are no null values in the dataset, we can continue with the dataset without any issues

Now it's time to preprocess the data for the 0-centered cosine, since that is the type of similartiy metric used for Collaborative Filtering 

Being a huge dataset, it will be difficult to preform calculations on this dataset. Thus we will be reducing the size of the datase. So we can sample the dataset to only use 10% of actual data

In [6]:
sampled_ratings = ratings.sample(frac=0.15)

After that, let's filter out the data with users with number ratings less than 50, and items with number of ratings less than 10

In [7]:
sampled_ratings.groupby('userId').filter(lambda x: len(x) >= 50)
sampled_ratings.groupby('movieId').filter(lambda x: len(x) >= 10)

Unnamed: 0,userId,movieId,rating
17269281,119436,71579,5.0
7233670,49921,1196,5.0
1828053,12340,193,2.0
2168558,14684,91630,4.5
18667035,129238,2407,2.5
...,...,...,...
4982976,34268,3578,3.5
19589773,135597,2440,4.5
10864255,75124,19,1.0
7576560,52225,4308,5.0


## Create User-Item embeddings

In [8]:
num_users = sampled_ratings.userId.nunique()
num_items = sampled_ratings.movieId.nunique()
embedding_size = 512 ## Embedding size is a hyperparameter, start of small, and increase gradualy

Create a Class to process the data and access it in a tensor

In [9]:
class RatingsDataset(Dataset):
    def __init__(self, df):
        self.users = df['userId'].cat.codes.values
        self.items = df['movieId'].cat.codes.values
        self.ratings = df['rating'].values.astype(np.float32)
        mean_ratings_by_users = df.groupby('userId').rating.transform(lambda x: x.mean())
        self.ratings -= mean_ratings_by_users.values
        
    def __len__(self):
        return len(self.ratings)
        
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

In [11]:
batch_size = 4000

## Let's use RatingsDataset class to get 0-centered data
sampled_ratings['userId'] = sampled_ratings.userId.astype('category')
sampled_ratings['movieId'] = sampled_ratings.movieId.astype('category')

# Create the ratings dataset and split into training and test sets
dataset = RatingsDataset(sampled_ratings)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [12]:
dataset.__getitem__(100)

(99856, 15837, -0.45588234)

# Create the Collaborative Filtering model

In [18]:
class CFModel(nn.Module):
    def __init__(self, num_users, num_items, embedding_size):
        super(CFModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.fc1 = nn.Linear(embedding_size*2, 64)
        self.fc2 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.25)
        
    def forward(self, user, item):
        ## Create a item and user embedding vector, and then concatinate them to create a user-item embedding
        user_embedd = self.user_embedding(user.long())
        item_embedd = self.item_embedding(item.long())
        x = torch.cat([user_embedd, item_embedd], dim=-1)
        ## Pass through the fully connected layers
        x = self.dropout(x)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.sigmoid(self.fc2(x))
        return x.squeeze()

In [19]:
# Defining hyperparameters
learning_rate = 0.01
epochs = 30

# Initialize Model
model = CFModel(num_users, num_items, embedding_size)

# Define loss function
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
for epoch in range(epochs):
    running_loss = 0
    for i, data in enumerate(train_loader):
        user, item, rating = data
        output = model(user, item)
        optimizer.zero_grad()
        loss = criterion(output, rating)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    traning_loss = running_loss/len(train_loader)
    print('Epoch %d of %d, Traning loss: %.4f' % (epoch+1, epochs, traning_loss))

Epoch 1 of 30, Traning loss: 0.8323
Epoch 2 of 30, Traning loss: 0.8103
Epoch 3 of 30, Traning loss: 0.7970
Epoch 4 of 30, Traning loss: 0.7866
Epoch 5 of 30, Traning loss: 0.7772
Epoch 6 of 30, Traning loss: 0.7699
Epoch 7 of 30, Traning loss: 0.7630
Epoch 8 of 30, Traning loss: 0.7575
Epoch 9 of 30, Traning loss: 0.7541
Epoch 10 of 30, Traning loss: 0.7498
Epoch 11 of 30, Traning loss: 0.7462
Epoch 12 of 30, Traning loss: 0.7432
Epoch 13 of 30, Traning loss: 0.7421
Epoch 14 of 30, Traning loss: 0.7391
Epoch 15 of 30, Traning loss: 0.7378
Epoch 16 of 30, Traning loss: 0.7363
Epoch 17 of 30, Traning loss: 0.7343
Epoch 18 of 30, Traning loss: 0.7331
Epoch 19 of 30, Traning loss: 0.7320
Epoch 20 of 30, Traning loss: 0.7304
Epoch 21 of 30, Traning loss: 0.7298
Epoch 22 of 30, Traning loss: 0.7283
Epoch 23 of 30, Traning loss: 0.7270
Epoch 24 of 30, Traning loss: 0.7265
Epoch 25 of 30, Traning loss: 0.7255
Epoch 26 of 30, Traning loss: 0.7251
Epoch 27 of 30, Traning loss: 0.7341
Epoch 28 o

In [21]:
test_loss = 0
with torch.no_grad():
    for user, item, rating in test_loader:
        output = model(user, item) 
        loss = criterion(output, rating)
        test_loss += loss.item()
print('Test Loss: %.4f' % test_loss)

Test Loss: 129.1781
