# Import Libraries

In [34]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

np.random.seed(123)

In [2]:
ratings = pd.read_csv('../Data/rating.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


# Data Preprocessing

## Cleaning

We can see that the dataset has 'timestamp' variable, which is not really a usefull feature when doing recomendations, thus we can drop this column enterly.

After doing this we will be left with userId, movieId, and ratings, these are the only 3 features we will need for our Collaborative Filtering model.

In [4]:
ratings.drop('timestamp', axis=1, inplace=True)

In [5]:
## Let's see if there are any missing values in the dataset
ratings.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64

## Preprocessing for Collaborative Filtering

Since there are no null values in the dataset, we can continue with the dataset without any issues

Now it's time to preprocess the data for the 0-centered cosine, since that is the type of similartiy metric used for Collaborative Filtering 

Being a huge dataset, it will be difficult to preform calculations on this dataset. Thus we will be reducing the size of the datase. So we can sample the dataset to only use 10% of actual data

In [6]:
sampled_ratings = ratings.sample(frac=0.1)

After that, let's filter out the data with users with number ratings less than 50, and items with number of ratings less than 10

In [7]:
sampled_ratings.groupby('userId').filter(lambda x: len(x) >= 50)
sampled_ratings.groupby('movieId').filter(lambda x: len(x) >= 10)

Unnamed: 0,userId,movieId,rating
17269281,119436,71579,5.0
7233670,49921,1196,5.0
1828053,12340,193,2.0
2168558,14684,91630,4.5
18667035,129238,2407,2.5
...,...,...,...
8916196,61627,8784,2.5
12954218,89428,1093,2.5
993793,6719,867,0.5
2412814,16303,1272,4.5


## Create User-Item embeddings

In [13]:
num_users = sampled_ratings.userId.nunique()
num_items = sampled_ratings.movieId.nunique()
embedding_size = 64 ## Embedding size is a hyperparameter, start of small, and increase gradualy

Create a Class to process the data and access it in a tensor

In [24]:
class RatingsDataset(Dataset):
    def __init__(self, df):
        super(RatingsDataset, self).__init__()
        self.users = df.userId.cat.codes.values
        self.items = df.movieId.cat.codes.values
        self.ratings = df.rating
        mean_ratings_by_users = df.groupby('userId').rating.transform(lambda x: x.mean())
        self.ratings -= mean_ratings_by_users
        
    def __len__(self):
        return len(self.ratings)
        
    def get_item(self, idx):
        return (self.users[idx], self.items[idx]), self.ratings[idx]

In [36]:
## Let's use RatingsDataset class to get 0-centered data
sampled_ratings['userId'] = sampled_ratings.userId.astype('category')
sampled_ratings['movieId'] = sampled_ratings.movieId.astype('category')

# Create the ratings dataset and split into training and test sets
dataset = RatingsDataset(sampled_ratings)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create batch size, and DataLoaders
batch_size = 1024
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Create the Collaborative Filtering model

In [None]:
class CFModel(nn.Module):
    def __inti__(self, num_users, num_items, embedding_size):
        super.__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.fc1 = nn.Linear(embedding_size*2, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.25)
        
    def forward(self, user, item):
        ## Create a item and user embedding vector, and then concatinate them to create a user-item embedding
        user_embedd = self.user_embedding(user, 64)
        item_embedd = self.item_embedding(item, 64)
        x = torch.cat([user_embedd, item_embedd], dim=-1)
        ## Pass through the fully connected layers
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        