# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, sampler
from sklearn.metrics import accuracy_score

np.random.seed(123)

In [2]:
ratings = pd.read_csv('../Data/rating.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


# Data Preprocessing

## Cleaning

We can see that the dataset has 'timestamp' variable, which is not really a usefull feature when doing recomendations, thus we can drop this column enterly.

After doing this we will be left with userId, movieId, and ratings, these are the only 3 features we will need for our Collaborative Filtering model.

In [4]:
ratings.drop('timestamp', axis=1, inplace=True)

In [5]:
## Let's see if there are any missing values in the dataset
ratings.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64

## Preprocessing for Collaborative Filtering

Since there are no null values in the dataset, we can continue with the dataset without any issues

Now it's time to preprocess the data for the 0-centered cosine, since that is the type of similartiy metric used for Collaborative Filtering 

Being a huge dataset, it will be difficult to preform calculations on this dataset. Thus we will be reducing the size of the datase. So we can sample the dataset to only use 10% of actual data

In [6]:
sampled_ratings = ratings.sample(frac=0.15)

After that, let's filter out the data with users with number ratings less than 50, and items with number of ratings less than 10

In [7]:
sampled_ratings.groupby('userId').filter(lambda x: len(x) >= 50)
sampled_ratings.groupby('movieId').filter(lambda x: len(x) >= 10)

Unnamed: 0,userId,movieId,rating
17269281,119436,71579,5.0
7233670,49921,1196,5.0
1828053,12340,193,2.0
2168558,14684,91630,4.5
18667035,129238,2407,2.5
...,...,...,...
4982976,34268,3578,3.5
19589773,135597,2440,4.5
10864255,75124,19,1.0
7576560,52225,4308,5.0


## Create User-Item embeddings

Create a Class to process the data and access it in a tensor

In [8]:
class RatingsDataset(Dataset):
    def __init__(self, df):
        self.users = df['userId'].cat.codes.values
        self.items = df['movieId'].cat.codes.values
        self.ratings = df['rating'].values.astype(np.float32)
        mean_ratings_by_users = df.groupby('userId').rating.transform(lambda x: x.mean())
        self.ratings -= mean_ratings_by_users.values
        
    def __len__(self):
        return len(self.ratings)
        
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

In [9]:
num_users = sampled_ratings.userId.nunique()
num_items = sampled_ratings.movieId.nunique()
embedding_size = 512 ## Embedding size is a hyperparameter, start of small, and increase gradualy

## Set up hyperparameters for making dataloaders
valid_size = 0.2
batch_size = 2000
num_workers = 0

In [10]:
## Let's use RatingsDataset class to get 0-centered data
sampled_ratings['userId'] = sampled_ratings.userId.astype('category')
sampled_ratings['movieId'] = sampled_ratings.movieId.astype('category')

## Create the ratings dataset and split into training and test sets
dataset = RatingsDataset(sampled_ratings)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [11]:
## Create validation dataset
indices = list(range(train_size))
np.random.shuffle(indices)
split = int(np.floor(valid_size*train_size))
train_idx, valid_idx = indices[split:], indices[:split]

In [12]:
## Create Random Samples for traning and validation datasets
train_sampler = sampler.SubsetRandomSampler(train_idx)
valid_sampler = sampler.SubsetRandomSampler(valid_idx)

In [13]:
## Create DataLoaders
train_loader = DataLoader(train_dataset, sampler=train_sampler,
                          num_workers=num_workers, batch_size=batch_size)
valid_loader = DataLoader(train_dataset, sampler=valid_sampler, 
                          num_workers=num_workers, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)

### Visualize the batches of datasets

In [14]:
dataiter = iter(train_loader)
user, item, rating = next(dataiter)

In [15]:
for idx in range(10):
    print(f"User {user[idx]}, movie {item[idx]}, rating {rating[idx]}")

User 48786, movie 2385, rating 0.0625
User 92981, movie 3467, rating 0.837837815284729
User 55882, movie 43, rating -1.5192307233810425
User 132490, movie 1604, rating -1.75
User 86324, movie 315, rating 0.75
User 21075, movie 504, rating 0.4699999988079071
User 99416, movie 2699, rating 0.2222222238779068
User 9544, movie 257, rating -1.5
User 27331, movie 2008, rating -0.1428571492433548
User 104192, movie 1382, rating 1.5921788215637207


# Create the Collaborative Filtering model

In [20]:
class CFModel(nn.Module):
    def __init__(self, num_users, num_items, embedding_size):
        super(CFModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.fc1 = nn.Linear(embedding_size*2, 256)
        self.fc2 = nn.Linear(256, 64)
        self.out = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.25)
        
    def forward(self, user, item):
        ## Create a item and user embedding vector, and then concatinate them to create a user-item embedding
        user_embedd = self.user_embedding(user.long())
        item_embedd = self.item_embedding(item.long())
        x = torch.cat([user_embedd, item_embedd], dim=-1)
        ## Pass through the fully connected layers
        x = self.dropout(x)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
#         x = self.dropout(x)
#         x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.sigmoid(self.out(x))
        return x.squeeze()

### Model Traning

In [21]:
device = torch.device("mps")
device

device(type='mps')

In [22]:
# Defining hyperparameters
learning_rate = 0.001
epochs = 30
valid_loss_min = np.Inf

# Initialize Model
model = CFModel(num_users, num_items, embedding_size)
model.to(device)

# Define loss function
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
for epoch in range(1, epochs+1):
    train_loss = 0.0
    valid_loss = 0.0
    for data in train_loader:
        user, item, rating = data
        output = model(user.to(device), item.to(device))
        optimizer.zero_grad()
        loss = criterion(output, rating.to(device))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    model.eval()
    for data in valid_loader:
        user, item, rating = data
        output = model(user.to(device), item.to(device))
        loss = criterion(output.to(device), rating.to(device))
        valid_loss += loss.item()
        
    train_loss = train_loss/len(train_loader)
    valid_loss = valid_loss/len(valid_loader)
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))
    
    if valid_loss < valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, valid_loss))
        torch.save(model.state_dict(), "model.cf.pt")
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.834749 	Validation Loss: 0.825189
Validation loss decreased (inf --> 0.825189).  Saving model ...


### Model Testing

In [None]:
## Load the saved model
model.load_state_dict(torch.load("model.cf.pt"))

In [None]:
test_loss = 0
acc = []
model.eval()
with torch.no_grad():
    for user, item, rating in test_loader:
        output = model(user, item) 
        loss = criterion(output, rating)
#         acc.append(accuracy_score(output, rating))
        test_loss += loss.item()
print('Test Loss: %.4f \tAccuracy: %.4f' % ((test_loss/len(test_loader)), (np.mean(acc))))