# Clean code for IMDb

### 1. Convert ratings data

The output format should be `user_id, item_id, rating`.
`user_id` and `item_id` should be consecutive integers
(this is required for the sparse matrix to work).
The `__init__` should hold dictionaries that map each consecutive id back to its real id.




In [4]:
import pandas as pd
import numpy as np

# Upper bound on the number of rating rows fetched from BigQuery.
num_interactions = 100_000

# Only ratings strictly above 3.0 are fetched; the row count is capped by
# `num_interactions`.
query = f"""
SELECT *
FROM EVIC.ratings
WHERE rating > 3.0
LIMIT {num_interactions}
"""

# Options forwarded to pandas' BigQuery reader.
read_options = {"project_id": "spike-sandbox", "use_bqstorage_api": True}
ratings = pd.read_gbq(query, **read_options)

In [5]:
# Distinct original ids, in order of first appearance in `ratings`.
unique_users = ratings["user_id"].unique()
unique_movies = ratings["movie_id"].unique()

# How many distinct users / movies the sample contains.
n_unique_users, n_unique_movies = len(unique_users), len(unique_movies)

# Displayed as the cell result.
n_unique_users, n_unique_movies

(2771, 7017)

In [6]:
# Map every original user id to a consecutive integer id.
# Numbering starts at 1, so id 0 is never assigned.
user_orig_id_to_consecutive_id_dict = {
    orig_id: consecutive_id
    for consecutive_id, orig_id in enumerate(unique_users, start=1)
}

# Same mapping for movies, also 1-based.
movie_orig_id_to_consecutive_id_dict = {
    orig_id: consecutive_id
    for consecutive_id, orig_id in enumerate(unique_movies, start=1)
}

In [7]:
# Translate each row's original user id into its consecutive id.
# `np.int` was removed in NumPy 1.24, so an explicit 64-bit integer dtype is
# used instead. The lookup is done with a vectorised `Series.map` rather than
# a per-row Python loop. Every id is present in the dict because the dict was
# built from this same column, so the mapping produces no missing values.
consecutive_user_id = (
    ratings.user_id
    .map(user_orig_id_to_consecutive_id_dict)
    .to_numpy(dtype=np.int64)
)

# Assign as a plain array so the values are stored positionally.
ratings['consecutive_user_id'] = consecutive_user_id

In [8]:
# Translate each row's original movie id into its consecutive id.
# `np.int` was removed in NumPy 1.24, so an explicit 64-bit integer dtype is
# used instead. The lookup is done with a vectorised `Series.map` rather than
# a per-row Python loop. Every id is present in the dict because the dict was
# built from this same column, so the mapping produces no missing values.
consecutive_movie_id = (
    ratings.movie_id
    .map(movie_orig_id_to_consecutive_id_dict)
    .to_numpy(dtype=np.int64)
)

# Assign as a plain array so the values are stored positionally.
ratings['consecutive_movie_id'] = consecutive_movie_id

In [9]:
# Warm start test set: hold out a random ~10% of the interactions.
test_pct = 0.1

# One uniform draw in [0, 1) per row; a row lands in the test split when its
# draw exceeds 1 - test_pct.
is_test = np.random.random_sample(size=len(ratings)) > (1 - test_pct)
ratings['test'] = is_test.astype(int)

# Every remaining interaction is treated as a positive: the rating is
# overwritten with the constant 1.
ratings['rating'] = 1

In [8]:
# Output: evic.test.rating, evic.train.rating
# Each file holds tab-separated (user, movie, rating) rows with no header and
# no index column. The test split is written first, then the train split.
columns = ['consecutive_user_id', 'consecutive_movie_id', 'rating']
for split_filter, out_path in (
    ("test == 1", "data/evic.test.rating"),
    ("test == 0", "data/evic.train.rating"),
):
    ratings.query(split_filter)[columns].to_csv(
        out_path, index=False, sep='\t', header=False)


## Load dataset!

In [10]:
# PyTorch imports
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
# Seed PyTorch's default random number generator with a fixed value.
torch.manual_seed(0)

# Python imports
import argparse
from time import time
import numpy as np
import pickle

# Workspace imports
from src.evaluate import evaluate_model
from src.Dataset import MovieLensDataset
from src.utils import train_one_epoch, test, plot_statistics

# Pick the compute device: the first CUDA GPU when one is available,
# otherwise the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [11]:
# Default hyper-parameters and paths for the run.
default_args_dict = {
    "path": "data/",
    "dataset": "evic",  # :-)
    "epochs": 18,
    "batch_size": 256,
    "layers": [16, 32, 16, 8],
    "weight_decay": 1e-5,
    # Number of negative instances to pair with a positive instance
    # while training.
    "num_neg_train": 4,
    "num_neg_test": 100,
    "lr": 1e-3,
    "dropout": 0.0,
    "learner": "adam",
    "verbose": 1,
    "out": 1,  # save trained model or not
}

args = default_args_dict

# Data location.
path, dataset = args["path"], args["dataset"]

# Network architecture and regularisation.
layers, dropout = args["layers"], args["dropout"]
weight_decay = args["weight_decay"]

# Negative sampling.
num_negatives_train = args["num_neg_train"]
num_negatives_test = args["num_neg_test"]

# Optimisation schedule and logging.
learner, learning_rate = args["learner"], args["lr"]
batch_size, epochs = args["batch_size"], args["epochs"]
verbose = args["verbose"]

In [12]:
# Cut-off rank passed to `test(...)` when evaluating HR / NDCG.
topK = 10

# Build the dataset from the files under `path + dataset` and time the load.
t1 = time()
full_dataset = MovieLensDataset(
    path + dataset,
    num_negatives_train=num_negatives_train,
    num_negatives_test=num_negatives_test,
)
train = full_dataset.trainMatrix
testRatings = full_dataset.testRatings
testNegatives = full_dataset.testNegatives

# The interaction matrix's shape gives the user and item counts used to size
# the embeddings.
num_users, num_items = train.shape
print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d"
      % (time()-t1, num_users, num_items, train.nnz, len(testRatings)))

# Shuffled mini-batches, loaded in the main process (no worker processes).
training_data_generator = DataLoader(
    full_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)



Load data done [10.9 s]. #user=2772, #item=7018, #train=90059, #test=9941


In [13]:
class MLP(nn.Module):
    """Feed-forward scorer over concatenated user and item embeddings.

    A user embedding and an item embedding are concatenated, passed through a
    stack of ReLU-activated linear layers (with optional dropout) and reduced
    to a single sigmoid-activated score per (user, item) pair.
    """

    def __init__(self, n_users, n_items, layers=None, dropout=False):
        """
        Simple Feedforward network with Embeddings for users and items

        Args:
            n_users: number of rows in the user embedding table.
            n_items: number of rows in the item embedding table.
            layers: layer widths. ``layers[0]`` is the width of the
                concatenated embeddings and must be even, because each
                embedding gets half of it. Defaults to ``[16, 8]``.
            dropout: dropout probability applied after every hidden layer;
                ``False`` / ``0`` disables dropout.
        """
        super().__init__()
        if layers is None:
            # Fresh list per call instead of a shared mutable default.
            layers = [16, 8]
        assert (layers[0] % 2 == 0), "layers[0] must be an even number"
        self.__alias__ = "MLP {}".format(layers)
        # Stored as a float so a boolean default is never passed to
        # F.dropout as a probability.
        self.__dropout__ = float(dropout)

        # user and item embedding layers, each half of the first layer width
        embedding_dim = layers[0] // 2
        self.user_embedding = torch.nn.Embedding(n_users, embedding_dim)
        self.item_embedding = torch.nn.Embedding(n_items, embedding_dim)

        # hidden dense layers: one Linear per consecutive pair of widths
        self.fc_layers = torch.nn.ModuleList([
            torch.nn.Linear(in_size, out_size)
            for in_size, out_size in zip(layers[:-1], layers[1:])
        ])
        # final prediction layer
        self.output_layer = torch.nn.Linear(layers[-1], 1)

    def forward(self, feed_dict):
        """Score each (user, item) pair in the batch.

        Args:
            feed_dict: mapping with integer index tensors under ``'user_id'``
                and ``'item_id'``.

        Returns:
            Tensor of sigmoid scores with one column per row of the batch.
        """
        users = feed_dict['user_id']
        items = feed_dict['item_id']
        # concatenate user and item embeddings to form input
        x = torch.cat(
            [self.user_embedding(users), self.item_embedding(items)], 1)
        for layer in self.fc_layers:
            x = F.relu(layer(x))
            x = F.dropout(x, p=self.__dropout__, training=self.training)
        logit = self.output_layer(x)
        return torch.sigmoid(logit)

    def predict(self, feed_dict):
        """Return scores as a numpy array; inputs are numpy arrays.

        The caller's ``feed_dict`` is left untouched, so the same dict can be
        scored repeatedly. Inputs are moved to the device the model's
        parameters live on, and no autograd graph is built.

        Args:
            feed_dict: mapping of field name to numpy array (or ``None``);
                must contain ``'user_id'`` and ``'item_id'``.

        Returns:
            numpy array of scores, one row per input pair.
        """
        target_device = next(self.parameters()).device
        tensors = {
            key: None if value is None else torch.as_tensor(
                value, dtype=torch.long, device=target_device)
            for key, value in feed_dict.items()
        }
        with torch.no_grad():
            output_scores = self.forward(tensors)
        return output_scores.cpu().detach().numpy()

    def get_alias(self):
        """Return the human-readable model name, e.g. ``"MLP [16, 8]"``."""
        return self.__alias__

In [14]:
# Build the network and move its parameters to the selected device.
model = MLP(num_users, num_items, layers=layers, dropout=dropout)
model.to(device)
if verbose:
    print(model)

# Binary cross-entropy on the sigmoid scores produced by the model.
loss_fn = torch.nn.BCELoss()
# Pass the configured learning rate explicitly; previously `learning_rate`
# was ignored and Adam silently used its built-in default.
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Record performance
hr_list = []
ndcg_list = []
BCE_loss_list = []

MLP(
  (user_embedding): Embedding(2772, 8)
  (item_embedding): Embedding(7018, 8)
  (fc_layers): ModuleList(
    (0): Linear(in_features=16, out_features=32, bias=True)
    (1): Linear(in_features=32, out_features=16, bias=True)
    (2): Linear(in_features=16, out_features=8, bias=True)
  )
  (output_layer): Linear(in_features=8, out_features=1, bias=True)
)


In [15]:
# Dry run of the forward pass and loss over every training batch.
# No backward pass or optimizer step happens in this cell.
model.train()

data_loader = training_data_generator
for feed_dict in data_loader:
    # Move every populated field of the batch to the target device as int64.
    for key, value in feed_dict.items():
        if value is not None:
            feed_dict[key] = value.to(dtype=torch.long, device=device)

    # Model scores for the batch.
    prediction = model(feed_dict)

    # Actual targets, converted to float and reshaped from [batch_size] to
    # [batch_size, 1] so they line up with the predictions.
    rating = feed_dict['rating'].float().view(prediction.size())
    loss = loss_fn(prediction, rating)

In [20]:
# Number of mini-batches the training DataLoader yields in one full pass.
len(data_loader)

1759

In [22]:
# Keep the batches at positions 0 and 3 for inspection. The loop stops as
# soon as the second one is captured instead of walking every remaining batch.
for i, feed_dict in enumerate(data_loader):
    if i == 0:
        feed_dict_1 = feed_dict
    elif i == 3:
        feed_dict_2 = feed_dict
        break
    

In [31]:
# Display the batch captured at position 3 of the DataLoader pass.
feed_dict_2

{'user_id': tensor([1875,  931,  535, 1948, 1957,  555,   13, 2127, 1461, 2510, 2555, 1615,
         1783, 2699,  974,   81, 1695, 1298, 2252, 1782, 1354,  412,  124, 2276,
         2378, 1834, 2000, 1120, 2488, 2349, 2052,  326, 2255,  313, 1101,   36,
         2424, 1433, 1410, 1139,  791, 1370, 1549,  358, 1587, 1399, 1763,  848,
          939, 2284, 2270, 1757, 2626, 1782, 1615, 1116, 1909, 2055, 1689,  822,
          118,   36, 1956,  224, 1228,    3,   34,  961, 2207,   11,  746, 1564,
         1117, 2647,  632,  118,  505, 1625,  371,  498,  174, 2468, 1668,  262,
         1121,  298,  103, 2116, 1292, 1708, 2603,  180, 1961, 2484,  331, 1446,
         2199, 2139, 1767,  974,  537, 2598, 2651, 1349, 2311, 2100, 1121, 1397,
         1121,  706, 1446, 2336, 1247,  326, 1535, 1202, 2046, 2728, 1980,  166,
         1436,  188, 2063, 1640,  969,  886, 2081, 1469, 1533,  627,  678, 2674,
          784, 1117, 2125, 2445, 1370, 1899,   68, 2100, 1555, 1177,  910, 2508,
         2608, 16

In [27]:
# Display the user ids of the batch captured at position 0 of the DataLoader pass.
feed_dict_1['user_id']

tensor([1347, 1043,  467, 2662, 2370, 1779, 2556,   89, 1860,  390,  546, 1926,
        1753,  497,  455,  839, 1926, 1433, 2305, 2702, 2743, 1158,  465,   37,
         859, 1350, 1227, 2105, 1300,  118, 2530,  586, 2088, 1958,  564,   20,
          87,  169, 1344,  984, 2517, 1724, 2448, 2702,  609, 1446, 2532,  594,
          81, 1079, 2620, 1575, 1121, 1199, 1450, 2332,  704, 2547, 1832,  298,
        1121, 2054, 1655, 2428, 1512, 1453, 1733, 2767,  746, 1657, 1093, 2423,
        2635, 1121, 1958, 1494,  238, 1697, 1202,  788, 2100,  567, 1512, 2577,
         905, 2355, 1433,  301, 1499, 2145, 1399, 1202, 2350, 1074, 2085,  801,
         659,    5, 2356, 1629, 1738,  924,  908, 1005, 2248, 2387, 1010,  782,
         400,  459,  124, 2306, 1349, 1202,  974, 1344, 2548, 2448, 2233,  435,
        1287, 2357, 2445, 1886,  848, 1347, 1963, 1963, 1290,  703, 1480, 2355,
         439,  221, 1222,  912,  835, 2037, 1220, 2031, 2325,  129, 2618, 2409,
         155, 2660, 1067, 2660,  729, 13

In [14]:
# Check Init performance
hr, ndcg = test(model, full_dataset, topK)
hr_list.append(hr)
ndcg_list.append(ndcg)
# Placeholder loss for the untrained model, so all three lists stay aligned.
BCE_loss_list.append(1)

# do the epochs now
for epoch in range(epochs):
    epoch_loss = train_one_epoch(model, training_data_generator,
                                 loss_fn, optimizer, epoch, device)

    # `verbose` doubles as the evaluation interval in epochs. A value of 0
    # disables evaluation instead of raising ZeroDivisionError on the modulo.
    if verbose and epoch % verbose == 0:
        hr, ndcg = test(model, full_dataset, topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
        BCE_loss_list.append(epoch_loss)
        # TODO: track the best HR / NDCG epoch and save the model when
        # args["out"] > 0.

print("hr for epochs: ", hr_list)
print("ndcg for epochs: ", ndcg_list)
print("loss for epochs: ", BCE_loss_list)

Eval: HR = 0.1010, NDCG = 0.0470 [8.9 s]
Epoch = 0
Epoch completed 6.2 s
Train Loss: 0.4724575046485903
doing epoch 0
Eval: HR = 0.4915, NDCG = 0.2594 [8.5 s]
Epoch = 1
Epoch completed 6.3 s
Train Loss: 0.373765311114418
doing epoch 1
Eval: HR = 0.5677, NDCG = 0.3115 [8.6 s]
Epoch = 2
Epoch completed 6.2 s
Train Loss: 0.3505717394550528
doing epoch 2
Eval: HR = 0.5753, NDCG = 0.3177 [8.5 s]
Epoch = 3
Epoch completed 6.2 s
Train Loss: 0.3420019286677809
doing epoch 3
Eval: HR = 0.5758, NDCG = 0.3171 [8.6 s]
Epoch = 4
Epoch completed 6.2 s
Train Loss: 0.33797582275763094
doing epoch 4
Eval: HR = 0.5765, NDCG = 0.3175 [8.5 s]
Epoch = 5
Epoch completed 6.1 s
Train Loss: 0.3354666643670637
doing epoch 5
Eval: HR = 0.5775, NDCG = 0.3178 [8.6 s]
Epoch = 6
Epoch completed 6.2 s
Train Loss: 0.33374034579061795
doing epoch 6
Eval: HR = 0.5750, NDCG = 0.3170 [8.5 s]
Epoch = 7
Epoch completed 6.1 s
Train Loss: 0.33213773711185823
doing epoch 7
Eval: HR = 0.5737, NDCG = 0.3168 [8.5 s]
Epoch = 8
Epo

KeyboardInterrupt: 

In [35]:
# Scratch check of the modulo used by the training loop's evaluation-interval test.
3 % verbose

0

In [None]:
# Scratch arithmetic; presumably a check that the kernel is responsive.
2+2

In [27]:
# Scratch arithmetic; presumably a check that the kernel is responsive.
2+2

4