In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from google.colab import drive
import os

In [None]:
# Read the dataset CSV directly from its Google Drive download URL into `data`
# (this cell needs network access every time it runs).
data = pd.read_csv("https://drive.google.com/uc?id=1o0adqJ7RMP1M6Yr0XlgHmUgTMJtFoUZ_")

In [None]:
# Rename the raw columns to the names the rest of the notebook uses.
data = data.rename(columns={
    'users': 'User_ID',
    'items': 'Food_ID',
    'ratings': 'Rating',
})
# Per-user rating counts can be inspected with data['User_ID'].value_counts().
data

5 A2Q3KZR7ZC1Q4T
8 A1C4X193GBJO6G
5 A1Q8F3F1EOBP8S
6 A1FT4X1UQJMS63
6 A9A92N6K4KF1I
6 A29R72Q5YA21Q6
6 A1WK9IG2EGBIVW
5 A1ZYA1GMFD04IX
5 A1AQ556FNSR6HJ
8 A1NV6GA1B5B4J8
7 A1AJOPLDYWFVVP
5 A39AQ83ZGIG3P7
5 A26N6ZLBVTZD1G
6 A1HT4XALGS65QM
6 A2VDSIII404UTO
9 APEDWC7RSTQ9K
8 A9VCKMMPAVAPE
5 A2FR6EDJGP2YSW
5 A1N1VXHVHA9RNC
18 A24DD13939UP52
7 AIAU1UOFGCV1M
6 ASXL2YF7KKC1U
5 A3OA3J34GYT3U9
7 AGTJ7NWJ1DLGC
6 ADFH2F8LPVXKQ
11 A158B3P7SYRQCX
8 A10KHX41ONY4U1
6 ALF06F901TFZJ
7 A96R3VO408QRF
10 A2MXSNFTTCZU4Z
5 A1POHHS62AWZCC
13 A2LNW7GRQ205KS
11 A3APW42N5MRVWT
8 A38XAAS5U05GY8
6 A272O9I5JS1T2Q
5 A132RY3ORH0HD2
8 A2EAX3QSDP8PKU
10 A101PLUE57KUSS
5 A2GN8KZ2JVVFTU
12 A2SZ4Q10WWZZO7
7 A107PT5KETBET7
5 A12GEZC5KTDEJ2
6 A1ZZYVEZOZKJER
6 ACOI2LBXVDS9P
5 A3LHYNHB9B2XHB
5 A1H4CM2169AVJ8
6 A1MVDL4U8E0SVE
7 AGDXZ4OA0YFLM
10 A27SLBKWLTG320
6 ARFPGL7OEATYH
9 AU1OMJ50X7E2S
7 AWN2NEDTVD2PC
6 AG82OGDW8MP6E
24 A1IWR4YH4ZA9BM
9 ARADEBLBT3P17
9 A1A6ADR8JG12KQ
5 A1TJNVXEHCANNT
6 A36VUEF14T5PDW
6 A1IZNYNXFN68R
6 A25

KeyboardInterrupt: 

In [None]:
# Seed NumPy's global RNG so the random split below is repeatable.
np.random.seed(1)
# Each row goes to `train` with probability 0.6; the remaining rows form `val`.
msk1 = np.random.rand(data.shape[0]) < 0.6
train = data.loc[msk1].copy()
val = data.loc[~msk1].copy()
# The held-out rows are split into validation/test later, after encoding.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
def proc_col(col, train_col=None):
    """Map the values of a pandas column onto contiguous integer codes.

    The vocabulary is the unique values of ``train_col`` when it is given,
    otherwise the unique values of ``col`` itself, numbered in the order
    ``unique()`` returns them.

    Returns a tuple ``(value -> code dict, array of codes for col,
    vocabulary size)``. Values of ``col`` that are not in the vocabulary
    are coded as -1.
    """
    vocab_source = col if train_col is None else train_col
    vocab = vocab_source.unique()
    lookup = dict(zip(vocab, range(len(vocab))))
    codes = np.array([lookup.get(value, -1) for value in col])
    return lookup, codes, len(vocab)

In [None]:
def encode_data(df, train=None):
    """Return a copy of ``df`` with "User_ID" and "Food_ID" integer-encoded.

    Each of the two columns is replaced by the codes produced by
    ``proc_col``. When ``train`` is given, its columns supply the vocabulary,
    so ``df`` is encoded with the same mapping as ``train``; rows whose id
    is not in that vocabulary (code -1) are dropped.
    """
    encoded = df.copy()
    for key in ("User_ID", "Food_ID"):
        reference = train[key] if train is not None else None
        codes = proc_col(encoded[key], reference)[1]
        encoded[key] = codes
        # Unknown ids were coded as -1; keep only rows with a valid code.
        encoded = encoded[encoded[key] >= 0]
    return encoded

In [None]:
class MF(nn.Module):
    """Matrix-factorisation model: score = dot(user embedding, item embedding)."""

    def __init__(self, num_users, num_items, emb_size=100):
        """Create the two embedding tables and draw their weights from U(0, 0.05)."""
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        for table in (self.user_emb, self.item_emb):
            nn.init.uniform_(table.weight, 0, 0.05)

    def forward(self, u, v):
        """Return, for each (u, v) index pair, the dot product of their embeddings."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        return (user_vecs * item_vecs).sum(dim=1)

In [None]:
class MF_bias(nn.Module):
    """Matrix factorisation with additive per-user and per-item bias terms."""

    def __init__(self, num_users, num_items, emb_size=100):
        """Create embedding and bias tables.

        Embedding weights are drawn from U(0, 0.05) and bias weights from
        U(-0.01, 0.01).
        """
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        for table in (self.user_emb, self.item_emb):
            nn.init.uniform_(table.weight, 0, 0.05)
        for table in (self.user_bias, self.item_bias):
            nn.init.uniform_(table.weight, -0.01, 0.01)

    def forward(self, u, v):
        """Return embedding dot product plus user bias plus item bias per pair."""
        interaction = (self.user_emb(u) * self.item_emb(v)).sum(dim=1)
        # The bias tables have a trailing dimension of size 1; squeeze removes it.
        user_offset = self.user_bias(u).squeeze()
        item_offset = self.item_bias(v).squeeze()
        return interaction + user_offset + item_offset

In [None]:
# Encode train with its own vocabulary, then the held-out rows with train's
# vocabulary (encode_data drops held-out rows whose ids are not in train).
df_train = encode_data(train)
df_val = encode_data(val, train)
# Split the encoded held-out rows at random, about half each, into test and validation.
msk = np.random.rand(df_val.shape[0]) < 0.5
df_test = df_val.loc[msk].copy()
df_val = df_val.loc[~msk].copy()

In [None]:
# Number of distinct encoded ids in the training frame; used to size the embedding tables.
num_users = df_train['User_ID'].unique().size
num_items = df_train['Food_ID'].unique().size
print(num_users, num_items)

5106 1681


In [None]:
# Plain MF model with 5-dimensional embeddings.
# NOTE(review): `model` is reassigned in a later cell before any training call.
model = MF(num_users=num_users, num_items=num_items, emb_size=5)

In [None]:
def val_loss(model, unsqueeze=False):
    """Print the mean absolute error of ``model`` on the global ``df_val``.

    Args:
        model: module called as ``model(users, items)`` with two LongTensors
            built from the ``User_ID`` and ``Food_ID`` columns.
        unsqueeze: when True, the rating targets get a trailing dimension of
            size 1 (``ratings.unsqueeze(1)``) before the loss is computed.

    The forward pass runs in eval mode and under ``torch.no_grad()`` so no
    autograd graph is built for a pure evaluation. The model's previous
    train/eval mode is restored afterwards, so calling this from inside a
    training loop does not leave the model in eval mode.
    """
    users = torch.LongTensor(df_val['User_ID'].values)
    items = torch.LongTensor(df_val['Food_ID'].values)
    ratings = torch.FloatTensor(df_val['Rating'].values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            y_hat = model(users, items)
            mae = F.l1_loss(y_hat, ratings)
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)
    print(mae.item())

In [None]:
def test_loss(model, unsqueeze=False):
    """Print the MAE and then the MSE of ``model`` on the global ``df_test``.

    Args:
        model: module called as ``model(users, items)`` with two LongTensors
            built from the ``User_ID`` and ``Food_ID`` columns.
        unsqueeze: when True, the rating targets get a trailing dimension of
            size 1 (``ratings.unsqueeze(1)``) before the losses are computed.

    The forward pass runs in eval mode and under ``torch.no_grad()`` so no
    autograd graph is built for a pure evaluation. The model's previous
    train/eval mode is restored afterwards, so calling this from inside a
    training loop does not leave the model in eval mode.
    """
    users = torch.LongTensor(df_test['User_ID'].values)
    items = torch.LongTensor(df_test['Food_ID'].values)
    ratings = torch.FloatTensor(df_test['Rating'].values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            y_hat = model(users, items)
            mse = F.mse_loss(y_hat, ratings)
            mae = F.l1_loss(y_hat, ratings)
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)
    print(mae.item())
    print(mse.item())

In [None]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train ``model`` on the global ``df_train`` with full-batch Adam and MSE loss.

    Args:
        model: module called as ``model(users, items)``.
        epochs: number of full-batch optimisation steps.
        lr: Adam learning rate.
        wd: Adam ``weight_decay``.
        unsqueeze: when True, the rating targets get a trailing dimension of
            size 1; the flag is also forwarded to the evaluation helpers.

    After every step the validation and test metrics are printed via
    ``val_loss`` and ``test_loss``, followed by a blank line.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # The training tensors do not change between epochs, so build them once.
    users = torch.LongTensor(df_train['User_ID'].values)
    items = torch.LongTensor(df_train['Food_ID'].values)
    ratings = torch.FloatTensor(df_train['Rating'].values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    for _ in range(epochs):
        # The evaluation helpers call model.eval(); switch back to training
        # mode at the start of every epoch rather than only once up front.
        model.train()
        optimizer.zero_grad()
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        loss.backward()
        optimizer.step()

        # Per-epoch monitoring on the held-out splits.
        val_loss(model, unsqueeze)
        test_loss(model, unsqueeze)
        print()

In [None]:
# Plain MF with 100-dimensional embeddings, trained for 110 full-batch steps.
model = MF(num_users=num_users, num_items=num_items, emb_size=100)
train_epocs(model=model, epochs=110, lr=0.1)

2.700651168823242
2.6969947814941406

0.8571066856384277
0.8540331721305847

2.742307662963867
2.7281346321105957

1.4896456003189087
1.4582854509353638

1.0823324918746948
1.070907711982727

1.7916425466537476
1.7745165824890137

2.0552749633789062
2.032470464706421

1.903079867362976
1.8734198808670044

1.4946757555007935
1.4585639238357544

1.0784873962402344
1.0493178367614746

1.0853033065795898
1.0740673542022705

1.3930280208587646
1.3920294046401978

1.34681236743927
1.3357784748077393

1.0548574924468994
1.0382190942764282

0.9061347246170044
0.8981481194496155

1.0863710641860962
1.079506516456604

1.257764458656311
1.2414113283157349

1.2580162286758423
1.2346066236495972

1.1206719875335693
1.0946820974349976

0.9419946670532227
0.9167416095733643

0.8602457642555237
0.8347537517547607

0.9435064196586609
0.9156597256660461

0.9951289296150208
0.9682302474975586

0.9411377906799316
0.9191439747810364

0.885248601436615
0.8673357367515564

0.9214827418327332
0.90500199794769

In [None]:
# MF with bias terms, 5-dimensional embeddings and a small weight decay.
model = MF_bias(num_users=num_users, num_items=num_items, emb_size=5)
train_epocs(model=model, epochs=100, lr=0.05, wd=1e-5)

4.125987529754639
4.121501922607422

3.986847400665283
3.982311487197876

3.8269107341766357
3.822309732437134

3.6454734802246094
3.64078688621521

3.4421579837799072
3.4373810291290283

3.2214643955230713
3.2171216011047363

2.9966514110565186
2.9939687252044678

2.753924608230591
2.753016710281372

2.492770195007324
2.493701696395874

2.2320728302001953
2.233727216720581

1.9663994312286377
1.968217134475708

1.695819616317749
1.6979426145553589

1.4646292924880981
1.4671330451965332

1.2475605010986328
1.2508138418197632

1.086032509803772
1.0882575511932373

0.9871129393577576
0.9854877591133118

0.9461660385131836
0.9426182508468628

1.0114212036132812
1.0091415643692017

1.0773558616638184
1.0764576196670532

1.1287895441055298
1.129119873046875

1.163756251335144
1.164839267730713

1.1836249828338623
1.1849122047424316

1.1911319494247437
1.1926125288009644

1.1910333633422852
1.1924147605895996

1.1870783567428589
1.1879541873931885

1.179603934288025
1.1801929473876953

1.167