In [11]:
# This needs to be here because by default Jupyter only adds the pwd to sys.path
import os, sys
if os.path.abspath('..') not in sys.path: sys.path.append(os.path.abspath('..'))

import pandas as pd
import numpy as np

from pysrc.constants import cachepath, datapath, chartpath
import torch
import time
from torch.utils.data import DataLoader

In [2]:
train_dict = {}
test_dict = {}

with open(datapath("train.txt")) as file:
    for line in file:
        (user_id, items) = line.split(maxsplit=1)
        train_dict[user_id] = items

with open(datapath("test.txt")) as file:
    for line in file:
        split = line.split(maxsplit=1)
        if (len(split) > 1):
            (user_id, items) = split
            test_dict[user_id] = items
        else:
            test_dict[split[0]] = ''


In [3]:
train = []
test = []
for key in train_dict:
    train.append([int(n) for n in train_dict[key].replace('\n', '').split(' ')])

for key in test_dict:
    if len(test_dict[key]) == 0:
        test.append([])
    else:
        test.append([int(n) for n in test_dict[key].replace('\n', '').split(' ')])
        

In [4]:
import torch
from torch import nn
from torch.nn import functional as F
from torch import Tensor


def log_likelihood_loss(y, yhat):
    return -torch.mean(torch.sum(torch.log(yhat) * y, axis = 1))


class VanillaVAE(nn.Module):


    def __init__(self,
                 input_dim: int,
                 latent_dim: int,
                 hidden_dims = None,
                 kl_weight = .2
                 ):
        super(VanillaVAE, self).__init__()
        
        self.kl_weight = kl_weight

        self.latent_dim = latent_dim
        
        modules = []
        
        if hidden_dims is None:
            hidden_dims = [512, 128]
            
        self.hidden_dims = hidden_dims
            
        modules.append(nn.Linear(input_dim, hidden_dims[0]))
        modules.append(nn.BatchNorm1d(hidden_dims[0]))
        modules.append(nn.LeakyReLU())
        
        # Build Encoder
        for i in range(len(hidden_dims)-1):
            modules.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            modules.append(nn.BatchNorm1d(hidden_dims[i+1]))
            modules.append(nn.LeakyReLU())

        self.encoder = nn.Sequential(*modules)
        self.fc_mu = nn.Linear(hidden_dims[-1], latent_dim)
        self.fc_var = nn.Linear(hidden_dims[-1], latent_dim)


        # Build Decoder
        modules = []

        self.decoder_input = nn.Linear(latent_dim, hidden_dims[-1])

        hidden_dims.reverse()

        for i in range(len(hidden_dims) - 1):
            modules.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            modules.append(nn.BatchNorm1d(hidden_dims[i+1]))
            modules.append(nn.LeakyReLU())



        self.decoder = nn.Sequential(*modules)

        self.final_layer = nn.Sequential(nn.Linear(hidden_dims[-1], input_dim),
                                         nn.Sigmoid()
                                        )
        

    def encode(self, input: Tensor):
        """
        Encodes the input by passing through the encoder network
        and returns the latent codes.
        :param input: (Tensor) Input tensor to encoder [N x C x H x W]
        :return: (Tensor) List of latent codes
        """
        result = self.encoder(input)
        result = torch.flatten(result, start_dim=1)

        # Split the result into mu and var components
        # of the latent Gaussian distribution
        mu = self.fc_mu(result)
        log_var = self.fc_var(result)

        return [mu, log_var]

    def decode(self, z: Tensor):
        """
        Maps the given latent codes
        onto the image space.
        :param z: (Tensor) [B x D]
        :return: (Tensor) [B x C x H x W]
        """
        result = self.decoder_input(z)
        result = result.view(-1, self.hidden_dims[0])
        result = self.decoder(result)
        result = self.final_layer(result)
        result = F.normalize(result, p=1)
        return result

    def reparameterize(self, mu: Tensor, logvar: Tensor) -> Tensor:
        """
        Reparameterization trick to sample from N(mu, var) from
        N(0,1).
        :param mu: (Tensor) Mean of the latent Gaussian [B x D]
        :param logvar: (Tensor) Standard deviation of the latent Gaussian [B x D]
        :return: (Tensor) [B x D]
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mu

    def forward(self, x: Tensor, **kwargs):
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        return  [self.decode(z), x, mu, log_var]

    def loss_function(self, recons, x, mu, log_var) -> dict:
        """
        Computes the VAE loss function.
        KL(N(\mu, \sigma), N(0, 1)) = \log \frac{1}{\sigma} + \frac{\sigma^2 + \mu^2}{2} - \frac{1}{2}
        :param args:
        :param kwargs:
        :return:
        """
        

        kld_weight = 20
        recons_loss = log_likelihood_loss(x, recons)

        kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1), dim = 0)

        loss = recons_loss + kld_weight * kld_loss
        return {'loss': loss, 'Reconstruction_Loss':recons_loss.detach(), 'KLD':-kld_loss.detach()}

    def sample(self,
               num_samples:int,
               current_device: int, **kwargs) -> Tensor:
        """
        Samples from the latent space and return the corresponding
        image space map.
        :param num_samples: (Int) Number of samples
        :param current_device: (Int) Device to run the model
        :return: (Tensor)
        """
        z = torch.randn(num_samples,
                        self.latent_dim)

        z = z.to(current_device)

        samples = self.decode(z)
        return samples

    def generate(self, x: Tensor, **kwargs) -> Tensor:
        """
        Given an input image x, returns the reconstructed image
        :param x: (Tensor) [B x C x H x W]
        :return: (Tensor) [B x C x H x W]
        """

        return self.forward(x)[0]

In [5]:
item_dim = 91599
lr = 1e-3
model = VanillaVAE(input_dim = item_dim, latent_dim = 64, hidden_dims = [512, 128], kl_weight = 20)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

In [8]:
np.random.seed(0)
test_ids = np.random.choice(np.array(list(train_dict.keys())).astype(int), int(.2*len(train_dict.keys())), replace=False)

In [9]:
def sparsify_uniform(data, p_relative = .1):
    current_sparse = None
    batch_size = 1000
    i = 0
    X = []
    for i in data:
        X.append(torch.zeros(item_dim).bool())
        for item in i:
            if np.random.random() > p_relative:
                X[-1][item] = 1
        X[-1] = X[-1]
    X = torch.stack(X)
    return X
    
def sparsify_items(data, m, sigma):
    return

def sparsify_uers(data, m, sigma):
    return

def list_batch_to_ohe(data):
    return sparsify_uniform(data, 0)

dev_df = pd.Series(train).drop(test_ids)
test_df = pd.Series(test).iloc[test_ids]

train_df = dev_df.iloc[:int(.7*len(dev_df))]
val_df = dev_df.iloc[int(.7*len(dev_df)):]

train_tensor = list_batch_to_ohe(train_df)
val_tensor = list_batch_to_ohe(val_df)
test_tensor = list_batch_to_ohe(test_df)

In [24]:
train_tensor.shape

torch.Size([29480, 91599])

In [12]:
n_epochs = 10
batch_size = 128

train_dataloader = DataLoader(train_tensor, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_tensor, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_tensor, batch_size=batch_size, shuffle=True)

In [147]:
for epoch in range(n_epochs):
    print(f"Start Epoch {epoch}")
    np.random.shuffle(train_df)
    model.train()
    start_time = time.time()
    total_loading_time = 0
    for batch_X in train_dataloader:
        recons, x, mu, log_var = model.forward(batch_X)
        train_loss = model.loss_function(recons, x, mu, log_var)['loss']
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        
    model.eval()
    total_train_loss = 0
    r_train_loss = 0
    kl_train_loss = 0
    train_batches = 0
    for batch_X in train_dataloader:
        recons, x, mu, log_var = model.forward(batch_X)
        loss = model.loss_function(recons, x, mu, log_var)
        total_train_loss += loss['loss']
        print(total_train_loss)
        r_train_loss += loss['Reconstruction_Loss']
        kl_train_loss += loss['KLD']
        train_batches += 1
        
    total_train_loss /= train_batches
    r_train_loss /= train_batches
    kl_train_loss /= train_batches
    
    total_loss = 0
    r_loss = 0
    kl_loss = 0
    batches = 0
    for i in range(int(len(val_df)/batch_size)):
        if i % 10==0:
            print("Val Eval", i)
        batch_val = val_df[i * batch_size: min(len(train_df)-1, (i+1) * batch_size)]
        batch_X = list_batch_to_ohe(batch_val)[0]
        recons, x, mu, log_var = model.forward(batch_X)
        loss = model.loss_function(recons, x, mu, log_var)
        total_loss += loss['loss']
        print(total_loss)
        r_loss += loss['Reconstruction_Loss']
        kl_loss += loss['KLD']
        batches += 1
    
    total_loss /= batches
    r_loss /= batches
    kl_loss /= batches
    
    print(f'Train loss: {total_train_loss}')
    print(f'Train r loss: {r_train_loss}')
    print(f'Train kl loss: {kl_train_loss}')
    
    print(f'Val loss: {total_loss}')
    print(f'Val r loss: {r_loss}')
    print(f'Val kl loss: {kl_loss}')
    

Start Epoch 0
tensor(859.6986, grad_fn=<AddBackward0>)
tensor(416.7498, grad_fn=<AddBackward0>)
tensor(416.3850, grad_fn=<AddBackward0>)
tensor(870.5687, grad_fn=<AddBackward0>)
tensor(1283.0436, grad_fn=<AddBackward0>)
tensor(1696.3805, grad_fn=<AddBackward0>)
Val Eval 0
tensor(774.0750, grad_fn=<AddBackward0>)
Train loss: 565.4601440429688
Train r loss: 556.51904296875
Train kl loss: -0.44705748558044434
Val loss: 774.074951171875
Val r loss: 772.856689453125
Val kl loss: -0.06091318279504776
Start Epoch 1
tensor(675.5333, grad_fn=<AddBackward0>)
tensor(504.4840, grad_fn=<AddBackward0>)
tensor(501.8503, grad_fn=<AddBackward0>)
tensor(684.7197, grad_fn=<AddBackward0>)
tensor(1189.4111, grad_fn=<AddBackward0>)
tensor(1690.3800, grad_fn=<AddBackward0>)
Val Eval 0
tensor(780.1489, grad_fn=<AddBackward0>)
Train loss: 563.4600219726562
Train r loss: 556.0555419921875
Train kl loss: -0.37022408843040466
Val loss: 780.14892578125
Val r loss: 779.2476806640625
Val kl loss: -0.0450621843338012

In [126]:
batch_train = train_df[i * batch_size: min(len(train_df)-1, (i+1) * batch_size)]
batch_X, load_time = list_batch_to_ohe(batch_train)
total_loading_time += load_time
recons, x, mu, log_var = model.forward(batch_X)
train_loss = model.loss_function(recons, x, mu, log_var)['loss']

In [128]:
model.loss_function(recons, x, mu, log_var)

{'loss': tensor(-449.1120, grad_fn=<AddBackward0>),
 'Reconstruction_Loss': tensor(-450.7499),
 'KLD': tensor(-8.1898)}

In [None]:
np.random.shuffle(train)
pd.Series(train)

In [15]:
items = set()
for key in train_dict:
    if int(key) % 1000 == 0:
        print(key)
    
    items = items.union(items, set(train_dict[key].replace('\n', '').split(' ')))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000


In [16]:
len(items)

91599

In [17]:
items

{'43979',
 '39065',
 '33205',
 '89792',
 '30523',
 '70033',
 '88878',
 '41347',
 '68344',
 '86949',
 '44103',
 '2951',
 '85171',
 '60160',
 '11377',
 '72298',
 '20399',
 '5311',
 '58242',
 '54536',
 '57125',
 '87479',
 '87537',
 '12518',
 '2327',
 '7306',
 '78055',
 '72185',
 '6584',
 '40171',
 '50978',
 '42484',
 '72646',
 '82726',
 '12963',
 '90733',
 '73561',
 '66889',
 '84097',
 '73863',
 '59218',
 '90208',
 '79641',
 '10330',
 '73087',
 '25511',
 '54405',
 '79081',
 '37905',
 '41290',
 '56935',
 '75752',
 '15011',
 '21282',
 '84148',
 '65214',
 '7170',
 '20539',
 '38053',
 '90501',
 '5057',
 '78085',
 '75398',
 '81791',
 '33515',
 '2132',
 '17192',
 '58625',
 '62372',
 '91146',
 '62674',
 '10106',
 '89645',
 '13462',
 '26523',
 '44934',
 '18107',
 '45152',
 '34809',
 '66897',
 '82924',
 '5304',
 '37547',
 '50489',
 '54758',
 '84401',
 '83636',
 '50854',
 '26391',
 '9954',
 '2949',
 '78515',
 '4512',
 '3057',
 '13435',
 '50656',
 '80658',
 '1741',
 '51356',
 '37561',
 '25240',
 '52