In [16]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from sklearn import metrics
from tqdm.notebook import tqdm

import mlflow

import csv
import json
import functools
import gzip
import numpy as np
import pandas as pd
import torch
import tempfile

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, balanced_accuracy_score

from gensim.models import KeyedVectors
from gensim.parsing import preprocessing
from torch.utils.data import Dataset, DataLoader, IterableDataset
from torchmetrics import AveragePrecision

from tqdm.notebook import tqdm, trange
import tempfile

In [17]:
# Display the installed PyTorch version so the environment is recorded with the notebook.
torch.__version__

'1.10.1+cu111'

In [18]:
# Report whether PyTorch can see a CUDA device in this environment.
print(f"¿CUDA disponible? {torch.cuda.is_available()}")

¿CUDA disponible? True


## Exploración de datos
Cargamos los datos de entrenamiento para ver qué forma tienen.

In [19]:
# train_dataset = pd.read_json('./data/meli-challenge-2019/spanish.train.jsonl.gz',lines=True)
# train_dataset.head(20)

In [20]:
# train_dataset.language.value_counts()

In [21]:
# train_dataset.label_quality.value_counts()

## Dataset

Tomado de https://github.com/DiploDatos/AprendizajeProfundo/blob/master/3_datasets.ipynb


In [22]:
class MeLiChallengeDataset(IterableDataset):
    """
    Streams records from a gzip-compressed JSON-lines file.

    Every non-blank line is parsed as a JSON object and exposed as
    ``{"data": record[key], "target": record["target"]}``. When a
    ``transform`` is supplied, its return value is yielded instead.
    """

    def __init__(self, path, transform=None, key = 'title'):
        """
        path: Location of the data file (gzip-compressed, one JSON object per line)
        transform: Optional callable applied to each item before it is yielded
        key: Field of each record that is exposed under "data"
        """
        self.dataset_path = path
        self.transform = transform
        self.key = key

    def __iter__(self):
        """
        Yield one item per record in the file.

        The file is reopened on every call, so the dataset can be iterated
        more than once.
        """
        # Decode explicitly as UTF-8: without it gzip's text mode falls back to
        # the platform default encoding, which is not UTF-8 everywhere.
        with gzip.open(self.dataset_path, "rt", encoding="utf-8") as fh:
            for line in fh:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                # instead of letting json.loads fail on them.
                if not line.strip():
                    continue

                record = json.loads(line)
                item = {
                    "data": record[self.key],
                    "target": record['target']
                }

                yield self.transform(item) if self.transform else item


In [23]:
class PadSequences:
    """
    Collate function that pads or truncates the "data" list of every item in a
    batch to a common length and stacks the batch into float tensors.
    """

    def __init__(self, pad_value=0, max_length=None, min_length=1):
        """
        pad_value: Value appended to sequences shorter than the batch width
        max_length: Fixed batch width; longer sequences are cut to it. When
            falsy, the width is the longest sequence in the batch.
        min_length: Lower bound for the width, used only when max_length is falsy
        """
        assert max_length is None or min_length <= max_length
        self.pad_value = pad_value
        self.max_length = max_length
        self.min_length = min_length

    def __call__(self, items):
        """
        items: Batch of dicts, each holding a list under "data" and a value
            under "target"

        Returns a dict with "data" and "target" as float tensors.
        """
        sequences, labels = zip(*((item["data"], item["target"]) for item in items))
        lengths = [len(seq) for seq in sequences]

        if not self.max_length:
            # No fixed width: grow to the longest sequence, but never below min_length.
            width = max(self.min_length, max(lengths))
        else:
            width = self.max_length
            lengths = [min(self.max_length, n) for n in lengths]

        padded = []
        for seq, keep in zip(sequences, lengths):
            filler = [self.pad_value] * (width - keep)
            padded.append(seq[:keep] + filler)

        # "data" is deliberately cast to float (not long).
        return {
            "data": torch.FloatTensor(padded),
            "target": torch.FloatTensor(labels),
        }

In [24]:
# Build a collate_fn that returns every sequence in a batch with the same length.
# (Kept for reference: deriving the length from the data instead of fixing it.)
# data_len = train_dataset.data.apply(lambda v: len(v))
# max_len = data_len.max()

pad_to_len = PadSequences(max_length=20)

### Juntando todo
Tenemos
* Una clase con la responsabilidad de entregar datos, potencialmente preprocesándolos si hace falta
* Una función que transforma los datos leídos para que todos los valores tengan la misma longitud

A partir de esto, creamos dos instancias del `DataLoader`: una para cargar los datos de entrenamiento y otra para cargar los datos de _test_

In [25]:
# Streaming dataset over the training split; each item exposes the record's "data" field.
train_dataset = MeLiChallengeDataset(
    './data/meli-challenge-2019/spanish.train.jsonl.gz',
    key='data',
)
# Small fixed-size batches, padded to a common length by pad_to_len.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=0,
    collate_fn=pad_to_len,
)

In [26]:
# Streaming dataset over the validation file, which this notebook uses as its test split.
test_dataset = MeLiChallengeDataset(
    './data/meli-challenge-2019/spanish.validation.jsonl.gz',
    key='data',
)
# Same batching and padding configuration as the training loader.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=0,
    collate_fn=pad_to_len,
)

## MLP

Tomamos el MLP de las clases y agregamos los parámetros que nos interesan

* Capa de entrada: Tantas neuronas como tokens
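* Capa de salida: Tantas neuronas como categorías (632). **Nota:** la clase `MLP` definida abajo no usa `output_size` al construir la red; termina siempre en una única neurona con sigmoide.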

In [27]:
import torch.nn.functional as F

class MLP(nn.Module):
    """
    Fully connected network: Linear+ReLU for every hidden layer, followed by a
    single-unit Linear layer and a Sigmoid.
    """

    def __init__(self, input_size, output_size, hidden = None):
        """
        input_size: Number of input neurons
        output_size: Nominal number of output neurons. NOTE(review): this value
            only appears in the model name; the network itself always ends in a
            single sigmoid unit. Confirm whether a multi-class head is intended
            before relying on it.
        hidden: Sequence with the size of each hidden layer (at least one)
        """
        super().__init__()

        # Copy into a fresh list: avoids a shared mutable default argument and
        # lets callers pass any sequence (tuple, range, ...), not only a list.
        hidden = list(hidden) if hidden is not None else []

        assert len(hidden) > 0, "hidden must contain at least one layer size"

        self._name = "_".join(str(size) for size in [input_size, *hidden, output_size])

        neurons = [input_size] + hidden
        parts = []
        # One Linear+ReLU pair per consecutive pair of layer sizes.
        for fan_in, fan_out in zip(neurons[:-1], neurons[1:]):
            parts.append(nn.Linear(fan_in, fan_out))
            parts.append(nn.ReLU())

        # Output head: one unit squashed to (0, 1).
        parts += [nn.Linear(neurons[-1], 1), nn.Sigmoid()]

        self.model = nn.Sequential(*parts)

    def forward(self, x: torch.Tensor):
        """Apply the stacked layers to ``x`` and return the result."""
        return self.model(x)

    def name(self):
        """Return "MLP_<input>_<hidden sizes...>_<output>", used to label experiments."""
        return "MLP_" + self._name
    

In [28]:
# Sanity check of the name format: input size, hidden sizes and output size joined by underscores.
MLP(10, 2 , hidden = [16]).name()

'MLP_10_16_2'

In [29]:
# Prefer the first CUDA device when available, otherwise fall back to the CPU.
# NOTE(review): the `.to(device)` calls in the training code below are commented out,
# so this value does not appear to be used — confirm before assuming GPU training.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [30]:
# mlflow.set_experiment("MLP Basico")

# with mlflow.start_run():
#     mlflow.log_param("model_name", "mlp") # TODO Log parameters
#     mlflow.log_param("epochs", "1")
#     mlflow.log_param("hidden_layer_1_neurons", "2")
    
#     model = MLP(20, 632, hidden= [2]) # 20 -> 2 -> 632
#     loss = nn.BCELoss()
#     optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    
# #     model.to(device)
#     for epoch in trange(1): # TODO Pruebas con varias epochs
#         model.train()
#         running_loss = []
#         for idx, batch in enumerate(tqdm(train_dataloader)):
#             optimizer.zero_grad()
            
# #             input_data = batch["data"].to(device)
#             input_data = batch["data"]
# #             output = model(batch["data"])
#             output = model(input_data)
    
# #             target_data = batch["target"].view(-1,1).to(device)
#             target_data = batch["target"].view(-1,1)
#             loss_value = loss(output, target_data)
#             loss_value.backward()
#             optimizer.step()
#             running_loss.append(loss_value.item())        
#         mlflow.log_metric("train_loss", sum(running_loss) / len(running_loss), epoch)
        
#         model.eval()
#         running_loss = []
#         targets = []
#         predictions = []
#         for batch in tqdm(test_dataloader):
#             output = model(batch["data"])
#             running_loss.append(
#                 loss(output, batch["target"].view(-1, 1)).item()
#             )
#             targets.extend(batch["target"].numpy())
#             predictions.extend(output.squeeze().detach().numpy())
#         mlflow.log_metric("test_loss", sum(running_loss) / len(running_loss), epoch)
#         # Reemplazamos la siguiente metrica por la de pythorch (La de scikit learn no acepta problemas multiclase)

#         mlflow.log_metric("test_avp", balanced_accuracy_score(targets, predictions), epoch)        

#     with tempfile.TemporaryDirectory() as tmpdirname:
#             targets = []
#             predictions = []
#             for batch in tqdm(test_dataloader):
#                 output = model(batch["data"])
#                 targets.extend(batch["target"].numpy())
#                 predictions.extend(output.squeeze().detach().numpy())
#             pd.DataFrame({"prediction": predictions, "target": targets}).to_csv(
#                 f"{tmpdirname}/predictions.csv.gz", index=False
#             )
#             mlflow.log_artifact(f"{tmpdirname}/predictions.csv.gz")
        
        


In [31]:
def get_train_dataloader(bs):
    """Build a DataLoader over ``train_dataset`` that yields padded batches of size ``bs``."""
    return DataLoader(
        train_dataset,
        batch_size=bs,
        shuffle=False,
        num_workers=0,
        collate_fn=pad_to_len,
    )


def get_test_dataloader(bs):
    """Build a DataLoader over ``test_dataset`` that yields padded batches of size ``bs``."""
    return DataLoader(
        test_dataset,
        batch_size=bs,
        shuffle=False,
        num_workers=0,
        collate_fn=pad_to_len,
    )

In [35]:
def _evaluate(model, dataloader, loss=None):
    """Run ``model`` over ``dataloader`` in eval mode without gradients.

    Returns ``(targets, predictions, losses)``; ``losses`` holds one value per
    batch and stays empty when ``loss`` is None.
    """
    model.eval()
    targets, predictions, losses = [], [], []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            output = model(batch["data"])
            if loss is not None:
                losses.append(loss(output, batch["target"].view(-1, 1)).item())
            targets.extend(batch["target"].numpy())
            # reshape(-1) instead of squeeze(): squeeze() collapses a batch of
            # one into a 0-d array, which list.extend cannot iterate.
            predictions.extend(output.reshape(-1).numpy())
    return targets, predictions, losses


def run_experiment(params, **kwargs):
    """
    Run a grid of MLflow-tracked training experiments.

    params: Iterable of (build_model, build_loss, build_optimizer, nepochs,
        batch_sizes) tuples, where
        - build_model() returns a fresh model exposing ``name()``
        - build_loss() returns the loss callable
        - build_optimizer(model) returns an optimizer for that model
        - nepochs is an iterable of epoch counts
        - batch_sizes is an iterable of batch sizes
        One MLflow run is executed per (tuple, epochs, batch size) combination.
    kwargs: Accepted for backward compatibility; currently unused.

    Each run logs the parameters, the per-epoch "train_loss", "test_loss" and
    "test_avp" metrics, and a gzip CSV with the final test predictions.
    """
    for build_model, build_loss, build_optimizer, nepochs, batch_sizes in params:

        # Several epoch counts / batch sizes may be given: one run for each pair.
        for epochs in nepochs:
            for bs in batch_sizes:

                print(f"Experiment epochs:{epochs}, batch-size:{bs}")
                train_dataloader = get_train_dataloader(bs)
                test_dataloader = get_test_dataloader(bs)

                # Fresh model, optimizer and loss for every run.
                model = build_model()
                optimizer = build_optimizer(model)
                loss = build_loss()

                exp_name = model.name() + "_e" + str(epochs) + "_b" + str(bs)
                # The experiment must be selected BEFORE the run starts; a run
                # that is already active stays in the previously active experiment.
                mlflow.set_experiment(exp_name)

                with mlflow.start_run():
                    mlflow.log_param("model_name", model.name())
                    mlflow.log_param("epochs", str(epochs))
                    mlflow.log_param("batch_size", str(bs))

                    # No device transfer is performed here: model and batches
                    # are used exactly as they were built.
                    for epoch in trange(epochs):

                        model.train()
                        running_loss = []
                        for batch in tqdm(train_dataloader):
                            optimizer.zero_grad()

                            output = model(batch["data"])
                            loss_value = loss(output, batch["target"].view(-1, 1))
                            loss_value.backward()

                            optimizer.step()
                            running_loss.append(loss_value.item())
                        mlflow.log_metric("train_loss", sum(running_loss) / len(running_loss), epoch)

                        targets, predictions, running_loss = _evaluate(model, test_dataloader, loss)
                        mlflow.log_metric("test_loss", sum(running_loss) / len(running_loss), epoch)

                        # balanced_accuracy_score needs discrete labels; feeding it
                        # the raw sigmoid scores raises ValueError. The scores are
                        # thresholded at 0.5 for the metric only.
                        # NOTE(review): the metric key is kept as "test_avp" although
                        # the value is a balanced accuracy — confirm the intended metric.
                        predicted_labels = [int(score >= 0.5) for score in predictions]
                        mlflow.log_metric("test_avp", balanced_accuracy_score(targets, predicted_labels), epoch)

                    # Store the final (raw) test predictions as a run artifact.
                    with tempfile.TemporaryDirectory() as tmpdirname:
                        targets, predictions, _ = _evaluate(model, test_dataloader)

                        filename = "{}/{}_predictions.csv.gz".format(tmpdirname, exp_name)
                        pd.DataFrame({"prediction": predictions, "target": targets}).to_csv(
                            filename, index=False
                        )
                        mlflow.log_artifact(filename)

In [None]:
def loss():
    """Factory returning a new binary cross-entropy loss module on each call."""
    return nn.BCELoss()

def getOptimizer(model, lr = 1e-3, wd = 1e-5):
    """
    Build an Adam optimizer over all parameters of ``model``.

    model: Module whose ``parameters()`` are optimized
    lr: Learning rate
    wd: Weight decay
    """
    # Forward the arguments: the values used to be hard-coded here, so callers'
    # lr / wd were silently ignored.
    return optim.Adam(model.parameters(), lr=lr, weight_decay=wd)


                   
# Hyper-parameter grid: every epoch count is combined with every batch size.
epochs = [20, 40, 60]
batch_sizes = [24, 48, 64]

# One-hidden-layer configuration.
mlp_single_layer = [
    (
        lambda: MLP(20, 632, hidden=[256]),
        loss,
        getOptimizer,
        epochs,
        batch_sizes,
    )
]

# Four-hidden-layer configuration.
mlp_multiple_layers = [
    (
        lambda: MLP(20, 632, hidden=[40, 64, 256, 512]),
        loss,
        getOptimizer,
        epochs,
        batch_sizes,
    )
]

# Run the full grid for both configurations, single-layer first.
run_experiment(mlp_single_layer + mlp_multiple_layers)


Experiment epochs:20, batch-size:24


  0%|          | 0/20 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]