In [1]:
import numpy as np
from atel.data import BookCollection
from data_clean import set_seed
from lstm_model import lstm_data, lstm_text
import torch
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torchmetrics.functional import accuracy
from torchmetrics.functional.classification import multilabel_accuracy
import fasttext
from data_clean import *
import warnings
warnings.filterwarnings("ignore", ".*does not have many workers.*")

In [2]:
import yaml
from yaml import CLoader
# Parse the target -> problem-type mapping used to decide between multilabel and multiclass heads.
# NOTE(review): CLoader is PyYAML's full (non-safe) loader and can construct arbitrary Python
# objects. The file parses to a plain str -> str mapping, so CSafeLoader would be sufficient;
# keep CLoader only if this file is always trusted.
with open('target_problem_type.yaml', mode='r', encoding='utf-8') as yaml_file:
    test = yaml.load(yaml_file, Loader=CLoader)

In [3]:
# Display the parsed target -> problem-type mapping loaded from target_problem_type.yaml.
test

{'Genre': 'multilabel',
 'Tekstbånd': 'multilabel',
 'Fremstillingsform': 'multilabel',
 'Semantisk univers': 'multilabel',
 'Stemmer': 'multilabel',
 'Perspektiv': 'multiclass',
 'Holistisk vurdering': 'multiclass'}

In [4]:
SEED = 42  # passed to set_seed() and to lstm_data(seed=...)
NUM_EPOCHS = 3  # used as Trainer(max_epochs=...) and embedded in the logger run name

In [5]:
## Load the data
# BookCollection is given the path of a previously saved collection (the cell output reports
# "Loaded from disk").
# NOTE(review): the .pkl extension suggests the file is unpickled — only load it from a trusted
# source; confirm against BookCollection's implementation.
book_col = BookCollection(data_file="./data/book_col_271120.pkl")

Loaded from disk: ./data/book_col_271120.pkl


In [6]:
## Load fastText model
# Pre-trained vectors: https://fasttext.cc/docs/en/crawl-vectors.html
# NOTE(review): cc.da.300.bin is presumably the 300-dimensional Danish model, which would match
# settings['n_features'] = 300 — TODO confirm.
print('Loading fastText model...')
ft = fasttext.load_model('fasttext_model/cc.da.300.bin')  # Must be downloaded from the fastText website first
print('Loading complete!')

Loading fastText model...
Loading complete!




In [22]:
# Hyper-parameters; the whole mapping is unpacked into lstm_text(**settings), and
# settings['batch_size'] is also handed to lstm_data.
settings = dict(
    multi_label=True,
    n_features=300,
    hidden_size=128,
    num_layers=4,
    l1_size=512,
    l2_size=256,
    dropout=0.2,
    batch_size=64,
    learning_rate=1e-5,
    output_size=5,
)

# Cross-validation bookkeeping and the target column to model.
num_folds = 10
results1, results2 = [], []
target_col = 'Semantisk univers'

In [23]:
k=0  # forwarded to lstm_data(k=...); presumably the cross-validation fold index — TODO confirm
set_seed(SEED)  # called before the model is constructed; prints "Seed has been set to 42"

# The model takes every entry of `settings` as a keyword argument.
model = lstm_text(**settings)
# Data module built from the book collection, the chosen target column and the fastText model.
data = lstm_data(
    book_col=book_col, 
    target_col=target_col, 
    ft=ft, 
    batch_size=settings['batch_size'], 
    seq_len=128,  # NOTE(review): the same 128 is hard-coded again in the get_fasttext_embeddings(...) cell
    seed=SEED,
    k=k
)

Seed has been set to 42
Set to multi label classification


In [24]:
# Run the data module's preparation and setup steps by hand so that
# data.train_dataloader() can be called directly in the next cell.
data.prepare_data()
data.setup()

In [25]:
# Take the first batch from the training dataloader; later cells use element [0] as the
# model input and element [1] as the targets.
book_colsample = next(iter(data.train_dataloader()))

In [26]:
# Clean the collection's texts without lower-casing them (lowercase=False).
# NOTE(review): `texts` is not used in the visible cells, and `book_ids` is reassigned in the next cell.
book_ids, texts = clean_book_collection_texts(book_col, lowercase=False)

In [27]:
# Embed the collection with the fastText model.
# NOTE(review): the third argument (128) is presumably the sequence length and duplicates
# seq_len=128 passed to lstm_data — TODO confirm and share one constant.
# NOTE(review): this overwrites `book_ids` from the previous cell; `X` is not used in the visible cells.
book_ids, X = get_fasttext_embeddings(book_col, ft, 128)

In [28]:
# Fetch ids, target values and label names for the selected target column.
target_ids, targets, labels = get_labels(book_col, target_col)

# Boolean mask over target_ids marking the ids that also occur in book_ids; the matching
# rows of `targets` are converted to a float tensor.
# target_ids and book_ids must be NumPy arrays here, since they go through torch.from_numpy.
# NOTE(review): `targets` (NumPy) is indexed with a torch boolean tensor, and `y` is
# overwritten by a later cell (y = book_colsample[1].int()).
mask = torch.isin(torch.from_numpy(target_ids), torch.from_numpy(book_ids))
y = torch.from_numpy(targets[mask]).float()

In [29]:
# Sigmoid module handed to compute_metrics as `logit_func` for the multi-label check below.
# (torch is already imported in the notebook's first cell, so no re-import is needed here.)
sig = torch.nn.Sigmoid()

In [43]:
def compute_metrics(preds, targets, logit_func, multi_label, current='train'):
    """Build the dictionary of accuracy metrics to log for one step.

    Args:
        preds: Raw model outputs; they are passed through ``logit_func`` before scoring.
        targets: Ground-truth labels. In the multi-label case ``targets.shape[1]``
            is used as the number of labels.
        logit_func: Callable applied to ``preds`` first (e.g. a sigmoid module).
        multi_label: Truthy selects the two multi-label metrics; otherwise a single
            accuracy value is computed.
        current: Prefix for the metric keys (defaults to ``'train'``).

    Returns:
        dict: ``{'<current>_step_acc_micro', '<current>_step_acc_macro'}`` when
        ``multi_label`` is truthy, else ``{'<current>_acc_step'}``.

    Note:
        The ``*_acc_micro`` value is computed with ``subset_accuracy=True``, i.e. it is
        the subset-accuracy variant rather than a micro average; the ``*_acc_macro``
        value uses ``multilabel_accuracy`` with its default averaging.
    """
    activated = logit_func(preds)

    # Single-label case: one plain accuracy value.
    if not multi_label:
        return {f'{current}_acc_step': accuracy(activated, targets)}

    # Multi-label case: subset accuracy plus the per-label-count accuracy.
    subset_acc = accuracy(activated, targets, subset_accuracy=True)
    label_acc = multilabel_accuracy(activated, targets, num_labels=targets.shape[1])
    return {
        f'{current}_step_acc_micro': subset_acc,
        f'{current}_step_acc_macro': label_acc,
    }

In [44]:
# Forward the sample batch's inputs through the model and cast its targets to int
# for the metric functions.
# NOTE(review): the model is called as-is (no eval()/no_grad() in this cell), so dropout
# may be active here — confirm this is intended for a metrics sanity check.
preds = model(book_colsample[0])
y = book_colsample[1].int()

In [45]:
# Check the output shape: the cell output shows (64, 5), matching batch_size=64 and output_size=5 in settings.
preds.shape

torch.Size([64, 5])

In [46]:
# Sanity-check compute_metrics on the single sample batch: multi-label branch, sigmoid as
# logit_func, keys prefixed with 'val'.
compute_metrics(preds, y, sig, True, 'val')

{'val_step_acc_micro': tensor(0.), 'val_step_acc_macro': tensor(0.2813)}

In [17]:
# TensorBoard logger; the run name encodes the target column, the fold index and the epoch budget.
logger = pl.loggers.TensorBoardLogger(save_dir='lightning_logs', name=f'{target_col}-cv{k}-max_epoch_{NUM_EPOCHS}')

trainer = Trainer(
    max_epochs=NUM_EPOCHS,
    # `gpus=` is deprecated in PyTorch Lightning (see the deprecation warning in this cell's
    # output); select the accelerator explicitly and use a single device either way.
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    log_every_n_steps=1,
    # Pass the logger built above. Previously `logger=None` was passed, which left the
    # TensorBoardLogger unused and disabled logging despite log_every_n_steps=1.
    logger=logger,
)
trainer.fit(model, data)

# validate() returns one metrics dict per validation dataloader; keep the first (only) one.
val_scores = trainer.validate(model, data)[0]

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | lstm       | LSTM              | 38.5 M
1 | dropout    | Dropout           | 0     
2 | l1         | Linear            | 2.1 M 
3 | out_layer  | Linear            | 10.8 K
4 | loss_func  | BCEWithLogitsLoss | 0     
5 | accuracy   | Accuracy          | 0     
6 | logit_func | Sigmoid           | 0     
-------------------------------------------------
40.6 M    Trainable params
0         Non-trainable params
40.6 M    Total params
162.443   Total estimated model params size (MB)


Epoch 0:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 59/66 [00:29<00:03,  1.97it/s, loss=0.689]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                                                                                                                               | 0/7 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                                                                  | 0/7 [00:00<?, ?it/s][A
Epoch 0:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 60/66 [00:30<00:03,  1.99it/s, loss=0.689][A
Epoch 0:  92%|███████████████████████████████████████████████████████████████████████████████████

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:34<00:00,  1.93it/s, loss=0.205]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  4.27it/s]
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       avg_val_acc          0.01190476305782795
      avg_val_loss          0.1966143101453781
      val_acc_step         0.012820512987673283
      val_loss_step         0.19722263514995575
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────