In [1]:
import numpy as np
from atel.data import BookCollection
from data_clean import set_seed
from lstm_model import lstm_data, lstm_text
import torch
import pytorch_lightning as pl
from pytorch_lightning import Trainer
import fasttext
from data_clean import *
import warnings
warnings.filterwarnings("ignore", ".*does not have many workers.*")

In [5]:
target_ids, targets, labels = get_labels(book_col, 'Semantisk univers')

In [13]:
targets.shape

(803, 5)

In [17]:
(targets.sum()-targets.sum(axis=0))/targets.sum(axis=0)

array([ 3.14871795,  2.56387665,  2.39915966,  5.63114754, 28.96296296])

In [2]:
SEED = 42
NUM_EPOCHS = 3

In [3]:
## Load the data
book_col = BookCollection(data_file="./data/book_col_271120.pkl")

Loaded from disk: ./data/book_col_271120.pkl


In [4]:
## Load fastText model
# https://fasttext.cc/docs/en/crawl-vectors.html
print('Loading fastText model...')
ft = fasttext.load_model('fasttext_model/cc.da.300.bin')  # Download from fastTexts website
print('Loading complete!')

Loading fastText model...
Loading complete!




In [5]:
settings = {
    'multi_label': True,
    'n_features': 300, 
    "hidden_size": 256*8, 
    "num_layers": 1,
    "num_l1": 256*2,
    "dropout": 0.2, 
    "batch_size": 12,
    "learning_rate" : 1e-5,
    "output_size": 21
}

num_folds = 10
results1 = []
results2 = []
target_col = 'Semantisk univers'

In [6]:
k=0
set_seed(SEED)

model = lstm_text(**settings)
data = lstm_data(
    book_col=book_col, 
    target_col=target_col, 
    ft=ft, 
    batch_size=settings['batch_size'], 
    seq_len=128,
    seed=SEED,
    k=k
)

Seed has been set to 42




Set to multi label classification


In [7]:
data.prepare_data()
data.setup()

In [9]:
book_colsample = next(iter(data.train_dataloader()))

In [10]:
book_ids, texts = clean_book_collection_texts(book_col, lowercase=False)

In [10]:
book_ids, X = get_fasttext_embeddings(book_col, ft, 128)

In [13]:
target_ids, targets, labels = get_labels(book_col, target_col)

mask = torch.isin(torch.from_numpy(target_ids), torch.from_numpy(book_ids))
y = torch.from_numpy(targets[mask]).float()

In [16]:
y.mean(0)

tensor([0.0334, 0.0540, 0.0591, 0.0244, 0.0823, 0.1028, 0.0257, 0.0424, 0.0437,
        0.0476, 0.0913, 0.0450, 0.0733, 0.0090, 0.0578, 0.0244, 0.0103, 0.0296,
        0.0771, 0.0887, 0.0373])

In [11]:
import torch
sig = torch.nn.Sigmoid()

In [18]:
model(sample[0])

tensor([[-2.5022, -2.7648, -2.6698, -2.9924, -2.4533, -2.1383, -3.3950, -2.5631,
         -2.8031, -2.4818, -2.4137, -3.1163, -2.1912, -2.8551, -2.5993, -2.9476,
         -3.0418, -2.7840, -2.8872, -2.3980, -2.6375],
        [-2.5961, -2.8706, -2.6771, -3.0210, -2.4750, -2.0807, -3.3756, -2.6533,
         -2.8025, -2.4397, -2.3477, -3.2168, -2.1297, -2.8758, -2.6370, -3.0564,
         -3.0354, -2.7877, -2.8764, -2.2851, -2.6389],
        [-2.6682, -2.9379, -2.6840, -3.1808, -2.5222, -2.2368, -3.4130, -2.6321,
         -2.9120, -2.5936, -2.5371, -3.2419, -2.1935, -2.8805, -2.6802, -3.0794,
         -3.1508, -2.8439, -2.8957, -2.4152, -2.7835],
        [-2.5565, -2.8343, -2.6201, -3.0496, -2.3280, -2.1146, -3.3595, -2.5983,
         -2.8028, -2.4587, -2.4328, -3.2213, -2.1140, -2.8092, -2.6528, -2.9729,
         -3.0185, -2.7484, -2.8855, -2.2869, -2.7505],
        [-2.5656, -3.0054, -2.7398, -3.0929, -2.5719, -2.2291, -3.3409, -2.6259,
         -2.8108, -2.5301, -2.4169, -3.3062, -2.143

In [19]:
sig(model(sample[0]))

tensor([[0.0697, 0.0496, 0.0648, 0.0402, 0.0721, 0.0951, 0.0293, 0.0643, 0.0546,
         0.0761, 0.0737, 0.0320, 0.1001, 0.0486, 0.0619, 0.0439, 0.0389, 0.0502,
         0.0512, 0.0825, 0.0578],
        [0.0735, 0.0551, 0.0665, 0.0513, 0.0765, 0.1077, 0.0312, 0.0692, 0.0612,
         0.0850, 0.0849, 0.0373, 0.1082, 0.0538, 0.0680, 0.0459, 0.0453, 0.0599,
         0.0555, 0.0930, 0.0658],
        [0.0725, 0.0489, 0.0637, 0.0468, 0.0734, 0.1015, 0.0305, 0.0707, 0.0571,
         0.0753, 0.0815, 0.0404, 0.1047, 0.0519, 0.0634, 0.0465, 0.0409, 0.0575,
         0.0502, 0.0846, 0.0612],
        [0.0728, 0.0573, 0.0709, 0.0504, 0.0808, 0.1067, 0.0349, 0.0758, 0.0622,
         0.0786, 0.0935, 0.0411, 0.1024, 0.0556, 0.0678, 0.0520, 0.0463, 0.0674,
         0.0553, 0.0951, 0.0649],
        [0.0713, 0.0535, 0.0647, 0.0462, 0.0835, 0.1108, 0.0304, 0.0716, 0.0596,
         0.0783, 0.0780, 0.0376, 0.1063, 0.0551, 0.0665, 0.0459, 0.0392, 0.0591,
         0.0527, 0.0857, 0.0696],
        [0.0746, 0.0

In [14]:
sample[1]

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [15]:
sub1 = sample[0][0]

In [16]:
X.shape

torch.Size([778, 128, 300])

In [17]:
logger = pl.loggers.TensorBoardLogger(save_dir='lightning_logs', name=f'{target_col}-cv{k}-max_epoch_{NUM_EPOCHS}')

trainer = Trainer(
    max_epochs = NUM_EPOCHS,
    gpus = 1 if torch.cuda.is_available() else 0,
    log_every_n_steps = 1,
    logger = None
)
trainer.fit(model, data)

val_scores = trainer.validate(model, data)[0]

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | lstm       | LSTM              | 38.5 M
1 | dropout    | Dropout           | 0     
2 | l1         | Linear            | 2.1 M 
3 | out_layer  | Linear            | 10.8 K
4 | loss_func  | BCEWithLogitsLoss | 0     
5 | accuracy   | Accuracy          | 0     
6 | logit_func | Sigmoid           | 0     
-------------------------------------------------
40.6 M    Trainable params
0         Non-trainable params
40.6 M    Total params
162.443   Total estimated model params size (MB)


Epoch 0:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 59/66 [00:29<00:03,  1.97it/s, loss=0.689]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                                                                                                                               | 0/7 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                                                                  | 0/7 [00:00<?, ?it/s][A
Epoch 0:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 60/66 [00:30<00:03,  1.99it/s, loss=0.689][A
Epoch 0:  92%|███████████████████████████████████████████████████████████████████████████████████

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:34<00:00,  1.93it/s, loss=0.205]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  4.27it/s]
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       avg_val_acc          0.01190476305782795
      avg_val_loss          0.1966143101453781
      val_acc_step         0.012820512987673283
      val_loss_step         0.19722263514995575
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────