In [46]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# files referenced below (under /content/drive/MyDrive/...) are reachable as
# ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
import sys, os

In [48]:
# Directory on the mounted Drive that holds the project's helper modules
# (data_layer, data_loader, bilstm_crf, ...) imported further down.
file_path = '/content/drive/MyDrive/kaggle_lstm/notebook/'

# Make the helper modules importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if file_path not in sys.path:
    sys.path.append(file_path)

# Work from the project directory so relative paths resolve against it.
os.chdir(file_path)

In [49]:
!pip install pytorch-crf



In [50]:
from data_layer import DataParsing
from data_loader import PIIDataset, CustomCollateFn
from labels_to_data_bilstm_crf import convert_to_labels
from misc import build_vocab, f5_score_mapping
from bilstm_crf import BiLSTM_CRF
from early_stop import EarlyStopping
from train_text import train, evaluate, predict



In [51]:


import pandas as pd
import json, pdb

from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader

import torch.optim as optim
from sklearn.metrics import fbeta_score

In [52]:
# Probe for CUDA once and reuse the flag for the device choice.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

print(f"Using device: {device}")

# TODO move to config files,
# Separate according to process

# --- Data loading / splitting ---
train_path = '/content/drive/MyDrive/kaggle_lstm/data/train.json'
test_path = '/content/drive/MyDrive/kaggle_lstm/data/test.json'
# os.path.join avoids the doubled slash an f-string would produce, because
# `file_path` already ends with '/'.
subset_train = os.path.join(file_path, 'train_set.csv')
test_size = 0.2
random_state = 42
batch_size = 16

# Seed PyTorch's RNG with the same value used for the train/validation split so
# that weight initialisation and dropout are repeatable from run to run.
# (Some GPU kernels may still be nondeterministic.)
torch.manual_seed(random_state)

# --- Chunking (passed to DataParsing, CustomCollateFn and convert_to_labels) ---
chunk_size = 400
overlap = 2
pre_processing_chunk_size = 5000

# --- Model (passed to BiLSTM_CRF) ---
embedding_dim = 128
hidden_dim = 256
pad_token_index = 0
num_layers = 1
dropout_rate = 0.02

# --- Training / early stopping ---
num_epochs = 20
learning_rate = 0.003
patience = 5
verbose = True
# NOTE(review): `beta` is not referenced in the visible cells; presumably the
# F-beta weight for scoring -- confirm before relying on it.
beta = 5


Using device: cpu


In [53]:

# f'/content/drive/MyDrive/kaggle_lstm/data/data/train.json'

In [54]:
def load_data(p):
    """Load the training JSON and keep only the documents listed in the subset CSV.

    Args:
        p: Location of the training data, handed straight to ``pd.read_json``.

    Returns:
        A DataFrame containing the rows whose ``document`` value appears in the
        ``doc_id`` column of the CSV at the module-level ``subset_train`` path.
        The index is reset twice, so the frame carries two leading columns:
        ``level_0`` (fresh 0..n-1 position) and ``index`` (row position in the
        unfiltered JSON).
    """
    # Document ids that make up the training subset.
    focus_lst = pd.read_csv(subset_train)['doc_id'].to_list()

    all_documents = pd.read_json(p)
    selected = all_documents.query("document.isin(@focus_lst)")

    # NOTE(review): the double reset is kept as-is because downstream code may
    # rely on the resulting `level_0` / `index` columns -- confirm whether both
    # are really needed.
    return selected.reset_index().reset_index()



In [55]:
def load_test(p):
    """Load the test JSON into a DataFrame.

    Args:
        p: Location of the test data, handed straight to ``pd.read_json``.

    Returns:
        The parsed DataFrame with its index reset, so the original row
        position is available as a leading ``index`` column.
    """
    test_frame = pd.read_json(p)
    return test_frame.reset_index()


# Data preparation, model training, and inference

In [56]:


# Read the raw frames: the training subset and the full test set.
df_train = load_data(train_path)
df_test = load_test(test_path)

dp = DataParsing(
    device=device,
    batch_size=batch_size,
    chunk_size=chunk_size,
    overlap=overlap,
)

# Hold out part of the training documents for validation (seeded split).
x_train, x_val = train_test_split(df_train, test_size=test_size, random_state=random_state)

# Run every split through the parser, in train -> validation -> test order.
# NOTE(review): `fit_transform` is also applied to the validation and test
# splits; if DataParsing learns any state in `fit`, this should probably be a
# plain `transform` -- confirm against data_layer.DataParsing.
x_train, x_val, x_test = [
    dp.fit_transform(split) for split in (x_train, x_val, df_test)
]

# Vocabularies are built from the training split only.
vocab_tokens, labels_tokens, pos_tokens = [
    build_vocab(column.to_list())
    for column in (x_train.tokens, x_train.labels, x_train.pos)
]

# Sizes used to dimension the model.
vocab_size = len(list(vocab_tokens.keys()))
pos_vocab_size = len(list(pos_tokens.keys()))
num_tags = len(list(labels_tokens.keys()))


Function 'fit_transform' took: 1.568
Function 'fit_transform' took: 0.419
Function 'fit_transform' took: 0.034


In [57]:


# Build the tagger from the sizes derived from the training vocabularies and
# place it on the selected device.
model = BiLSTM_CRF(
    vocab_size=vocab_size,
    pos_vocab_size=pos_vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_tags=num_tags,
    num_layers=num_layers,
    pad_token_index=pad_token_index,
    dropout_rate=dropout_rate,
)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# NOTE(review): not read anywhere in the visible cells; kept in case other code
# relies on it.
best_val_loss = float("inf")

# Token and POS lookup tables shared by every dataset and by the collate function.
feature_maps = {"word_to_idx": vocab_tokens, "pos_to_idx": pos_tokens}

x_train_t = PIIDataset(x_train, **feature_maps, label_to_idx=labels_tokens)
x_val_t = PIIDataset(x_val, **feature_maps, label_to_idx=labels_tokens)

# todo robin test
# The test dataset is built without a label mapping.
x_test_t = PIIDataset(x_test, **feature_maps)

collate_fn = CustomCollateFn(chunk_size=chunk_size, **feature_maps, label_to_idx=labels_tokens)

# All three loaders share the same batching options and keep the dataset order.
loader_options = {"batch_size": batch_size, "collate_fn": collate_fn, "shuffle": False}
train_loader = DataLoader(x_train_t, **loader_options)
valid_loader = DataLoader(x_val_t, **loader_options)

predict_loader = DataLoader(x_test_t, **loader_options)

early_stopping = EarlyStopping(patience=patience, verbose=verbose)

# Train for at most `num_epochs` epochs, evaluating on the validation loader
# after each one.
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, device)
    val_loss = evaluate(model, valid_loader, device)
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss:.4f}")

    # Hand the validation loss to the early-stopping helper; it decides whether
    # training should go on by setting its `early_stop` flag.
    early_stopping(val_loss, model)
    if not early_stopping.early_stop:
        continue

    print("Early stopping")
    break

# Predictions
# load model crude implementations


Epoch 1, Train Loss: 423.1606
Validation loss decreased (68.354185 --> 68.354185). Saving model ...
Epoch 2, Train Loss: 43.9337
Validation loss decreased (68.354185 --> 41.930894). Saving model ...
Epoch 3, Train Loss: 25.0797
Validation loss decreased (41.930894 --> 34.511314). Saving model ...
Epoch 4, Train Loss: 16.7847
Validation loss decreased (34.511314 --> 29.759842). Saving model ...
Epoch 5, Train Loss: 10.9581
EarlyStopping counter: 1 out of 5
Epoch 6, Train Loss: 7.1637
Validation loss decreased (29.759842 --> 29.523580). Saving model ...
Epoch 7, Train Loss: 5.0295
EarlyStopping counter: 1 out of 5
Epoch 8, Train Loss: 3.6188
EarlyStopping counter: 2 out of 5
Epoch 9, Train Loss: 2.6575
EarlyStopping counter: 3 out of 5
Epoch 10, Train Loss: 1.9925
EarlyStopping counter: 4 out of 5
Epoch 11, Train Loss: 1.6757
EarlyStopping counter: 5 out of 5
Early stopping


In [58]:

# Restore the saved checkpoint (presumably the one written by EarlyStopping
# during training -- confirm in early_stop.py).
# `map_location=device` remaps the stored tensors onto the current device, so a
# checkpoint saved on a CUDA runtime also loads on a CPU-only runtime instead
# of raising while deserialising.
checkpoint = torch.load(f"{file_path}/checkpoint.pt", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)

# Inference mode: eval() switches off training-only behaviour such as dropout,
# and no_grad() skips gradient tracking.
model.eval()
predictions = []
with torch.no_grad():
    for tokens, pos_tags in predict_loader:
        tokens, pos_tags = tokens.to(device), pos_tags.to(device)
        # One entry per batch, in loader order.
        predictions.append(model(tokens, pos_tags))

# Map the batched model outputs back onto the rows of the parsed test frame.
raw_predictions = convert_to_labels(x_test, predictions, labels_tokens, pre_processing_chunk_size)

# to remove


Function 'convert_to_labels' took: 0.02


In [59]:
# Keep only the rows carrying a non-"O" label; the `words` column is not part
# of the submission.
pii_rows = raw_predictions.drop(columns='words').query('label != "O"')

# Renumber the surviving rows from 0 and expose that counter as `row_id`;
# `tokens` is renamed to `token`.
submission = (
    pii_rows
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'row_id', 'tokens': 'token'})
)

submission.to_csv(f'{file_path}/submission.csv', index=False)


