# 0. Imports and predefines

In [None]:
!pip install -r ../requirements.txt

In [None]:
import os
import sys
import random
import warnings
from pathlib import Path
from dataclasses import dataclass, asdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from tqdm.notebook import tqdm

import torch

import transformers

sys.path.append("../src")
from train import TrainConfig, Trainer, DummyLogger, WandbLogger
from data import PairedSentenceDataset, build_tokenizer
from models import DebertaV2WithCustomClassifier, PerceptronPoooler

%load_ext autoreload
%autoreload 2

In [None]:
# Hardware settings shared by the cells below.

# Passed as `num_workers` to the DataLoaders built later in this notebook.
num_workers = 2

# The run is pinned to the CPU; to use a GPU when one is available, swap this
# assignment for the commented-out alternative on the next line.
device = torch.device("cpu")
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device  # bare expression so the notebook echoes the selected device

In [None]:
# Reproducibility: seed every random number generator used by the notebook.

SEED = 42

# Global generators of the standard library, NumPy and PyTorch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Dedicated generator, seeded with the same value, that is handed to the
# DataLoaders through their `generator` argument.
g = torch.Generator().manual_seed(SEED)

def seed_dataloader_worker(worker_id):
    """Seed NumPy's and Python's global RNGs from the current torch seed.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32 and is applied
    first to ``np.random`` and then to ``random``.

    Args:
        worker_id: Not used by the body; accepted because this function is
            passed as ``worker_init_fn`` to the DataLoaders in this notebook.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(derived_seed)

In [None]:
# Log in to your W&B account
# NOTE(review): the training cell currently builds a DummyLogger and keeps the
# WandbLogger line commented out, so this login is presumably only needed
# when the W&B logger is enabled - confirm before requiring credentials.
import wandb
wandb.login()

# 1. Prepare data

In [None]:
def read_tsv(path):
    """Load a tab-separated file into a DataFrame.

    The first column of the file is used as the index of the result.

    Args:
        path: Location of the file; forwarded unchanged to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path, sep="\t", index_col=0)
    return frame


# Train / dev / test splits stored under ../data/final.
final_folder = Path("../data/final")
final_train, final_val, final_test = (
    read_tsv(final_folder / file_name)
    for file_name in ("train.tsv", "dev.tsv", "test.tsv")
)

# Training split stored under ../data/swap.
swap_train = read_tsv("../data/swap/train.tsv")

# Train / dev splits stored under ../data/unlabeled/final.
unlabeled_train, unlabeled_val = (
    read_tsv(tsv_path)
    for tsv_path in (
        "../data/unlabeled/final/train.tsv",
        "../data/unlabeled/final/dev.tsv",
    )
)

In [None]:
# Print a summary of every loaded frame, each preceded by a "+" ruler.
# `DataFrame.info()` writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each summary.

for df in [final_train, final_val, final_test, swap_train, unlabeled_train, unlabeled_val]:
    print("+" * 30)

    df.info()

In [None]:
# Assemble the frames used for training, validation and testing.
# NOTE(review): pd.concat keeps each input frame's own index labels, so
# train_df may contain repeated index values - confirm that downstream code
# addresses rows by position rather than by label.

train_df = pd.concat([final_train, swap_train])
val_df = pd.concat([final_val])
test_df = pd.concat([final_test])

# Report the shape of each resulting split.
for split_label, split_frame in (("Train:", train_df), ("Val:", val_df), ("Test:", test_df)):
    print(split_label, split_frame.shape)

In [None]:
# Preview the first five rows of the merged training frame.
train_df.head(5)

In [None]:
# Sanity checks for PairedSentenceDataset.
# The tokenizer built here is for "microsoft/deberta-v3-large"; the experiment
# section rebuilds `tokenizer` from `config.model`.
tokenizer = build_tokenizer("microsoft/deberta-v3-large")

# With 128 as the third argument, the first item must expose `labels` of
# shape (1,) and `input_ids` of shape (1, 128).
dataset = PairedSentenceDataset(train_df, tokenizer, 128)

assert dataset[0]["labels"].shape == (1, )
assert dataset[0]["input_ids"].shape == (1, 128)
# NOTE(review): this only checks that the item is truthy; presumably a leftover
# or an unfinished assertion - confirm what was meant to be verified.
assert dataset[0]

# With 20 as the third argument, `input_ids` must have shape (1, 20).
dataset = PairedSentenceDataset(train_df, tokenizer, 20)

assert dataset[0]["input_ids"].shape == (1, 20)

# 2. Experiment

In [None]:
# Settings for this experiment; later cells read `config.model`,
# `config.max_length`, `config.batch_size` and `config.lr`, and the whole
# object is passed to `trainer.train`.
config = TrainConfig(
    model="microsoft/deberta-v3-small",
    # NOTE(review): absolute path inside one user's home directory - the
    # notebook will presumably not run unchanged on another machine; consider
    # a path relative to the project, like the ../data paths used above.
    checkpoints_folder="/home/smt/Documents/10_term/nlp/nlp_project_2023/experiments/test_exp/",
    batch_size=2,
    epochs=3,
    max_length=32,
    lr=6e-6,
    device=str(device)  # the device is handed over as its string form
)

In [None]:
# Tokenizer matching the model selected in the experiment config.
tokenizer = build_tokenizer(config.model)


def _make_loader(frame, shuffle):
    """Wrap ``frame`` in a PairedSentenceDataset and return a DataLoader for it.

    Batch size and maximum length come from ``config``; the worker count,
    worker seeding function and generator are the notebook-level ones.

    Args:
        frame: DataFrame handed to ``PairedSentenceDataset``.
        shuffle: Forwarded to the DataLoader's ``shuffle`` argument.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        PairedSentenceDataset(frame, tokenizer, config.max_length),
        batch_size=config.batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        worker_init_fn=seed_dataloader_worker,
        generator=g,
    )


# Only the training loader shuffles its samples.
train_loader = _make_loader(train_df, shuffle=True)
val_loader = _make_loader(val_df, shuffle=False)
test_loader = _make_loader(test_df, shuffle=False)

In [None]:


# Build the classifier: a pretrained DeBERTa-v2 encoder wrapped together with
# the project's PerceptronPoooler head, moved to the selected device.
backbone = transformers.DebertaV2Model.from_pretrained(config.model)

model = DebertaV2WithCustomClassifier(
    backbone,
    # The first size (presumably the head's input width) is read from the
    # encoder's config instead of being hard-coded to 768, so that checkpoints
    # with a different hidden size work without editing this cell.
    PerceptronPoooler([backbone.config.hidden_size, 2]),
).to(device)

# model = transformers.DebertaV2ForSequenceClassification.from_pretrained(config.model)

model

In [None]:
# Choose the experiment logger: DummyLogger by default; uncomment the
# WandbLogger line (and drop the DummyLogger one) to log to W&B instead.
# logger = WandbLogger(project="nlp_project_2023", experiment_config=config)
logger = DummyLogger()

# Adam over all model parameters with the learning rate from the config.
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

trainer = Trainer(model, optimizer, logger)

# Always close the logger, even when training raises or is interrupted, so a
# failed run does not leave the logging session open.
try:
    trainer.train(train_loader, val_loader, config)
finally:
    logger.finish()

In [None]:
# Run the trainer's inference routine over the validation loader; as the last
# expression of the cell, its return value is displayed by the notebook.
trainer.make_inference(val_loader)