# 0. Imports and setup

In [None]:
# Git branch of the project repo that gets checked out after cloning.
BRANCH = "new_classifier_heads"

In [None]:
import os
import subprocess

from kaggle_secrets import UserSecretsClient
import wandb

# Credentials come from Kaggle's secret store so nothing sensitive is
# hard-coded in the notebook.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("WANDB_KEY")
wandb.login(key=wandb_api)

# NOTE(review): despite its name, GITHUB_HOST is used as the owner segment of
# the GitHub URL (github.com/<GITHUB_HOST>/nlp_project_2023).
GITHUB_HOST = user_secrets.get_secret("GITHUB_HOST")
CLONE_URL = f"https://github.com/{GITHUB_HOST}/nlp_project_2023"
REPO_DIR = "nlp_project_2023"

# Clone only when the checkout is missing so the cell can be re-run safely,
# and raise immediately (check=True) if git fails instead of letting a later
# `cd` / import cell break with a less obvious error.
if not os.path.isdir(REPO_DIR):
    subprocess.run(["git", "clone", CLONE_URL, REPO_DIR], check=True)

In [None]:
cd nlp_project_2023/

In [None]:
!git checkout $BRANCH

In [None]:
!pip install -r requirements.txt

In [None]:
cd ..

In [None]:
import os
import sys

# Make the repo's `src` modules (data, train, models) importable.
# The path is made absolute so imports keep working if the working directory
# changes later, and it is appended only once so re-running this cell does not
# pile up duplicate sys.path entries.
SRC_DIR = os.path.abspath("nlp_project_2023/src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [None]:
import os
import random
from dataclasses import asdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import transformers

In [None]:
# Hardware: DataLoader worker count and the compute device (GPU when present).
num_workers = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reproducibility: one seed shared by Python's `random`, NumPy and PyTorch.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Separate, explicitly seeded generator that is handed to the DataLoaders.
g = torch.Generator().manual_seed(SEED)

def seed_dataloader_worker(worker_id):
    """Seed NumPy and `random` from the current PyTorch seed.

    Intended as a DataLoader `worker_init_fn`. `worker_id` is accepted because
    the DataLoader passes it, but it is not used: the seed is taken from
    `torch.initial_seed()` and reduced modulo 2**32 so that it fits the range
    `np.random.seed` accepts.
    """
    seed = torch.initial_seed() % (1 << 32)
    random.seed(seed)
    np.random.seed(seed)

In [None]:
# Export TOKENIZERS_PARALLELISM=false for this process.
# NOTE(review): presumably to keep the tokenizers library from using its own
# parallelism alongside the DataLoader worker processes - confirm.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# 1. Prepare data

In [None]:
from data import ParaphraseDataset, PawsParaphraseDataset

In [None]:
# Assemble the PAWS splits from the Kaggle input directory.
ds = PawsParaphraseDataset("/kaggle/input/the-paws-dataset-for-paraphrase-identification")

# Training data: the labeled "final" and "swap" CSVs.
for train_csv in ("labeled_final_train.csv", "labeled_swap_train.csv"):
    ds.add_train_set(train_csv)

# Validation and test come from the labeled "final" CSVs only.
# The unlabeled_final_train.csv / unlabeled_final_validation.csv files are
# currently not added.
ds.add_val_set("labeled_final_validation.csv")
ds.add_test_set("labeled_final_test.csv")

# NOTE(review): presumably this materialises ds.train_df / ds.val_df /
# ds.test_df, which the following cells read - confirm in data.py.
ds.compile_dataset()

In [None]:
# Shapes of the train / validation / test frames, in that order.
tuple(frame.shape for frame in (ds.train_df, ds.val_df, ds.test_df))

In [None]:
from data import build_tokenizer, PairedSentenceDataset


# Sanity checks for PairedSentenceDataset on the training frame.
tokenizer = build_tokenizer("microsoft/deberta-v3-large")

# Third constructor argument 128: the checks below expect it to be the last
# dimension of `input_ids` (presumably the max sequence length - confirm).
dataset = PairedSentenceDataset(ds.train_df, tokenizer, 128)

assert dataset[0]["labels"].shape == (1, )
assert dataset[0]["input_ids"].shape == (1, 128)
# Only checks that the first item is truthy (e.g. a non-empty mapping).
assert dataset[0]

# Same check with a much shorter length of 20.
dataset = PairedSentenceDataset(ds.train_df, tokenizer, 20)

assert dataset[0]["input_ids"].shape == (1, 20)

# 2. Experiment

In [None]:
from train import TrainConfig, Trainer, DummyLogger, WandbLogger


# Settings for this experiment; the loaders, model and trainer below read them.
config = TrainConfig(
    model="microsoft/deberta-v3-large",
    checkpoints_folder="./init_exp",
    batch_size=8,
    epochs=3,
    max_length=100,
    lr=6e-6,
    # Stored as a plain string ("cuda" or "cpu") rather than a torch.device.
    device=str(device),
    others={},
)

In [None]:
tokenizer = build_tokenizer(config.model)


def _make_loader(frame, shuffle):
    """Return a DataLoader over `frame` wrapped in a PairedSentenceDataset.

    Batch size and max length come from `config`; every loader uses the same
    worker count, worker seeding function and seeded generator `g`.
    """
    return torch.utils.data.DataLoader(
        PairedSentenceDataset(frame, tokenizer, config.max_length),
        batch_size=config.batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        worker_init_fn=seed_dataloader_worker,
        generator=g,
    )


# Only the training split is shuffled.
train_loader = _make_loader(ds.train_df, shuffle=True)
val_loader = _make_loader(ds.val_df, shuffle=False)
test_loader = _make_loader(ds.test_df, shuffle=False)

In [None]:
from models import DebertaV2WithCustomClassifier, MeanMaxPooler

# Pretrained DeBERTa encoder named in the config.
backbone = transformers.DebertaV2Model.from_pretrained(config.model)

# Custom head on top of the encoder.
# NOTE(review): 1024 presumably matches the encoder's hidden size and the list
# looks like the classifier layer widths ending in 2 classes - confirm against
# MeanMaxPooler in models.py.
pooler = MeanMaxPooler(1024, 256, [1024 + 2 * 256, 300, 60, 2])

model = DebertaV2WithCustomClassifier(backbone, pooler)
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

In [None]:
# The logger receives the config as a plain dict (dataclass -> dict).
run_config = asdict(config)
logger = WandbLogger(project="nlp_project_2023", experiment_config=run_config)

trainer = Trainer(model, optimizer, logger)

# The test loader is passed along with the train / validation loaders.
trainer.train(train_loader, val_loader, config, test_loader)

# Close the logger once training has returned.
logger.finish()