# Select Model

In [None]:
from __future__ import annotations

class CFG:
    """Notebook-wide switches: which checkpoint to fine-tune and how to quantize it."""

    # Hugging Face hub id of the checkpoint to fine-tune.
    TARGET_MODEL = "microsoft/deberta-v3-large"

    # Master switch read by later cells before building LoRA / bitsandbytes objects.
    quantized = False

    # Precision flags; later cells only look at them when `quantized` is True.
    bit_4 = True
    bit_8 = False

In [None]:
from pathlib import Path

# Roots for reading the training CSV and for writing model artefacts.
# Both currently point at the filesystem root.
INPUT_DIR, OUTPUT_DIR = Path("/"), Path("/")

# Import data

In [None]:
import pandas as pd
# Join with the pathlib `/` operator instead of string concatenation: with
# INPUT_DIR == Path("/") the concatenated form produced "//train.csv".
train = pd.read_csv(INPUT_DIR / "train.csv")

In [None]:
# Sanity-check the raw frame: dimensions, first rows, then the class balance.
print(train.shape)
display(train.head())
print(train["label"].value_counts())

# Preprocessing

In [None]:
# Keep only the target and the raw text; every other column is dropped.
train = train.loc[:, ["label", "text"]]

In [None]:
# Preview the first rows of the frame after the column selection.
train.head()

# Modeling

In [None]:
from sklearn.model_selection import StratifiedKFold

# Assign every row to one of five stratified folds (fixed seed for repeatability).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = train.loc[:, train.columns != "label"]
# A 1-D Series is what `split` expects for the stratification target.
y = train["label"]

# -1 marks "not yet assigned" and keeps the column integer-typed; creating the
# column through partial assignment would fill it with NaN and force floats.
train["fold"] = -1
fold_col = train.columns.get_loc("fold")

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    # `split` yields positional indices, so write with `.iloc`. Label-based
    # `.loc` is only correct while the index happens to be 0..n-1.
    train.iloc[valid_index, fold_col] = i

print(train.groupby("fold")["label"].value_counts())
train.head()

In [None]:
# Fold 0 is held out for validation; the remaining folds form the training split.
holdout_mask = train["fold"].eq(0)
val, train = train[holdout_mask], train[~holdout_mask]

In [None]:
# Keep only the model inputs. The text lives in the "text" column: it is the
# only text column retained during preprocessing and the one the tokenization
# function reads. Selecting "text_parsed" raised a KeyError.
train_df = train[["text", "label"]]
valid_df = val[["text", "label"]]

# Size and class balance of each split.
print(train_df.shape)
print(train_df.label.value_counts())
print(valid_df.shape)
print(valid_df.label.value_counts())

In [None]:
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U sentencepiece 
!pip install torch
!pip install datasets

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch

if CFG.quantized:

    # LoRA adapter for sequence classification, attached to the attention
    # projections named "q_proj" / "v_proj".
    peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        target_modules=[
            "q_proj",
            "v_proj"
        ],
    )

    # Exactly one quantization config is built. 8-bit keeps precedence when both
    # flags are set, matching the previous assignment order.
    if CFG.bit_8:
        # BitsAndBytesConfig has no `bnb_8bit_*` options (quant type, double
        # quantization and compute dtype are 4-bit-only settings), so only the
        # 8-bit switch is passed.
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    elif CFG.bit_4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    else:
        # Fail here instead of with a NameError when the model is loaded.
        raise ValueError("CFG.quantized is True but neither CFG.bit_4 nor CFG.bit_8 is set")

In [None]:
from transformers import AutoTokenizer, LlamaForSequenceClassification, AutoModelForSequenceClassification

# The checkpoint name lives on CFG; a bare `TARGET_MODEL` is undefined.
tokenizer = AutoTokenizer.from_pretrained(CFG.TARGET_MODEL, use_fast=False)

# Only supply a pad token when the tokenizer lacks one, and reuse EOS for it.
# Registering a new "[PAD]" token can grow the vocabulary beyond the model's
# embedding matrix, which is never resized in this notebook.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the classifier backbone with a 5-way head on device 0.
# The checkpoint name lives on CFG; a bare `TARGET_MODEL` is undefined.
if CFG.quantized:
    base_model = LlamaForSequenceClassification.from_pretrained(
        CFG.TARGET_MODEL,
        num_labels=5,
        quantization_config=bnb_config,   #TO QUANTIZE
        device_map={"":0}
    )
else:
    base_model = AutoModelForSequenceClassification.from_pretrained(
        CFG.TARGET_MODEL,
        num_labels=5,
        device_map={"":0}
    )

# Shared post-load configuration for both branches.
# NOTE(review): `pretraining_tp` looks Llama-specific; presumably ignored by
# other architectures — confirm.
base_model.config.pretraining_tp = 1 # 1 is 7b
# Keep the model's padding id in sync with the tokenizer.
base_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Train through the LoRA wrapper on the quantized path, otherwise fine-tune the
# backbone directly.
model = get_peft_model(base_model, peft_config) if CFG.quantized else base_model

In [None]:
# `model` is the `get_peft_model` wrapper only on the quantized path, so the
# trainable-parameter report is limited to that case.
if CFG.quantized:
    model.print_trainable_parameters()

In [None]:
# Row/column counts of the training and validation frames, in that order.
for frame in (train_df, valid_df):
    print(frame.shape)

In [None]:
# Class balance of both splits, printed with a single call.
train_label_counts = train_df["label"].value_counts()
valid_label_counts = valid_df["label"].value_counts()
print(train_label_counts, valid_label_counts)

In [None]:
from datasets import Dataset

# Wrap both pandas splits as `datasets.Dataset` objects.
train_ds, valid_ds = (Dataset.from_pandas(frame) for frame in (train_df, valid_df))

In [None]:
def preprocess_function(examples, max_length=1024):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.
        max_length: Maximum number of tokens kept per example; longer inputs
            are truncated.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.

    No padding is applied here. Batches are padded to their longest member by
    the data collator at training/prediction time; padding inside a batched
    ``map`` would instead store every example at the length of the longest
    text in its mapping batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [None]:
# Tokenize both splits in batches with the shared preprocessing function.
train_tokenized_ds, valid_tokenized_ds = (
    split.map(preprocess_function, batched=True) for split in (train_ds, valid_ds)
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch to its longest sequence using `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    """Score an evaluation pass by RMSE between predicted and true class ids.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        Dict with a single ``"rmse"`` entry.
    """
    scores, labels = eval_pred
    # Collapse each row of scores to the index of its largest entry.
    predicted_classes = np.asarray(scores).argmax(axis=1)
    mse = mean_squared_error(labels, predicted_classes)
    return {"rmse": np.sqrt(mse)}

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

# Shared cadence (in optimizer steps) for evaluation, logging and checkpointing.
steps = 25

training_args = TrainingArguments(
    # Passed as a string, matching how the directory is handed to `save_model`.
    output_dir=str(OUTPUT_DIR),
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    max_grad_norm=0.3,
    optim='paged_adamw_32bit',
    lr_scheduler_type="cosine",
    num_train_epochs=10,
    weight_decay=1e-5,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    save_strategy="steps",
    # Checkpoint at every evaluation. With the default save interval (500 steps)
    # most evaluated states were never saved, so `load_best_model_at_end` and
    # early stopping could not act on the 25-step evaluations.
    save_steps=steps,
    # Bound disk usage now that checkpoints are frequent; the best checkpoint is
    # retained in addition to the most recent ones.
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
    warmup_ratio=0.1,
    eval_steps=steps,
    logging_steps=steps,
    report_to='none'
)

# Stop once the tracked metric fails to improve for two consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_ds,
    eval_dataset=valid_tokenized_ds,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
from shutil import rmtree

# Persist the final model at the top of the output directory.
trainer.save_model(output_dir=str(OUTPUT_DIR))

# Then remove the intermediate "checkpoint-*" directories written during training.
checkpoint_dirs = [
    candidate
    for candidate in Path(training_args.output_dir).glob("checkpoint-*")
    if candidate.is_dir()
]
for checkpoint_dir in checkpoint_dirs:
    rmtree(checkpoint_dir)

In [None]:
# Drop the notebook-level references to the trainer and both model handles so
# they can be reclaimed before the model is loaded again below.
del trainer, model, base_model

In [None]:
# cuda cache clear
# Ask PyTorch's CUDA caching allocator to release memory blocks it no longer uses.
import torch
torch.cuda.empty_cache()

# Test loading model

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch

if CFG.quantized:
    # Rebuild the quantization config used to load the base model. 8-bit keeps
    # precedence when both flags are set, matching the previous assignment order.
    if CFG.bit_8:
        # BitsAndBytesConfig has no `bnb_8bit_*` options (quant type, double
        # quantization and compute dtype are 4-bit-only settings), so only the
        # 8-bit switch is passed.
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    elif CFG.bit_4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    else:
        # Fail here instead of with a NameError when the model is loaded.
        raise ValueError("CFG.quantized is True but neither CFG.bit_4 nor CFG.bit_8 is set")

In [None]:
from transformers import AutoTokenizer, LlamaForSequenceClassification, AutoModelForSequenceClassification

# The checkpoint name lives on CFG; a bare `TARGET_MODEL` is undefined.
tokenizer = AutoTokenizer.from_pretrained(CFG.TARGET_MODEL, use_fast=False)

# Only supply a pad token when the tokenizer lacks one, and reuse EOS for it.
# Registering a new "[PAD]" token can grow the vocabulary beyond the model's
# embedding matrix, which is never resized in this notebook.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
if CFG.quantized:
    # Quantized path: reload the pretrained backbone; the trained adapter is
    # attached to it in a separate step.
    base_model = LlamaForSequenceClassification.from_pretrained(
        CFG.TARGET_MODEL,
        num_labels=5,
        quantization_config=bnb_config,   #TO QUANTIZE
        device_map={"":0}
    )
else:
    # Full fine-tuning path: load the weights that `trainer.save_model` wrote to
    # OUTPUT_DIR. Loading the hub checkpoint again would evaluate the untrained
    # model with a freshly initialised classification head.
    base_model = AutoModelForSequenceClassification.from_pretrained(
        str(OUTPUT_DIR),
        num_labels=5,
        device_map={"":0}
    )

# Shared post-load configuration for both branches.
# NOTE(review): `pretraining_tp` looks Llama-specific; presumably ignored by
# other architectures — confirm.
base_model.config.pretraining_tp = 1 # 1 is 7b
# Keep the model's padding id in sync with the tokenizer.
base_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
if CFG.quantized:
    # Attach the adapter that `trainer.save_model` wrote to OUTPUT_DIR.
    # `get_peft_model` would wrap the base model in a new, untrained adapter.
    model = PeftModel.from_pretrained(base_model, str(OUTPUT_DIR))
else:
    model = base_model

In [None]:
# Prediction-only Trainer: only the model, tokenizer and collator are supplied;
# no training arguments, datasets or metric function are passed.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run the model over the tokenized validation split and keep its `predictions`.
# NOTE(review): presumably unnormalised class scores of shape (n_rows, 5) — confirm.
pred_output = trainer.predict(valid_tokenized_ds)
logits = pred_output.predictions
logits

In [None]:
# Index of the highest-scoring class for every row.
# NOTE(review): despite its name, `probs` holds class indices, not probabilities.
probs = np.argmax(logits, axis=1)

probs

In [None]:
# Validation rows plus a `predictions` column, written out as the submission file.
sub = valid_df.assign(predictions=probs)
sub.to_csv('submission.csv', index=False)
sub.head(5)