<span style="font-family: 'Arial'; font-size: 120%;">This notebook is part of the Kaggle learning competition on Natural Language Processing with Disaster Tweets. The aim of the notebook was to practice fine-tuning and implementing various methods to improve a single metric (the F1 score in this case). The model achieved a decent score of 0.84063.</span>

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import Dataset,DatasetDict
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import TrainingArguments,Trainer
!pip install transformers datasets evaluate ray[tune] wandb
import os
import evaluate
import wandb



<span style="font-family: 'Arial'; font-size: 110%;">Setting three environment variables for Weights and Biases logging and monitoring.</span> 

In [None]:

# Set the three Weights & Biases environment variables in a single update.
os.environ.update({
    "WANDB_PROJECT": "kagglecomplargexxl",
    "WANDB_LOG_MODEL": "true",
    "WANDB_WATCH": "false",
})


In [None]:
# Location of the training CSV. This is an absolute path, so it has to be
# adjusted when the notebook runs in a different environment.
path = "/notebooks/train.csv"



In [None]:
# Read the training data from `path` into a DataFrame and display it.
df = pd.read_csv(path)
df

<span style="font-family: 'Arial'; font-size: 110%;">The code attempts to clean the data in various ways, including converting text to lowercase and filling in missing values with the mode. However, after testing, I decided to use only the text column as the model input.</span>

In [None]:
# Count the missing values in each column before any cleaning.
df.isna().sum()


In [None]:
# Lowercase every string cell and leave non-string values (numbers, NaN) as they are.
# The mapping is done column by column with `Series.map` because
# `DataFrame.applymap` is deprecated in newer pandas releases; `isinstance` is
# the idiomatic type check and also accepts `str` subclasses.
df = df.apply(lambda col: col.map(lambda v: v.lower() if isinstance(v, str) else v))
df




In [None]:
# `df.mode()` lists the most frequent value(s) of each column; its first row
# gives one fill value per column, which is used below to fill missing values.
modes = df.mode().iloc[0]
modes

In [None]:
# Replace the missing values of every column with that column's mode.
df = df.fillna(modes)


In [None]:
# Re-count missing values per column after the fill above.
df.isna().sum()


In [None]:
# Cast the target column to float.
# NOTE(review): presumably required because the fold models are created with
# num_labels=1 (a single-output head) — confirm.
df['target'] = df['target'].astype(float)





In [None]:
# Copy the tweet text into an 'input' column; `tok_func` reads this column.
df['input'] = df.text

In [None]:
# Convert the cleaned DataFrame into a Hugging Face `Dataset`.
ds = Dataset.from_pandas(df)


<span style="font-family: 'Arial'; font-size: 110%;">Selecting the pre-trained DeBERTa-v3-large checkpoint from the Hugging Face library and creating a tokenizer for it using the AutoTokenizer class.</span>

In [None]:
# Checkpoint name shared by the tokenizer and the cross-validation models.
model_nm = 'microsoft/deberta-v3-large'
# Tokenizer matching the checkpoint above.
tokz = AutoTokenizer.from_pretrained(model_nm)


<span style="font-family: 'Arial'; font-size: 110%;">The code tokenizes the input data using the DeBERTa-v3-large tokenizer and pads/truncates the sequences to a maximum length of 512 tokens. The target column is also renamed to 'labels' for compatibility with the model.</span>

In [None]:
max_length = 512


def tok_func(x):
    """Tokenize the 'input' field of `x` with the notebook's tokenizer.

    Every sequence is padded to, and truncated at, `max_length` tokens.
    """
    return tokz(
        x['input'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )



In [None]:
# Apply `tok_func` to the dataset in batches.
tok_ds = ds.map(tok_func, batched=True)


In [None]:
# Rename 'target' to 'labels'. Re-running this cell on the already renamed
# dataset will fail because 'target' no longer exists.
tok_ds = tok_ds.rename_columns({'target':'labels'})


<span style="font-family: 'Arial'; font-size: 110%;">The compute_metrics function calculates the F1 score for the model's predictions. The commented line (predictions = np.argmax(predictions, axis=-1)) is not currently being used, but was  necessary in certain situations when using the ray[tune] library.</span>

In [None]:
from sklearn.metrics import f1_score
# NOTE: the assignment below shadows the sklearn import above; from here on
# `f1_score` is the `evaluate` metric object, not the sklearn function.
f1_score = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: object exposing `predictions` (raw model outputs) and
            `label_ids` (reference labels).

    Returns:
        The result of the F1 metric's `compute` call.

    The F1 metric compares class ids, so the raw outputs are converted first:
    outputs with more than one value per example are reduced with argmax, and
    single-value outputs are thresholded at 0.5 — the same cut-off the notebook
    applies to the test predictions. Labels are flattened and cast to int
    because they are stored as floats.
    """
    predictions = np.asarray(eval_pred.predictions)
    labels = np.asarray(eval_pred.label_ids).reshape(-1).astype(int)
    if predictions.ndim > 1 and predictions.shape[-1] > 1:
        # One score per class: pick the highest-scoring class.
        class_ids = predictions.argmax(axis=-1)
    else:
        # One continuous score per example: binarise it.
        class_ids = (predictions.reshape(-1) >= 0.5).astype(int)
    return f1_score.compute(predictions=class_ids, references=labels)


<span style="font-family: 'Arial'; font-size: 110%;">Function that initializes the DeBERTa-v3-large model for sequence classification.</span>



In [None]:
def model_init():
    """Create a fresh sequence-classification model from the shared checkpoint.

    Passed to `Trainer(model_init=...)` so that a new model can be built
    whenever the trainer needs one. The checkpoint name comes from `model_nm`
    instead of a duplicated string literal, so the model always matches the
    tokenizer created from the same name.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_nm, return_dict=True)



In [None]:
bs = 16  # per-device batch size, used for both training and evaluation
epochs = 2  # number of training epochs for each fold
lr = 1.56207e-05  # learning rate; NOTE(review): presumably a value found by the hyperparameter search — confirm

<span style="font-family: 'Arial'; font-size: 110%;">The code sets up a hyperparameter search using Ray[tune] to maximize the F1 score of the model. The search runs 8 trials. </span>

In [None]:
# Hold out part of the tokenized data for evaluation during the search.
# `dds` was referenced below without ever being defined, which raised a
# NameError on a fresh kernel; a fixed seed keeps the split reproducible.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)

training_args = TrainingArguments(
    "test", evaluation_strategy="steps", eval_steps=500, disable_tqdm=True, report_to="wandb")
trainer = Trainer(
    args=training_args,
    tokenizer=tokz,
    train_dataset=dds['train'], 
    eval_dataset=dds['test'],
    model_init=model_init,
    compute_metrics=compute_metrics,
)

# Keep the search result instead of discarding it, so the winning
# hyperparameters can be inspected after the run.
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    backend="ray", 
    n_trials=8 # number of trials
)
best_run

<span style="font-family: 'Arial'; font-size: 110%;">Using KFold cross-validation to train and evaluate the model on 10 different splits of the dataset. For each split, a new model is created, trained and saved.</span>

In [None]:
# Training configuration shared by every cross-validation fold.
args = TrainingArguments(
    'outputs',
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    learning_rate=lr,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    evaluation_strategy="epoch",
    report_to='none',
)

In [None]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)

# Validation metrics of every fold, in fold order. Without them the folds
# cannot be compared once the loop has finished.
fold_metrics = []

for fold, (train_index, val_index) in enumerate(kf.split(tok_ds)):
    print(f"Fold {fold}")
    train_dataset = tok_ds.select(train_index)
    val_dataset = tok_ds.select(val_index)

    # Every fold starts again from the pre-trained checkpoint with a
    # single-output head.
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
    trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=val_dataset,
                  tokenizer=tokz, compute_metrics = compute_metrics)

    trainer.train()

    # Record this fold's metrics on its held-out split.
    fold_metrics.append(trainer.evaluate())

    trainer.save_model(f"model_{fold}")

fold_metrics


<span style="font-family: 'Arial'; font-size: 110%;">The remaining part of the notebook tokenizes the test set, makes predictions with the model saved from the last cross-validation fold (model_9), and saves the predictions to a submission file.</span>

In [None]:
# Path of the test CSV, relative to the working directory.
testd = "test.csv"

In [None]:
# Read the test set through `testd` (which was defined but never used) and
# lowercase the text: the training text was lowercased before tokenization,
# so the test text has to receive the same preprocessing.
eval_df = pd.read_csv(testd)
eval_df['input'] = eval_df.text.str.lower()
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)


In [None]:
import torch

# Load the weights saved for fold 9 into the trainer's current model.
# `map_location="cpu"` lets the checkpoint be read on a machine without the
# device it was saved from; `load_state_dict` then copies the values into the
# model's existing parameters.
fold_state = torch.load("/notebooks/model_9/pytorch_model.bin", map_location="cpu")
trainer.model.load_state_dict(fold_state)

# Raw model outputs for the test set, as a float array.
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

In [None]:
# Limit the raw outputs to the [0, 1] interval.
preds = np.clip(preds, 0, 1)

In [None]:
# Display the clipped predictions.
preds

In [None]:
# Binarise the predictions: 1 where the value is at least 0.5, otherwise 0.
preds = (preds >= 0.5).astype(int)


In [None]:
# Display the binarised predictions.
preds

In [None]:
# Collapse the predictions into a one-dimensional array for the submission table.
preds = preds.flatten()

In [None]:
import datasets

# Pair every test id with its predicted target and write the pairs to CSV.
submission_columns = {'id': eval_ds['id'], 'target': preds}
submission = datasets.Dataset.from_dict(submission_columns)
submission.to_csv('submission.csv', index=False)