In [1]:
import regex as re
import torch
import torch.nn as nn
import pandas as pd
from datasets import load_dataset
from normalizer import normalize
from transformers.modeling_outputs import TokenClassifierOutput, SequenceClassifierOutput
from transformers import AutoModelForSequenceClassification, AutoModel, AutoConfig

In [2]:
id2label = {
    0 : 'Non-Violence',
    1 : 'Passive Violence',
    2 : 'Direct Violence'
}

label2id = {
    'Non-Violence': 0,
    'Passive Violence': 1,
    'Direct Violence': 2
}

training_data = load_dataset("csv", data_files={'train': ['./data_gen/train_paraphrased.csv'],'validation': ['./data_gen/validation_data.csv']})

Found cached dataset csv (/home/rohan/.cache/huggingface/datasets/csv/default-f391624b81d64ce9/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from sklearn.metrics import accuracy_score, f1_score 

# Helper Functions
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_weighted = f1_score(labels, preds, average="weighted")
    f1_macro = f1_score(labels, preds, average="macro") 
    f1_micro = f1_score(labels, preds, average="micro") 
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1_weighted": f1_weighted, "f1_macro": f1_macro, "f1_micro": f1_micro}


In [4]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch

num_labels = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model_ckpt = "./csebuetnlp/banglabert-sentiment-analysis/checkpoint-4000/"
model_ckpt = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, model_max_length=256) # local_files_only=True

model = (AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels, id2label=id2label, label2id=label2id).to(device)) # local_files_only=True
training_data_encoded = training_data.map(tokenize, batched=True, batch_size=64)

#model = freeze_electra_layers(model, num_layers_to_freeze=4)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading cached processed dataset at /home/rohan/.cache/huggingface/datasets/csv/default-f391624b81d64ce9/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-e7aacc2f3aa59853.arrow
Loading cached processed dataset at /home/rohan/.cache/huggingface/datasets/csv/default-f391624b81d64ce9/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-553d781adccae55a.arrow


In [5]:
from transformers import Trainer, TrainingArguments 
batch_size = 32
logging_steps = len(training_data_encoded["train"]) // batch_size 
model_name = f"{model_ckpt}-sentiment-analysis"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=10, 
                                  #num_training_steps=100, 
                                  learning_rate=2e-5, #2e-4
                                  #warmup_steps=10000,
                                  per_device_train_batch_size=batch_size, 
                                  per_device_eval_batch_size=batch_size, 
                                  eval_steps=100,
                                  logging_steps = 100,
                                  weight_decay=0.01, #1e-3, 1e-5
                                  evaluation_strategy="steps", 
                                  save_strategy="steps",
                                  disable_tqdm=False,
                                  #logging_steps=logging_steps,
                                  save_steps = 100, 
                                  save_total_limit = 2,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1_macro",   #"loss",
                                  greater_is_better=True, #False
                                  optim='adamw_torch',
                                  lr_scheduler_type= "linear", #cosine_with_restarts # cosine
                                  log_level="error")

In [6]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
%%capture
#!sudo apt-get install git-lfs
from transformers import Trainer

import optuna

def objective(trial):
    # Sample hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 2e-5, 2e-4, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    per_device_train_batch_size = trial.suggest_int("per_device_train_batch_size", 32, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)

    # Update training args with sampled hyperparameters
    training_args.learning_rate = learning_rate
    training_args.num_train_epochs = num_train_epochs
    training_args.per_device_train_batch_size = per_device_train_batch_size
    training_args.weight_decay = weight_decay

    trainer = Trainer(model=model, 
                      args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=training_data_encoded["train"], 
                      eval_dataset=training_data_encoded["validation"],
                      data_collator=data_collator,
                      tokenizer=tokenizer,
                      #class_weights=class_weights 
                     )
    
    # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()

    # You seem to want to optimize for "f1_macro". Return this metric
    return eval_results["eval_f1_macro"]

In [9]:
#study = optuna.create_study(direction="maximize")  # Maximize f1_macro
#study.optimize(objective, n_trials=15)  # for example, run for 50 trials

# Print best hyperparameters
#print(f"Best trial:\n  Value: {study.best_value}\n  Params: {study.best_params}")

In [None]:
#print(f"Best trial:\n  Value: {study.best_value}\n  Params: {study.best_params}")

In [None]:
#print(f"Best trial:\n  Value: {study.best_value}\n  Params: {study.best_params}")

In [10]:
trainer.train()

NameError: name 'trainer' is not defined

In [None]:
preds_output = trainer.predict(training_data_encoded["validation"])
preds_output.metrics

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix 
def plot_confusion_matrix(y_preds, y_true, labels):
  cm = confusion_matrix(y_true, y_preds, normalize='pred') 
  fig, ax = plt.subplots(figsize=(6, 6))
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
  disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False) 
  plt.title("Normalized confusion matrix: Validation")
  plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

y_preds = np.argmax(preds_output.predictions, axis=1)
#labels = training_data_encoded["train"].features["label"].names
labels = [0,1,2]
y_valid = training_data_encoded["validation"]["label"]

plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
t = pd.read_csv('./data_gen/backtranslated_train.csv')
t.label.hist()

In [None]:
test = pd.read_csv('./data/test.csv')

In [None]:
test

In [None]:
def predict(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    # Move to GPU if available
    if torch.cuda.is_available():
        for key in inputs:
            inputs[key] = inputs[key].cuda()

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
    return predictions.item()  # Assuming one prediction per call


In [None]:
test['label'] = test['text'].apply(lambda x: predict(x, model, tokenizer))
test

In [None]:
# Extract necessary columns for submission
#submission = test[['id', 'label']]

# Save the dataframe to a CSV file
test.to_csv('./submissions/task.csv', index=False)