# Fine-tuning Pipeline (first version)

We want to test how the model performs. At this point in time the amount of memory needed is too large, so we have not been able to run it.

### Importing & Packages

In [3]:
# Imports
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from evaluate import load

### Part 1: Load & Preprocess Dataset

In [None]:
# Read the labelled statements from the CSV export.
df = pd.read_csv("PrimaryVPHostileLabels.csv")

# Remove unlabelled rows (missing hostility_label). The explicit copy makes the
# result an independent frame, so the column assignment below does not write
# into a slice of the original (which triggers SettingWithCopyWarning).
df = df.dropna(subset=['hostility_label']).copy()

# Cast the remaining labels to integer class ids.
df['hostility_label'] = df['hostility_label'].astype(int)


In [16]:
## Convert the DataFrame into a Hugging Face `datasets.Dataset`
## and rename the label column to "labels"

def load_and_prepare_data(df, hostile_label_column_name = 'hostility_label'):
    """Build a `Dataset` holding the text column and a `labels` column.

    Args:
        df: pandas DataFrame containing a 'statement' column and the label
            column named by `hostile_label_column_name`.
        hostile_label_column_name: Name of the label column in `df`. It is
            renamed to "labels" in the returned dataset.

    Returns:
        A `Dataset` with exactly two columns: 'statement' and 'labels'.

    Raises:
        KeyError: If 'statement' or the label column is missing from `df`.
    """
    selected = df[['statement', hostile_label_column_name]]
    # preserve_index=False keeps the DataFrame index (non-contiguous after
    # filtering rows) from being added as an extra `__index_level_0__` column.
    dataset = Dataset.from_pandas(selected, preserve_index=False)
    # Rename using the parameter rather than a hard-coded column name, and skip
    # the rename when the column is already called "labels".
    if hostile_label_column_name != "labels":
        dataset = dataset.rename_column(hostile_label_column_name, "labels")
    return dataset

### Part 2: Tokenization

In [17]:
def tokenize_data(dataset, tokenizer, max_length=512, padding="max_length"):
    """Tokenize the 'statement' column of `dataset` in batches.

    Args:
        dataset: Object exposing `map(fn, batched=True)` whose batches contain
            a "statement" entry (e.g. a `datasets.Dataset`).
        tokenizer: Callable tokenizer, invoked as
            `tokenizer(texts, padding=..., truncation=True, max_length=...)`.
        max_length: Maximum sequence length passed to the tokenizer. Defaults
            to 512, the value that was previously hard-coded.
        padding: Padding strategy passed to the tokenizer. With the default
            "max_length", every example is padded to `max_length` tokens, so a
            smaller `max_length` is the simplest way to shrink the inputs if
            memory is a problem.

    Returns:
        Whatever `dataset.map` returns for the tokenization function.
    """
    def tokenize_function(examples):
        return tokenizer(
            examples["statement"],
            padding=padding,
            truncation=True,
            max_length=max_length,
        )
    return dataset.map(tokenize_function, batched=True)

### Part 3: Splitting Training/Validation

In [18]:
def split_dataset(dataset, test_size=0.2, seed=42):
    """Partition `dataset` into a training part and a held-out part.

    Args:
        dataset: Object exposing `train_test_split(test_size=..., seed=...)`.
        test_size: Forwarded to `train_test_split` as `test_size`.
        seed: Forwarded to `train_test_split` as `seed`.

    Returns:
        A `(train, test)` tuple taken from the "train" and "test" entries of
        the split result.
    """
    parts = dataset.train_test_split(test_size=test_size, seed=seed)
    train_part = parts["train"]
    held_out_part = parts["test"]
    return train_part, held_out_part



### Part 4: Load Model & Tokenizer



In [19]:
def load_model_and_tokenizer(model_checkpoint="GroNLP/hateBERT", num_labels=2):
    """Load the tokenizer and a sequence-classification model for a checkpoint.

    Args:
        model_checkpoint: Checkpoint identifier passed to both
            `AutoTokenizer.from_pretrained` and
            `AutoModelForSequenceClassification.from_pretrained`.
        num_labels: Number of output classes for the classification head.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        A `(model, tokenizer)` tuple. Note the order: model first.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )
    return model, tokenizer

### Part 5: Metric Setup

In [20]:
def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation prediction pair.

    Args:
        eval_pred: A pair that unpacks into `(logits, labels)`. The predicted
            class is the argmax of `logits` over the last axis.

    Returns:
        The result of the "accuracy" metric's `compute` call for the predicted
        classes against `labels`.
    """
    # The notebook imports `load` from `evaluate`; `load_metric` was never
    # imported and raised NameError as soon as evaluation ran.
    metric = load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


### Training Setup

In [24]:
def setup_training_args(output_dir="./output/hatebert_finetuned"):
    """Build the `TrainingArguments` used for fine-tuning.

    Args:
        output_dir: Directory passed to `TrainingArguments` as `output_dir`.

    Returns:
        A `TrainingArguments` instance configured with per-epoch saving and
        evaluation, and with the best model (by "eval_loss") reloaded at the
        end of training.
    """
    return TrainingArguments(
        # Pass the parameter itself; the string literal "output_dir" was used
        # before, which silently ignored the caller's argument.
        output_dir=output_dir,
        save_strategy="epoch",
        eval_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=4,
        weight_decay=0.01,
        logging_dir="./output/logs",
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        save_total_limit=2,
        report_to="none",
        disable_tqdm=False # progress bar
    )

### Full Pipeline

In [25]:
def run_pipeline(df):
    """Run the fine-tuning pipeline end to end on a labelled DataFrame.

    Steps, in order: load the model and tokenizer, convert `df` to a dataset,
    tokenize it, split it into training and validation parts, build the
    training arguments, then train with a `Trainer`.

    Args:
        df: DataFrame handed to `load_and_prepare_data`.

    Returns:
        The `(model, tokenizer)` pair after `trainer.train()` has run.
    """
    model, tokenizer = load_model_and_tokenizer()

    # DataFrame -> dataset -> tokenized dataset -> train/validation parts.
    encoded = tokenize_data(load_and_prepare_data(df), tokenizer)
    train_split, val_split = split_dataset(encoded)

    trainer = Trainer(
        model=model,
        args=setup_training_args(),
        train_dataset=train_split,
        eval_dataset=val_split,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    return model, tokenizer

In [26]:
# Run the full fine-tuning pipeline on the labelled DataFrame and keep the
# resulting model and tokenizer.
model, tokenizer = run_pipeline(df)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 5865/5865 [00:00<00:00, 8386.03 examples/s]


KeyboardInterrupt: 