### Title: Sentiment Analysis in Nigerian Pidgin English Using DistilBERT
#### Abdulkadir Bala Richard (Student ID: 3747307)
#### Chijioke Onyeka Ahanwa (Student ID: 3741164)
#### David Osawese Okundigie (Student ID: 3754299)

#### Description: This notebook fine-tunes the DistilBERT model for three-class sentiment classification (positive, neutral, negative) of Nigerian Pidgin English tweets.
#### It uses the Transformers, Datasets, Evaluate, and Accelerate libraries for model training and evaluation.

## 1. Importing Packages

In [3]:
# Installing necessary libraries
!pip install datasets evaluate accelerate -U
!pip install transformers[torch]

# Importing essential libraries for NLP tasks
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import accelerate
import evaluate

# Checking the version of the transformers library
!pip show transformers




ModuleNotFoundError: No module named 'transformers'

## 2. Data Loading and Label Conversion

In [None]:
# Importing pandas for data manipulation
import pandas as pd

# Loading training, development, and test datasets from TSV files.
# pd.read_csv needs the raw file contents. github.com "tree"/"blob" URLs serve
# the HTML page that renders the file, so the raw.githubusercontent.com form of
# each URL is used (same repositories, branch, and paths as before).
train_df = pd.read_csv('https://raw.githubusercontent.com/Syphonphilter/afrisent-semeval-2023/main/data/pcm/train.tsv', sep='\t')
dev_df = pd.read_csv('https://raw.githubusercontent.com/afrisenti-semeval/afrisent-semeval-2023/main/data/pcm/dev.tsv', sep='\t')
test_df = pd.read_csv('https://raw.githubusercontent.com/Syphonphilter/afrisent-semeval-2023/main/data/pcm/test.tsv', sep='\t')

# Mapping textual labels to numerical format for consistency
# 'positive': 0, 'neutral': 1, 'negative': 2
label_mapping = {'positive': 0, 'neutral': 1, 'negative': 2}

# Series.map yields NaN for any value that is not a key of label_mapping,
# so an unexpected label string shows up as a missing value in 'label'.
for split_df in (train_df, dev_df, test_df):
    split_df['label'] = split_df['label'].map(label_mapping)

## 3. Dataset Conversion and Tokenization

In [None]:
# Converting pandas dataframes to Hugging Face 'datasets' format
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

# Initializing the tokenizer from the DistilBert model.
# The number of labels is a property of the classification model, not of the
# tokenizer, so no 'num_labels' argument is given here.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenization helper for a batch of examples
def tokenize_function(examples):
    """Tokenize the 'tweet' field of ``examples``.

    Sequences are truncated to at most 256 tokens and padded to that same
    fixed length ('max_length' padding).
    """
    tweets = examples['tweet']
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': 256}
    return tokenizer(tweets, **encoding_options)

# Tokenizing all three splits; batched=True passes batches of rows
# (rather than single rows) to tokenize_function
train_dataset, dev_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

## 4. Metrics Computation

In [None]:
# Importing the evaluate library for metrics
import evaluate

# Loading metrics for evaluation
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

# Defining a function to compute metrics during model evaluation
def compute_metrics(p):
    """Compute micro/macro/weighted F1 and accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the argmax over
            the last axis is taken as the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        dict mapping 'micro_f1', 'macro_f1', 'weighted_f1' and 'accuracy' to
        plain float scores.
    """
    # Take the argmax once instead of once per metric.
    predicted = p.predictions.argmax(-1)
    references = p.label_ids

    def f1_score(average):
        # Metric.compute returns a one-entry dict ({'f1': value}); unwrap it so
        # the caller receives a scalar rather than a nested dict.
        scores = f1_metric.compute(predictions=predicted, references=references, average=average)
        return scores['f1']

    accuracy = accuracy_metric.compute(predictions=predicted, references=references)['accuracy']

    return {
        'micro_f1': f1_score('micro'),
        'macro_f1': f1_score('macro'),
        'weighted_f1': f1_score('weighted'),
        'accuracy': accuracy,
    }


## 5. Model Initialization and Training Configuration

In [None]:
# Importing necessary classes from the transformers library
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Initializing the model for sequence classification (three sentiment classes)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Setting up training arguments
training_args = TrainingArguments(
    # Directory for checkpoints and outputs. It is passed explicitly because
    # transformers releases that accept 'evaluation_strategy' treat output_dir
    # as a required argument and raise a TypeError without it.
    output_dir='./results',
    # Configuring batch sizes for training and evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Other training arguments...
)

# Note: The '...' above indicates where additional training arguments would be specified,
# such as learning rate, number of epochs, etc. These are crucial for controlling the training process.


## 6. Training Configuration Completion and Trainer Initialization

In [None]:
# Full training configuration, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=100,
    push_to_hub=False,
    # Optimisation schedule
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Building the Trainer from the model, the configuration above, the
# tokenized train/dev splits, and the tokenizer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    # Other possible configurations...
)

## 7. Model Training and Evaluation

In [None]:
# Trainer with compute metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Initiating the training process
trainer.train()

# Evaluating the model on the development set.
# Trainer.evaluate returns only a metrics dict, which has no .predictions or
# .label_ids attributes. Trainer.predict returns the predictions, the label
# ids and the metrics together. metric_key_prefix="eval" keeps the "eval_*"
# metric names that are read below.
dev_output = trainer.predict(dev_dataset, metric_key_prefix="eval")
results = dev_output.metrics

# Predicted class = argmax over the per-class scores
predicted_labels = dev_output.predictions.argmax(-1)
true_labels = dev_output.label_ids


# Printing evaluation results (these scores are for the development set)
print("Micro F1 on Development Set:", results["eval_micro_f1"])
print("Macro F1 on Development Set:", results["eval_macro_f1"])
print("Weighted F1 on Development Set:", results["eval_weighted_f1"])

# Note: Uncomment the following line to evaluate on the test dataset after finalizing the model.
# results = trainer.evaluate(test_dataset)


### Plotting Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Requires true_labels and predicted_labels from the evaluation step

# Confusion matrix of the true labels against the predicted labels
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Drawing the matrix as an annotated heatmap on a 10x7 inch figure and
# labelling both axes on the Axes object that heatmap returns
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True')
plt.show()
