In [None]:
import transformers
from transformers import AutoModel, BertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import label_binarize
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the labelled CSV into pandas for a quick look at the first rows.
# `data` and `data1` are two names bound to the same DataFrame object.
data = data1 = pd.read_csv('final_dataset.csv')
data.head()
#data['cleanText'].value_counts()

In [None]:
# Load the same CSV through the `datasets` library; the split used below
# is raw_dataset['train'].
raw_dataset = load_dataset(
    'csv',
    data_files='final_dataset.csv',
)

In [None]:
# Hold out 30% of the rows as the 'test' split; the fixed seed keeps the
# partition identical between runs.
full_split = raw_dataset['train']
dataset = full_split.train_test_split(seed=42, test_size=0.3)
dataset


In [None]:
# Checkpoint identifier shared by the tokenizer and the classification model.
BERT_MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)


def tokenize_function(examples):
    """Tokenize the "sentence" column of a batch.

    Every sequence is padded to the tokenizer's maximum length and longer
    ones are truncated, so all encoded rows have the same length.
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, padding="max_length", truncation=True)


# batched=True hands the function many rows at once instead of one by one.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Pretrained encoder with a sequence-classification head sized for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME,
    num_labels=2,
)

In [None]:
# Disabled experiment: everything below is a single triple-quoted string
# literal, so none of it executes (the cell merely evaluates to that string).
# If it were run as code it would replace `model.classifier` with a small MLP
# head: Linear(768, 24) -> ReLU -> Dropout(0.5) -> Linear(24, 2).
'''import torch
import torch.nn as nn
from transformers import BertModel

model.classifier = nn.Sequential(
            nn.Linear(768,24),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(24, 2))'''

In [None]:
# Function to compute and return a metric report
def compute_metrics(pred, num_classes=2):
    """Build an evaluation report for the binary classifier.

    ``pred.predictions`` is treated as raw two-column class scores (logits).
    They are converted to probabilities with a softmax before the 0.5 cut-off
    and the ROC curve are applied, mirroring what the inference loop in this
    notebook does. Thresholding the raw class-1 score at 0.5 would depend on
    the scale of the logits and ignore the class-0 score entirely.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true labels, 0 or 1)
            and ``predictions`` (array indexed as ``[sample, class]`` with
            two class columns).
        num_classes: Not used by the computation; kept so existing callers
            that pass it continue to work.

    Returns:
        dict with the keys ``accuracy``, ``precision``/``recall``/``f1``
        (each ``{'macro': value}``), ``support`` (``{'per_class': {...}}``),
        ``classification_report`` (text), ``roc_curve``
        (``{'roc_auc': value}``) and ``confusion_matrix`` (2x2 nested list,
        rows = true class, columns = predicted class).

    Side effects:
        Draws the ROC curve with matplotlib on every call.
    """
    labels = np.asarray(pred.label_ids)
    logits = np.asarray(pred.predictions, dtype=np.float64)

    # Softmax over the class axis. Subtracting the row maximum first keeps
    # exp() from overflowing and does not change the resulting probabilities.
    exp_scores = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    preds = probs[:, 1]                      # probability of class 1
    hard_preds = (preds >= 0.5).astype(int)  # thresholded class decisions

    # Compute precision, recall, F1 score, and support for each class
    metrics_per_class = precision_recall_fscore_support(
        labels, hard_preds, average=None, labels=[0, 1]
    )

    # Compute overall accuracy, precision, recall, and F1 score
    acc = accuracy_score(labels, hard_preds)
    overall_metrics = precision_recall_fscore_support(labels, hard_preds, average='macro')

    # Create a classification report. `labels=[0, 1]` keeps the report aligned
    # with `class_names` even if one class is absent from this evaluation set.
    class_names = [str(i) for i in range(len(metrics_per_class[0]))]  # Replace with actual class names if available
    classification_rep = classification_report(
        labels, hard_preds, labels=[0, 1], target_names=class_names
    )

    # Compute ROC curve and AUC for binary classification (on probabilities)
    labels_binarized = label_binarize(labels, classes=[0, 1])
    fpr, tpr, _ = roc_curve(labels_binarized, preds)
    roc_auc = auc(fpr, tpr)

    # Compute confusion matrix; always 2x2, returned as plain nested lists so
    # the report contains no ndarray (which e.g. json cannot serialise).
    conf_matrix = confusion_matrix(labels, hard_preds, labels=[0, 1]).tolist()

    # Construct the metric report
    metric_report = {
        'accuracy': acc,
        'precision': {
            'macro': overall_metrics[0]
        },
        'recall': {
            'macro': overall_metrics[1]
        },
        'f1': {
            'macro': overall_metrics[2]
        },
        'support': {
            'per_class': {name: int(count) for name, count in zip(class_names, metrics_per_class[3])}
        },
        'classification_report': classification_rep,
        'roc_curve': {
            'roc_auc': roc_auc
        },
        'confusion_matrix': conf_matrix
    }

    # Plot ROC curve
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

    return metric_report




In [None]:
# Hyper-parameters for fine-tuning; logging and evaluation both run once per
# epoch.
training_args = TrainingArguments(
    output_dir='./results',           # where checkpoints and outputs are written
    num_train_epochs=3,               # total passes over the training split
    per_device_train_batch_size=4,    # training batch size per device
    per_device_eval_batch_size=2,     # evaluation batch size per device
    weight_decay=0.01,                # strength of weight decay
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    # warmup_steps=500,               # currently disabled
)

# Wire the model, the tokenized splits and the metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_output = trainer.train()
train_output

In [None]:
import pandas as pd

# Separate CSV used below for inference and scoring; its 'sentence' and
# 'label' columns are read by the following cells.
data_f = pd.read_csv("test_data.csv")

In [None]:
import torch

# Classify every sentence in `data_f` one at a time and collect the 0/1
# decisions in `predicted` (same order as the rows of `data_f`).
model.eval()      # inference mode: switches off dropout
threshold = 0.5   # class 1 is predicted when its probability exceeds this
predicted=[]
for text in data_f['sentence']:
  # New text to predict
  new_text = text
  # Tokenize and encode the new text. Truncation keeps over-long sentences
  # within the model's maximum input length, and the tensor is moved to the
  # device the model lives on (it may be on an accelerator after training).
  input_ids = tokenizer.encode(new_text, return_tensors="pt", truncation=True)
  input_ids = input_ids.to(model.device)

  # Forward pass through the model (no gradients needed for inference)
  with torch.no_grad():
      logits = model(input_ids)[0]

  # Apply softmax to get probabilities
  probs = torch.nn.functional.softmax(logits, dim=-1)

  # Threshold for binary classification
  predicted_class = 1 if probs[0][1] > threshold else 0
  predicted.append(predicted_class)

  #print("Predicted Class:", predicted_class)


In [None]:
# Ground-truth labels as a plain list, in the same row order as `predicted`.
actual = list(data_f['label'])

In [None]:
# Both lists should have one entry per row of `data_f`.
(len(actual), len(predicted))

In [None]:
# Manual accuracy: fraction of positions where the prediction equals the label.
correct_predictions = sum(
    1 for idx in range(len(predicted)) if predicted[idx] == actual[idx]
)
accuracy = correct_predictions / len(predicted)

print(f"Accuracy: {accuracy:.2%}")

In [None]:
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
import matplotlib.pyplot as plt

# Score the test-set predictions against the true labels.
# NOTE(review): `predicted` holds thresholded 0/1 decisions, so the ROC-AUC
# and ROC curve below are computed from hard labels rather than from
# probabilities.
val_y, predictions = actual, predicted

# Per-class precision / recall / F1 table
print("Classification Report:")
print(classification_report(val_y, predictions))

# Scalar summary metrics, printed in a fixed order
roc_auc = roc_auc_score(val_y, predictions)
print(f"ROC-AUC Score: {roc_auc:.4f}")

accuracy = accuracy_score(val_y, predictions)
print(f"Accuracy: {accuracy:.4f}")

f1 = f1_score(val_y, predictions)
print(f"F1 Score: {f1:.4f}")

precision = precision_score(val_y, predictions)
print(f"Precision: {precision:.4f}")

recall = recall_score(val_y, predictions)
print(f"Recall: {recall:.4f}")

# ROC curve with the chance diagonal for reference
fpr, tpr, thresholds = roc_curve(val_y, predictions)
roc_auc_curve = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc_curve))
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()
