In [11]:
import torch
from datasets import load_dataset, DatasetDict
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [13]:
# Prefer a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [15]:
# Pick the best available accelerator: CUDA first, then the MPS (Apple's GPU) backend,
# and only then the CPU. Checking MPS alone would replace an earlier CUDA selection
# with "cpu" on a machine that has CUDA but no MPS.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


In [19]:
# Read each CSV split separately; every call returns its rows under the 'train' key
train_data = load_dataset('csv', data_files='3_split_data/train.csv')
test_data = load_dataset('csv', data_files='3_split_data/test.csv')
validation_data = load_dataset('csv', data_files='3_split_data/validation.csv')

# Re-key the three single-split results into one DatasetDict (train / test / validation)
dataset = DatasetDict()
dataset['train'] = train_data['train']
dataset['test'] = test_data['train']
dataset['validation'] = validation_data['train']
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['content', 'label'],
        num_rows: 1625
    })
    test: Dataset({
        features: ['content', 'label'],
        num_rows: 434
    })
    validation: Dataset({
        features: ['content', 'label'],
        num_rows: 650
    })
})


In [21]:
# Tokenizer matching the multilingual BERT checkpoint used for fine-tuning
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def tokenize_function(examples):
    """Tokenize the 'content' field of a batch of examples.

    Every sequence is truncated and padded to the tokenizer's maximum length,
    so all rows come out the same size.

    Args:
        examples: Mapping with a 'content' entry holding the text(s) to encode
            (a batch of rows when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer output for those texts.
    """
    texts = examples['content']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply the tokenizer to every split, processing rows in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)

Map:   0%|          | 0/1625 [00:00<?, ? examples/s]

Map:   0%|          | 0/434 [00:00<?, ? examples/s]

Map:   0%|          | 0/650 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1625
    })
    test: Dataset({
        features: ['content', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 434
    })
    validation: Dataset({
        features: ['content', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 650
    })
})


In [22]:
# Multilingual BERT with a sequence-classification head sized for 16 labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased',
    num_labels=16,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Define the compute_metrics function
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        p: Prediction container exposing ``predictions`` (per-class scores; the
            predicted class is the argmax over the last axis) and ``label_ids``
            (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores = p.predictions
    # Some models hand back a tuple of outputs; assume the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)
    labels = p.label_ids

    accuracy = accuracy_score(labels, preds)
    # zero_division=0 keeps the numeric result of the default ("warn" also scores 0)
    # but stops an UndefinedMetricWarning on every evaluation when some class
    # receives no predictions.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Hyperparameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=5,
    per_device_train_batch_size=8,  # tune to fit available memory
    per_device_eval_batch_size=8,   # tune to fit available memory
    weight_decay=0.01,
    # Reporting cadence: evaluate once per epoch, log every 10 steps
    eval_strategy="epoch",
    logging_steps=10,
    # BF16 precision, chosen by the author for MPS compatibility.
    # NOTE(review): confirm bf16 is supported on the target device and library versions.
    bf16=True,
)

# Bundle the model, run configuration, data splits and metric callback into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset['validation'],
    train_dataset=tokenized_dataset['train'],
)

# Run fine-tuning; the validation split is evaluated as configured in training_args
trainer.train()

# Everything the Trainer logged during the run, as a list of dicts
training_history = trainer.state.log_history

# Entries carrying evaluation metrics (one per evaluation pass)
eval_entries = [entry for entry in training_history if 'eval_accuracy' in entry]

# Per-evaluation metric series, all aligned with `epochs`
epochs = [entry['epoch'] for entry in eval_entries]
accuracies = [entry['eval_accuracy'] for entry in eval_entries]
precisions = [entry['eval_precision'] for entry in eval_entries]
recalls = [entry['eval_recall'] for entry in eval_entries]
f1_scores = [entry['eval_f1'] for entry in eval_entries]

# Loss series: training loss from the periodic training logs, validation loss from eval logs
training_losses = [entry['loss'] for entry in training_history if 'loss' in entry]
eval_losses = [entry['eval_loss'] for entry in training_history if 'eval_loss' in entry]

# One figure showing how each validation metric evolves across epochs
plt.figure(figsize=(10, 6))

# Draw the four metric curves in a fixed order so the legend order is stable
metric_series = (
    (accuracies, 'Accuracy'),
    (precisions, 'Precision'),
    (recalls, 'Recall'),
    (f1_scores, 'F1 Score'),
)
for series_values, series_label in metric_series:
    plt.plot(epochs, series_values, label=series_label, marker='o')

plt.title('Evaluation Metrics Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.show()

# Plotting the loss values
plt.figure(figsize=(10, 6))

# Training loss is logged every `logging_steps` steps, while validation loss is logged
# once per evaluation, so the two series have different lengths. Plot each one against
# the epoch recorded in its own log entry (not its list index) so both curves share a
# real epoch axis and line up with each other.
train_loss_epochs = [entry['epoch'] for entry in training_history if 'loss' in entry]
train_loss_values = [entry['loss'] for entry in training_history if 'loss' in entry]
eval_loss_epochs = [entry['epoch'] for entry in training_history if 'eval_loss' in entry]
eval_loss_values = [entry['eval_loss'] for entry in training_history if 'eval_loss' in entry]

# Plot training and validation loss
plt.plot(train_loss_epochs, train_loss_values, label='Training Loss', marker='o')
plt.plot(eval_loss_epochs, eval_loss_values, label='Validation Loss', marker='o')

plt.title('Training and Validation Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Save the fine-tuned model
model_output_dir = './nepali_news_mbert_fine_tuned_model'
trainer.save_model(model_output_dir)

# The tokenizer was never passed to the Trainer, so save_model() does not write it.
# Store it next to the weights so the directory can be reloaded for inference on its own.
# The trailing semicolon keeps the returned file list out of the cell output.
tokenizer.save_pretrained(model_output_dir);