# COS 760 Research Project: Analysing Sentiments for Low-resource African Languages

## Group Members: Mihir Arjun, Troy Clark, Hamza Mokiwa

## Establishing Baselines with Monolingual Long Short-Term Memory networks(LSTMs) and pre-trained Multilingual transformers

### First we need to install the datasets

In [1]:
%pip install datasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import os
from datasets import load_from_disk, load_dataset

def load_local_datasets():
    # Define the paths to your local datasets
    swa_path = "./datasets/afrisenti/swa"
    por_path = "./datasets/afrisenti/por"
    sot_path = "./datasets/news"
    
    # Check if the directories exist
    if not all(os.path.exists(path) for path in [swa_path, por_path,sot_path]):
        print("One or more dataset directories not found. Please check the paths.")
        return None, None, None
    
    # Load datasets from disk
    print("Loading Swahili (swa) dataset from disk...")
    swa_dataset = load_from_disk(swa_path)
    print("Swahili dataset loaded!")
    
    print("Loading Portuguese (por) dataset from disk...")
    por_dataset = load_from_disk(por_path)
    print("Portuguese dataset loaded!")
    
    print("Loading Sesotho (sot) dataset from disk...")
    sot_dataset = load_dataset("csv",  data_files="datasets/sotho-news/sotho_news_dataset.csv")
    print("Sesotho dataset loaded!")

    return swa_dataset, por_dataset, sot_dataset

if __name__ == "__main__":
    swa, por, sot = load_local_datasets() #add call for Sesotho dataset here
    
    # Now you can work with these datasets
    if swa is not None:
        print(f"Swahili dataset size: {len(swa['train'])} examples")
    if por is not None:
        print(f"Portuguese dataset size: {len(por['train'])} examples")
    if sot is not None:
        print(f"Sesotho dataset size: {len(sot['train'])} examples")

Loading Swahili (swa) dataset from disk...
Swahili dataset loaded!
Loading Portuguese (por) dataset from disk...
Portuguese dataset loaded!
Loading Sesotho (sot) dataset from disk...
Sesotho dataset loaded!
Swahili dataset size: 1810 examples
Portuguese dataset size: 3063 examples
Sesotho dataset size: 2177 examples


### Now that the datasets have been loaded, we can start creating our LSTM baseline models below:

#### First we will build an LSTM model for Swahili

In [12]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_from_disk
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# LSTM Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 2
DROPOUT = 0.2
LEARNING_RATE = 0.001
BATCH_SIZE = 32
NUM_EPOCHS = 10

class SwahiliSentimentDataset(Dataset):
    def __init__(self, tweets, labels, vocab, label_map):
        self.tweets = tweets
        self.labels = labels
        self.vocab = vocab
        self.label_map = label_map
    
    def __len__(self):
        return len(self.tweets)
    
    def __getitem__(self, idx):
        tweet = self.tweets[idx]
        label = self.labels[idx]
        
        # Convert text to indices using the vocabulary
        tokenized = [self.vocab.get(word, self.vocab['<UNK>']) for word in tweet.split()]
        return torch.tensor(tokenized, dtype=torch.long), torch.tensor(self.label_map[label], dtype=torch.long)

def collate_fn(batch):
    tweets, labels = zip(*batch)
    # Pad sequences to the length of the longest sequence in the batch
    tweets_padded = pad_sequence(tweets, batch_first=True, padding_value=0)
    return tweets_padded, torch.stack(labels)

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, 
                           bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text shape: [batch size, seq length]
        embedded = self.dropout(self.embedding(text))
        # embedded shape: [batch size, seq length, embedding dim]
        
        output, (hidden, cell) = self.lstm(embedded)
        # hidden shape: [2*num_layers, batch size, hidden dim]
        
        # Concatenate the final forward and backward hidden states
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        # hidden shape: [batch size, hidden dim * 2]
        
        hidden = self.dropout(hidden)
        return self.fc(hidden)

def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs):
    best_val_loss = float('inf')
    
    for epoch in range(epochs):
        # Training
        model.train()
        running_loss = 0.0
        
        for tweets, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            tweets, labels = tweets.to(device), labels.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(tweets)
            
            # Calculate loss
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        train_loss = running_loss / len(train_loader)
        
        # Validation
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for tweets, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
                tweets, labels = tweets.to(device), labels.to(device)
                
                outputs = model(tweets)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='weighted')
        
        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")
        
        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_swahili_lstm_model.pt")
            print("  Saved new best model!")
        
        print("-" * 60)
    
    return model

def evaluate_model(model, test_loader, criterion, device, label_list):
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for tweets, labels in tqdm(test_loader, desc="Testing"):
            tweets, labels = tweets.to(device), labels.to(device)
            
            outputs = model(tweets)
            loss = criterion(outputs, labels)
            
            test_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    test_loss /= len(test_loader)
    test_accuracy = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='weighted')
    
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    
    # Print detailed classification report
    class_names = [label_list[i] for i in range(len(label_list))]
    report = classification_report(all_labels, all_preds, target_names=class_names)
    print("\nClassification Report:")
    print(report)
    
    return test_loss, test_accuracy, test_f1

def main():
    # Check if GPU is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load the SwahiliSenti dataset from local storage
    try:
        swa_dataset = load_from_disk("./datasets/afrisenti/swa")
        print("Swahili dataset loaded successfully!")
        
        # Print out available columns to debug
        print(f"Available columns in train split: {swa_dataset['train'].column_names}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return
    
    # Explore dataset structure
    print(f"Dataset structure: {swa_dataset}")
    print(f"Train set size: {len(swa_dataset['train'])}")
    print(f"Test set size: {len(swa_dataset['test'])}")
    print(f"Validation set size: {len(swa_dataset['validation'])}")
    
    # Extract data - using 'tweet' column instead of 'text'
    train_tweets = swa_dataset['train']['tweet']
    train_labels = swa_dataset['train']['label']
    val_tweets = swa_dataset['validation']['tweet']
    val_labels = swa_dataset['validation']['label']
    test_tweets = swa_dataset['test']['tweet']
    test_labels = swa_dataset['test']['label']
    
    # Get unique labels and create label to index mapping
    # In AfriSenti, labels are usually 'positive', 'negative', 'neutral'
    unique_labels = set(train_labels + val_labels + test_labels)
    label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    print(f"Label mapping: {label_to_idx}")
    
    # Build vocabulary from training data
    word_counts = Counter()
    for tweet in train_tweets:
        word_counts.update(tweet.split())
    
    # Keep only words that appear at least 2 times
    min_freq = 2
    vocabulary = {'<PAD>': 0, '<UNK>': 1}
    vocab_idx = 2
    
    for word, count in word_counts.items():
        if count >= min_freq:
            vocabulary[word] = vocab_idx
            vocab_idx += 1
    
    print(f"Vocabulary size: {len(vocabulary)}")
    
    # Create datasets
    train_dataset = SwahiliSentimentDataset(train_tweets, train_labels, vocabulary, label_to_idx)
    val_dataset = SwahiliSentimentDataset(val_tweets, val_labels, vocabulary, label_to_idx)
    test_dataset = SwahiliSentimentDataset(test_tweets, test_labels, vocabulary, label_to_idx)
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    
    # Initialize model
    model = LSTMSentiment(
        vocab_size=len(vocabulary),
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=len(unique_labels),
        n_layers=NUM_LAYERS,
        dropout=DROPOUT,
        pad_idx=vocabulary['<PAD>']
    ).to(device)
    
    # Print model architecture
    print(f"Model architecture:\n{model}")
    
    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()
    
    # Train the model
    print("Starting training...")
    model = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        epochs=NUM_EPOCHS
    )
    
    # Load best model for evaluation
    model.load_state_dict(torch.load("best_swahili_lstm_model.pt"))
    
    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_loss, test_accuracy, test_f1 = evaluate_model(
        model=model,
        test_loader=test_loader,
        criterion=criterion,
        device=device,
        label_list=list(idx_to_label.values())
    )
    
    # Save evaluation results
    results = {
        "test_loss": test_loss,
        "test_accuracy": test_accuracy,
        "test_f1": test_f1,
        "embedding_dim": EMBEDDING_DIM,
        "hidden_dim": HIDDEN_DIM,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "vocab_size": len(vocabulary)
    }
    
    # Save results to CSV
    pd.DataFrame([results]).to_csv("swahili_lstm_results.csv", index=False)
    print(f"Results saved to swahili_lstm_results.csv")

if __name__ == "__main__":
    main()

Using device: cpu
Swahili dataset loaded successfully!
Available columns in train split: ['tweet', 'label']
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 1810
    })
    validation: Dataset({
        features: ['tweet', 'label'],
        num_rows: 453
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 748
    })
})
Train set size: 1810
Test set size: 748
Validation set size: 453
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Vocabulary size: 3055
Model architecture:
LSTMSentiment(
  (embedding): Embedding(3055, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Starting training...


Epoch 1/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.55it/s]
Epoch 1/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 36.87it/s]


Epoch 1/10:
  Train Loss: 0.9255
  Val Loss: 0.8964, Val Accuracy: 0.5872, Val F1: 0.4377
  Saved new best model!
------------------------------------------------------------


Epoch 2/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.84it/s]
Epoch 2/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 42.44it/s]


Epoch 2/10:
  Train Loss: 0.8834
  Val Loss: 0.9085, Val Accuracy: 0.5872, Val F1: 0.4720
------------------------------------------------------------


Epoch 3/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.59it/s]
Epoch 3/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 32.75it/s]


Epoch 3/10:
  Train Loss: 0.8355
  Val Loss: 0.9070, Val Accuracy: 0.5960, Val F1: 0.5168
------------------------------------------------------------


Epoch 4/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.93it/s]
Epoch 4/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 32.35it/s]


Epoch 4/10:
  Train Loss: 0.7643
  Val Loss: 0.9697, Val Accuracy: 0.5762, Val F1: 0.4923
------------------------------------------------------------


Epoch 5/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.20it/s]
Epoch 5/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 27.65it/s]


Epoch 5/10:
  Train Loss: 0.6388
  Val Loss: 1.0302, Val Accuracy: 0.5497, Val F1: 0.5318
------------------------------------------------------------


Epoch 6/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.23it/s]
Epoch 6/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 29.00it/s]


Epoch 6/10:
  Train Loss: 0.5488
  Val Loss: 1.2454, Val Accuracy: 0.5055, Val F1: 0.5120
------------------------------------------------------------


Epoch 7/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.92it/s]
Epoch 7/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 28.39it/s]


Epoch 7/10:
  Train Loss: 0.4460
  Val Loss: 1.2486, Val Accuracy: 0.5585, Val F1: 0.5315
------------------------------------------------------------


Epoch 8/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.64it/s]
Epoch 8/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 26.75it/s]


Epoch 8/10:
  Train Loss: 0.3874
  Val Loss: 1.5211, Val Accuracy: 0.4857, Val F1: 0.4953
------------------------------------------------------------


Epoch 9/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.96it/s]
Epoch 9/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 32.68it/s]


Epoch 9/10:
  Train Loss: 0.2739
  Val Loss: 1.5772, Val Accuracy: 0.5055, Val F1: 0.5088
------------------------------------------------------------


Epoch 10/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.60it/s]
Epoch 10/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 35.11it/s]


Epoch 10/10:
  Train Loss: 0.2220
  Val Loss: 1.5910, Val Accuracy: 0.5541, Val F1: 0.5337
------------------------------------------------------------

Evaluating on test set...


Testing: 100%|██████████| 24/24 [00:00<00:00, 27.83it/s]

Test Loss: 0.8965
Test Accuracy: 0.5936
Test F1 Score: 0.4446

Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        80
     neutral       0.59      1.00      0.74       444
    positive       0.50      0.00      0.01       224

    accuracy                           0.59       748
   macro avg       0.36      0.33      0.25       748
weighted avg       0.50      0.59      0.44       748

Results saved to swahili_lstm_results.csv



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Next, we build an LSTM model for Mozambican Portuguese:

In [13]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_from_disk
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# LSTM Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 2
DROPOUT = 0.2
LEARNING_RATE = 0.001
BATCH_SIZE = 32
NUM_EPOCHS = 10

class PorSentimentDataset(Dataset):
    def __init__(self, tweets, labels, vocab, label_map):
        self.tweets = tweets
        self.labels = labels
        self.vocab = vocab
        self.label_map = label_map
    
    def __len__(self):
        return len(self.tweets)
    
    def __getitem__(self, idx):
        tweet = self.tweets[idx]
        label = self.labels[idx]
        
        # Convert text to indices using the vocabulary
        tokenized = [self.vocab.get(word, self.vocab['<UNK>']) for word in tweet.split()]
        return torch.tensor(tokenized, dtype=torch.long), torch.tensor(self.label_map[label], dtype=torch.long)

def collate_fn(batch):
    tweets, labels = zip(*batch)
    # Pad sequences to the length of the longest sequence in the batch
    tweets_padded = pad_sequence(tweets, batch_first=True, padding_value=0)
    return tweets_padded, torch.stack(labels)

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, 
                           bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text shape: [batch size, seq length]
        embedded = self.dropout(self.embedding(text))
        # embedded shape: [batch size, seq length, embedding dim]
        
        output, (hidden, cell) = self.lstm(embedded)
        # hidden shape: [2*num_layers, batch size, hidden dim]
        
        # Concatenate the final forward and backward hidden states
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        # hidden shape: [batch size, hidden dim * 2]
        
        hidden = self.dropout(hidden)
        return self.fc(hidden)

def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs):
    best_val_loss = float('inf')
    
    for epoch in range(epochs):
        # Training
        model.train()
        running_loss = 0.0
        
        for tweets, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            tweets, labels = tweets.to(device), labels.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(tweets)
            
            # Calculate loss
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        train_loss = running_loss / len(train_loader)
        
        # Validation
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for tweets, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
                tweets, labels = tweets.to(device), labels.to(device)
                
                outputs = model(tweets)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='weighted')
        
        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")
        
        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_portuguese_lstm_model.pt")
            print("  Saved new best model!")
        
        print("-" * 60)
    
    return model

def evaluate_model(model, test_loader, criterion, device, label_list):
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for tweets, labels in tqdm(test_loader, desc="Testing"):
            tweets, labels = tweets.to(device), labels.to(device)
            
            outputs = model(tweets)
            loss = criterion(outputs, labels)
            
            test_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    test_loss /= len(test_loader)
    test_accuracy = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='weighted')
    
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    
    # Print detailed classification report
    class_names = [label_list[i] for i in range(len(label_list))]
    report = classification_report(all_labels, all_preds, target_names=class_names)
    print("\nClassification Report:")
    print(report)
    
    return test_loss, test_accuracy, test_f1

def main():
    # Check if GPU is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load the Portuguese dataset from local storage
    try:
        por_dataset = load_from_disk("./datasets/afrisenti/por")
        print("Mozambican Portuguese dataset loaded successfully!")
        
        # Print out available columns to debug
        print(f"Available columns in train split: {por_dataset['train'].column_names}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return
    
    # Explore dataset structure
    print(f"Dataset structure: {por_dataset}")
    print(f"Train set size: {len(por_dataset['train'])}")
    print(f"Test set size: {len(por_dataset['test'])}")
    print(f"Validation set size: {len(por_dataset['validation'])}")
    
    # Extract data - using 'tweet' column instead of 'text'
    train_tweets = por_dataset['train']['tweet']
    train_labels = por_dataset['train']['label']
    val_tweets = por_dataset['validation']['tweet']
    val_labels = por_dataset['validation']['label']
    test_tweets = por_dataset['test']['tweet']
    test_labels = por_dataset['test']['label']
    
    # Get unique labels and create label to index mapping
    # In AfriSenti, labels are usually 'positive', 'negative', 'neutral'
    unique_labels = set(train_labels + val_labels + test_labels)
    label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    print(f"Label mapping: {label_to_idx}")
    
    # Build vocabulary from training data
    word_counts = Counter()
    for tweet in train_tweets:
        word_counts.update(tweet.split())
    
    # Keep only words that appear at least 2 times
    min_freq = 2
    vocabulary = {'<PAD>': 0, '<UNK>': 1}
    vocab_idx = 2
    
    for word, count in word_counts.items():
        if count >= min_freq:
            vocabulary[word] = vocab_idx
            vocab_idx += 1
    
    print(f"Vocabulary size: {len(vocabulary)}")
    
    # Create datasets
    train_dataset = PorSentimentDataset(train_tweets, train_labels, vocabulary, label_to_idx)
    val_dataset = PorSentimentDataset(val_tweets, val_labels, vocabulary, label_to_idx)
    test_dataset = PorSentimentDataset(test_tweets, test_labels, vocabulary, label_to_idx)
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    
    # Initialize model
    model = LSTMSentiment(
        vocab_size=len(vocabulary),
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=len(unique_labels),
        n_layers=NUM_LAYERS,
        dropout=DROPOUT,
        pad_idx=vocabulary['<PAD>']
    ).to(device)
    
    # Print model architecture
    print(f"Model architecture:\n{model}")
    
    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()
    
    # Train the model
    print("Starting training...")
    model = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        epochs=NUM_EPOCHS
    )
    
    # Load best model for evaluation
    model.load_state_dict(torch.load("best_portuguese_lstm_model.pt"))
    
    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_loss, test_accuracy, test_f1 = evaluate_model(
        model=model,
        test_loader=test_loader,
        criterion=criterion,
        device=device,
        label_list=list(idx_to_label.values())
    )
    
    # Save evaluation results
    results = {
        "test_loss": test_loss,
        "test_accuracy": test_accuracy,
        "test_f1": test_f1,
        "embedding_dim": EMBEDDING_DIM,
        "hidden_dim": HIDDEN_DIM,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "vocab_size": len(vocabulary)
    }
    
    # Save results to CSV
    pd.DataFrame([results]).to_csv("portuguese_lstm_results.csv", index=False)
    print(f"Results saved to portuguese_lstm_results.csv")

if __name__ == "__main__":
    main()

Using device: cpu
Mozambican Portuguese dataset loaded successfully!
Available columns in train split: ['tweet', 'label']
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 3063
    })
    validation: Dataset({
        features: ['tweet', 'label'],
        num_rows: 767
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 3662
    })
})
Train set size: 3063
Test set size: 3662
Validation set size: 767
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Vocabulary size: 4075
Model architecture:
LSTMSentiment(
  (embedding): Embedding(4075, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Starting training...


Epoch 1/10 - Training: 100%|██████████| 96/96 [00:12<00:00,  7.80it/s]
Epoch 1/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 19.98it/s]


Epoch 1/10:
  Train Loss: 1.0226
  Val Loss: 0.9994, Val Accuracy: 0.5215, Val F1: 0.3575
  Saved new best model!
------------------------------------------------------------


Epoch 2/10 - Training: 100%|██████████| 96/96 [00:15<00:00,  6.37it/s]
Epoch 2/10 - Validation: 100%|██████████| 24/24 [00:00<00:00, 24.36it/s]


Epoch 2/10:
  Train Loss: 0.9703
  Val Loss: 0.9651, Val Accuracy: 0.5450, Val F1: 0.4652
  Saved new best model!
------------------------------------------------------------


Epoch 3/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  6.98it/s]
Epoch 3/10 - Validation: 100%|██████████| 24/24 [00:00<00:00, 24.23it/s]


Epoch 3/10:
  Train Loss: 0.8965
  Val Loss: 0.9932, Val Accuracy: 0.5567, Val F1: 0.4923
------------------------------------------------------------


Epoch 4/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  6.95it/s]
Epoch 4/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 20.20it/s]


Epoch 4/10:
  Train Loss: 0.7983
  Val Loss: 0.9902, Val Accuracy: 0.5606, Val F1: 0.5459
------------------------------------------------------------


Epoch 5/10 - Training: 100%|██████████| 96/96 [00:14<00:00,  6.67it/s]
Epoch 5/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 23.91it/s]


Epoch 5/10:
  Train Loss: 0.6935
  Val Loss: 1.0411, Val Accuracy: 0.5528, Val F1: 0.5396
------------------------------------------------------------


Epoch 6/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  6.92it/s]
Epoch 6/10 - Validation: 100%|██████████| 24/24 [00:00<00:00, 25.49it/s]


Epoch 6/10:
  Train Loss: 0.5816
  Val Loss: 1.1400, Val Accuracy: 0.5424, Val F1: 0.5358
------------------------------------------------------------


Epoch 7/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  7.09it/s]
Epoch 7/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 21.56it/s]


Epoch 7/10:
  Train Loss: 0.4691
  Val Loss: 1.2334, Val Accuracy: 0.5593, Val F1: 0.5388
------------------------------------------------------------


Epoch 8/10 - Training: 100%|██████████| 96/96 [00:14<00:00,  6.71it/s]
Epoch 8/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 23.63it/s]


Epoch 8/10:
  Train Loss: 0.3591
  Val Loss: 1.4979, Val Accuracy: 0.5332, Val F1: 0.5243
------------------------------------------------------------


Epoch 9/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  7.02it/s]
Epoch 9/10 - Validation: 100%|██████████| 24/24 [00:00<00:00, 28.58it/s]


Epoch 9/10:
  Train Loss: 0.3022
  Val Loss: 1.4727, Val Accuracy: 0.5111, Val F1: 0.5080
------------------------------------------------------------


Epoch 10/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  6.88it/s]
Epoch 10/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 21.51it/s]


Epoch 10/10:
  Train Loss: 0.2282
  Val Loss: 1.6321, Val Accuracy: 0.5332, Val F1: 0.5234
------------------------------------------------------------

Evaluating on test set...


Testing: 100%|██████████| 115/115 [00:04<00:00, 25.81it/s]

Test Loss: 0.8757
Test Accuracy: 0.6393
Test F1 Score: 0.5655

Classification Report:
              precision    recall  f1-score   support

    negative       0.31      0.15      0.21       655
     neutral       0.67      0.92      0.78      2379
    positive       0.62      0.08      0.14       628

    accuracy                           0.64      3662
   macro avg       0.54      0.38      0.37      3662
weighted avg       0.60      0.64      0.57      3662

Results saved to portuguese_lstm_results.csv





#### Lastly, we build an LSTM model for Sesotho

In [7]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_from_disk
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# LSTM Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 2
DROPOUT = 0.2
LEARNING_RATE = 0.001
BATCH_SIZE = 32
NUM_EPOCHS = 10

class SotSentimentDataset(Dataset):
    def __init__(self, headlines, labels, vocab, label_map):
        self.headlines = headlines
        self.labels = labels
        self.vocab = vocab
        self.label_map = label_map

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        tweet = self.headlines[idx]
        label = self.labels[idx]

        # Convert text to indices using the vocabulary
        tokenized = [self.vocab.get(word, self.vocab['<UNK>']) for word in tweet.split()]
        return torch.tensor(tokenized, dtype=torch.long), torch.tensor(self.label_map[label], dtype=torch.long)

def collate_fn(batch):
    headlines, labels = zip(*batch)
    # Pad sequences to the length of the longest sequence in the batch
    headlines_padded = pad_sequence(headlines, batch_first=True, padding_value=0)
    return headlines_padded, torch.stack(labels)

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text shape: [batch size, seq length]
        embedded = self.dropout(self.embedding(text))
        # embedded shape: [batch size, seq length, embedding dim]

        output, (hidden, cell) = self.lstm(embedded)
        # hidden shape: [2*num_layers, batch size, hidden dim]

        # Concatenate the final forward and backward hidden states
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        # hidden shape: [batch size, hidden dim * 2]

        hidden = self.dropout(hidden)
        return self.fc(hidden)

def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs):
    best_val_loss = float('inf')

    for epoch in range(epochs):
        # Training
        model.train()
        running_loss = 0.0

        for headlines, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            headlines, labels = headlines.to(device), labels.to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(headlines)

            # Calculate loss
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        train_loss = running_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for headlines, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
                headlines, labels = headlines.to(device), labels.to(device)

                outputs = model(headlines)
                loss = criterion(outputs, labels)

                val_loss += loss.item()

                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='weighted')

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_portuguese_lstm_model.pt")
            print("  Saved new best model!")

        print("-" * 60)

    return model

def evaluate_model(model, test_loader, criterion, device, label_list):
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for headlines, labels in tqdm(test_loader, desc="Testing"):
            headlines, labels = headlines.to(device), labels.to(device)

            outputs = model(headlines)
            loss = criterion(outputs, labels)

            test_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    test_loss /= len(test_loader)
    test_accuracy = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")

    # Print detailed classification report
    class_names = [label_list[i] for i in range(len(label_list))]
    report = classification_report(all_labels, all_preds, target_names=class_names)
    print("\nClassification Report:")
    print(report)

    return test_loss, test_accuracy, test_f1

def main():
    # Check if GPU is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load the Sesotho dataset from local storage
    try:
        sot_dataset = load_from_disk("./datasets/sesotho_news_dataset")
        print("Sesotho dataset loaded successfully!")

        # Print out available columns to debug
        print(f"Available columns in train split: {sot_dataset['train'].column_names}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    # Explore dataset structure
    print(f"Dataset structure: {sot_dataset}")
    print(f"Train set size: {len(sot_dataset['train'])}")
    print(f"Test set size: {len(sot_dataset['test'])}")
    print(f"Validation set size: {len(sot_dataset['validation'])}")

    # Extract data - using 'headline' column instead of 'text'
    train_headlines = sot_dataset['train']['headline']
    train_labels = sot_dataset['train']['label']
    val_headlines = sot_dataset['validation']['headline']
    val_labels = sot_dataset['validation']['label']
    test_headlines = sot_dataset['test']['headline']
    test_labels = sot_dataset['test']['label']

    # Get unique labels and create label to index mapping
    # The labels in the sesotho news dataset were originally (-1,0,1) but were changed to 'positive', 'negative', 'neutral'
    unique_labels = set(train_labels + val_labels + test_labels)
    label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    print(f"Label mapping: {label_to_idx}")

    # Build vocabulary from training data
    word_counts = Counter()
    for headline in train_headlines:
        word_counts.update(headline.split())

    # Keep only words that appear at least 2 times
    min_freq = 2
    vocabulary = {'<PAD>': 0, '<UNK>': 1}
    vocab_idx = 2

    for word, count in word_counts.items():
        if count >= min_freq:
            vocabulary[word] = vocab_idx
            vocab_idx += 1

    print(f"Vocabulary size: {len(vocabulary)}")

    # Create datasets
    train_dataset = SotSentimentDataset(train_headlines, train_labels, vocabulary, label_to_idx)
    val_dataset = SotSentimentDataset(val_headlines, val_labels, vocabulary, label_to_idx)
    test_dataset = SotSentimentDataset(test_headlines, test_labels, vocabulary, label_to_idx)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    # Initialize model
    model = LSTMSentiment(
        vocab_size=len(vocabulary),
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=len(unique_labels),
        n_layers=NUM_LAYERS,
        dropout=DROPOUT,
        pad_idx=vocabulary['<PAD>']
    ).to(device)

    # Print model architecture
    print(f"Model architecture:\n{model}")

    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    # Train the model
    print("Starting training...")
    model = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        epochs=NUM_EPOCHS
    )

    # Save the model's state_dict
    torch.save(model.state_dict(), "best_sesotho_lstm_model.pt");

    # Load best model for evaluation
    model.load_state_dict(torch.load("best_sesotho_lstm_model.pt"))

    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_loss, test_accuracy, test_f1 = evaluate_model(
        model=model,
        test_loader=test_loader,
        criterion=criterion,
        device=device,
        label_list=list(idx_to_label.values())
    )

    # Save evaluation results
    results = {
        "test_loss": test_loss,
        "test_accuracy": test_accuracy,
        "test_f1": test_f1,
        "embedding_dim": EMBEDDING_DIM,
        "hidden_dim": HIDDEN_DIM,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "vocab_size": len(vocabulary)
    }

    # Save results to CSV
    pd.DataFrame([results]).to_csv("sesotho_lstm_results.csv", index=False)
    print(f"Results saved to sesotho_lstm_results.csv")

if __name__ == "__main__":
    main()

Using device: cpu
Sesotho dataset loaded successfully!
Available columns in train split: ['headline', 'label', '__index_level_0__']
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 1305
    })
    validation: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 436
    })
    test: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 436
    })
})
Train set size: 1305
Test set size: 436
Validation set size: 436
Label mapping: {'neutral': 0, 'negative': 1, 'positive': 2}
Vocabulary size: 876
Model architecture:
LSTMSentiment(
  (embedding): Embedding(876, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Starting training...


Epoch 1/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 50.00it/s]
Epoch 1/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 212.12it/s]


Epoch 1/10:
  Train Loss: 0.7751
  Val Loss: 0.7289, Val Accuracy: 0.6950, Val F1: 0.5699
  Saved new best model!
------------------------------------------------------------


Epoch 2/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 51.51it/s]
Epoch 2/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 202.90it/s]


Epoch 2/10:
  Train Loss: 0.6275
  Val Loss: 0.7368, Val Accuracy: 0.7179, Val F1: 0.6419
------------------------------------------------------------


Epoch 3/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 52.90it/s]
Epoch 3/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 215.37it/s]


Epoch 3/10:
  Train Loss: 0.5516
  Val Loss: 0.7012, Val Accuracy: 0.7317, Val F1: 0.6920
  Saved new best model!
------------------------------------------------------------


Epoch 4/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 56.79it/s]
Epoch 4/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 200.00it/s]


Epoch 4/10:
  Train Loss: 0.5039
  Val Loss: 0.7370, Val Accuracy: 0.7385, Val F1: 0.6711
------------------------------------------------------------


Epoch 5/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 57.91it/s]
Epoch 5/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 212.13it/s]


Epoch 5/10:
  Train Loss: 0.4261
  Val Loss: 0.7007, Val Accuracy: 0.7271, Val F1: 0.7154
  Saved new best model!
------------------------------------------------------------


Epoch 6/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 55.86it/s]
Epoch 6/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 215.39it/s]


Epoch 6/10:
  Train Loss: 0.3852
  Val Loss: 0.7299, Val Accuracy: 0.7638, Val F1: 0.7379
------------------------------------------------------------


Epoch 7/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 59.33it/s]
Epoch 7/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 218.75it/s]


Epoch 7/10:
  Train Loss: 0.2882
  Val Loss: 0.7523, Val Accuracy: 0.7454, Val F1: 0.7287
------------------------------------------------------------


Epoch 8/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 58.32it/s]
Epoch 8/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 218.74it/s]


Epoch 8/10:
  Train Loss: 0.2573
  Val Loss: 0.7572, Val Accuracy: 0.7156, Val F1: 0.7123
------------------------------------------------------------


Epoch 9/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 57.66it/s]
Epoch 9/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 199.99it/s]


Epoch 9/10:
  Train Loss: 0.1974
  Val Loss: 0.8882, Val Accuracy: 0.7271, Val F1: 0.7143
------------------------------------------------------------


Epoch 10/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 53.81it/s]
Epoch 10/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 222.22it/s]


Epoch 10/10:
  Train Loss: 0.1676
  Val Loss: 0.9858, Val Accuracy: 0.7385, Val F1: 0.7134
------------------------------------------------------------

Evaluating on test set...


Testing: 100%|██████████| 14/14 [00:00<00:00, 237.29it/s]

Test Loss: 0.9303
Test Accuracy: 0.7202
Test F1 Score: 0.7024

Classification Report:
              precision    recall  f1-score   support

     neutral       0.83      0.25      0.38        20
    negative       0.77      0.87      0.82       308
    positive       0.49      0.38      0.43       108

    accuracy                           0.72       436
   macro avg       0.70      0.50      0.54       436
weighted avg       0.71      0.72      0.70       436

Results saved to sesotho_lstm_results.csv





Saving the dataset (1/1 shards): 100%|██████████| 1305/1305 [00:00<00:00, 326273.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 436/436 [00:00<00:00, 109014.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 436/436 [00:00<00:00, 145297.68 examples/s]


### We now create Baselines with pre-trained Multilingual transformers

#### Baseline with [mBERT](https://huggingface.co/google-bert/bert-base-multilingual-cased)

#### Baseline with [XLM-RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/xlm-roberta)

#### Baseline with [AfroXLMR](https://huggingface.co/Davlan/afro-xlmr-large)


### We now perform fine-tuning on the Multilingual transformers


#### Fine-tuning [mBERT](https://huggingface.co/google-bert/bert-base-multilingual-cased)


#### Fine-tuning [XLM-RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/xlm-roberta)

#### Fine-tuning [XLM-RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/xlm-roberta)

## We now implement and evaluate the effect of Transfer learning on Transformer performance

## We now implement and evaluate the effects of several Augmentation Strategies

### Firstly, we explore the effects of Back-Translation on model performance when analysing the Mozambican Portuguese data

### Next, we explore the effects of Contextual Substitution on model performance when analysing the Mozambican Portuguese data


### Next, we explore the effects of adapters on model performance
This will be through using Cross-Lingual transfer learning (CLTL) with Swahili data with the aim of improving model performance on Sesotho.
This decision was made because...(sesotho has less resources for example)


### Lastly, we use LLM-based synthetic data generation with the aim of addressing class imbalance


## This section seeks to use attention visualise and feature attribution to find language-specific sentiment patterns
Furthermore, we explore how linguistic nuances within our affect classification across languages. There is a particular focus on Sesotho models because....

### SHAP
Link: [SHAP docs](https://shap.readthedocs.io/en/latest/)

### LIME
Link: [LIME docs](https://uc-r.github.io/lime)