In [7]:
import os
import pandas as pd
import torch
import numpy as np
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import torch.nn as nn
from torch.cuda.amp import GradScaler, autocast
from sklearn.model_selection import train_test_split

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# BERT model for multi-label narrative classification

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



In [10]:
class NarrativeClassificationBERT(nn.Module):
    def __init__(self, pretrained_model_name, num_labels):
        super(NarrativeClassificationBERT, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)

        # Classification head for multi-label classification
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # Use the pooled output
        logits = self.classifier(pooled_output)
        return logits

# Load annotations from Datset Provided
def load_narrative_annotations(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            article_id = fields[0]
            narratives = fields[1].split(';')  # Multi-label narratives
            data.append([article_id, narratives])
    return pd.DataFrame(data, columns=["article_id", "narratives"])

# Load and preprocess narrative data
annotations = load_narrative_annotations("/content/subtask-2-annotations.txt")

def load_all_articles(raw_documents_folder):
    articles = {}
    for filename in os.listdir(raw_documents_folder):
        if filename.endswith(".txt"):
            article_id = filename.split('.')[0]
            with open(os.path.join(raw_documents_folder, filename), 'r', encoding='utf-8') as f:
                articles[article_id] = f.read()
    return articles

articles = load_all_articles("/content/sample_data/raw_documents")

# Preprocessing the data and encoding narratives
def preprocess_narrative_data(annotations, articles):
    data = []
    mlb = MultiLabelBinarizer()  # Encoder for multi-label narratives

    # Fit the MultiLabelBinarizer on all narratives
    all_narratives = [narrative for narratives in annotations['narratives'] for narrative in narratives]
    mlb.fit([all_narratives])

    for _, row in annotations.iterrows():
        article_id = row['article_id'].split('.')[0]
        if article_id in articles:
            text = articles[article_id]
            inputs = tokenizer(
                text, padding='max_length', max_length=512, truncation=True, return_tensors="pt"
            ).to(device)

            # Encode the narratives as a binary vector
            narrative_labels = mlb.transform([row['narratives']])[0]
            data.append((inputs, torch.tensor(narrative_labels, dtype=torch.float).to(device)))

    return data, mlb

train_data, mlb = preprocess_narrative_data(annotations, articles)

# Create Dataset class for PyTorch
class NarrativeDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, labels = self.data[idx]
        return (
            inputs['input_ids'].squeeze(0),
            inputs['attention_mask'].squeeze(0),
            labels
        )

# Split data and create DataLoaders
train_data, val_data = train_test_split(train_data, test_size=0.25)
train_loader = DataLoader(NarrativeDataset(train_data), batch_size=4, shuffle=True)
val_loader = DataLoader(NarrativeDataset(val_data), batch_size=2, shuffle=False)

# Initialize model, optimizer, and loss function
num_narratives = len(mlb.classes_)
model = NarrativeClassificationBERT('bert-base-multilingual-cased', num_narratives).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
loss_fn = nn.BCEWithLogitsLoss()  # For multi-label classification
scaler = GradScaler()  # Mixed precision training

# Training and validation
def train_and_validate():
    for epoch in range(3):
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        print(f"\nEpoch {epoch + 1}")

        # Training loop
        for step, (input_ids, attention_mask, labels) in enumerate(train_loader):
            optimizer.zero_grad()

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            with autocast():
                logits = model(input_ids, attention_mask)
                loss = loss_fn(logits, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

            # Store predictions and labels for F1 Score calculation
            preds = torch.sigmoid(logits).cpu().detach().numpy()
            all_preds.extend((preds > 0.5).astype(int))
            all_labels.extend(labels.cpu().numpy())

        # Calculate and print Training F1 Score
        train_f1 = f1_score(np.array(all_labels), np.array(all_preds), average='micro')
        print(f"Training F1 Score after Epoch {epoch + 1}: {train_f1}")

        # Validation loop
        model.eval()
        all_val_preds = []
        all_val_labels = []

        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                with autocast():
                    logits = model(input_ids, attention_mask)

                val_preds = torch.sigmoid(logits).cpu().numpy()
                all_val_preds.extend((val_preds > 0.5).astype(int))
                all_val_labels.extend(labels.cpu().numpy())

        # Calculate and print Validation F1 Score
        val_f1 = f1_score(np.array(all_val_labels), np.array(all_val_preds), average='micro')
        print(f"Validation F1 Score after Epoch {epoch + 1}: {val_f1}")
# Save the fine-tuned model
def save_model():
    torch.save(model.state_dict(), "narrative_classification_model.pth")
    tokenizer.save_pretrained("narrative_tokenizer")
    print("Model and tokenizer saved successfully!")

# Train and validate the model
train_and_validate()

# Save the model
save_model()


  scaler = GradScaler()  # Mixed precision training
  with autocast():



Epoch 1
Training F1 Score after Epoch 1: 0.19338422391857507


  with autocast():


Validation F1 Score after Epoch 1: 0.0

Epoch 2


  with autocast():


Training F1 Score after Epoch 2: 0.25149700598802394


  with autocast():


Validation F1 Score after Epoch 2: 0.34146341463414637

Epoch 3


  with autocast():


Training F1 Score after Epoch 3: 0.17956656346749225


  with autocast():


Validation F1 Score after Epoch 3: 0.34146341463414637
Model and tokenizer saved successfully!
