In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json


In [None]:
cd /news

In [None]:
# Category name -> CSV file holding that category's articles.
files = {
    "Economy": "Economy_40k.csv",
    "National": "National_40k.csv",
    "Education": "Education_40k.csv",
    "ScienceTechnology": "ScienceTechnology_40k.csv",
    "Entertainment": "Entertainment_40k.csv",
    "Sports": "Sports_40k.csv",
    "International": "International_40k.csv",
    "Politics": "politics_40k.csv",
}

# Load every category file, tagging each row with its category name, then
# stack everything into a single frame with a fresh 0..n-1 index.
dataframes = [
    pd.read_csv(csv_path).assign(label=category)
    for category, csv_path in files.items()
]
combined_df = pd.concat(dataframes, ignore_index=True)

# Swap the category names for integer codes and remember the name -> code map.
label_encoder = LabelEncoder()
combined_df["label"] = label_encoder.fit_transform(combined_df["label"])
label_mapping = {
    name: code
    for name, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

# 80/20 split with a fixed seed, then keep only the first 2000 rows of each
# part to reduce the dataset size for training.
train_texts, test_texts, train_labels, test_labels = (
    part.iloc[:2000]
    for part in train_test_split(
        combined_df["article"],
        combined_df["label"],
        test_size=0.2,
        random_state=42,
    )
)


In [None]:
def torch_kron(a, b):
    """Kronecker product of two 2-D tensors, computed by broadcasting.

    Args:
        a: Tensor whose first two dimension sizes are (rows_a, cols_a).
        b: Tensor whose first two dimension sizes are (rows_b, cols_b).

    Returns:
        Tensor of shape (rows_a * rows_b, cols_a * cols_b) in which the block
        at block-row i, block-column j equals ``a[i, j] * b``.
    """
    rows_a, cols_a = a.size(0), a.size(1)
    rows_b, cols_b = b.size(0), b.size(1)

    # Interleave singleton axes so the product broadcasts to
    # (rows_a, rows_b, cols_a, cols_b).
    a_expanded = torch.reshape(a, [rows_a, 1, cols_a, 1])
    b_expanded = torch.reshape(b, [1, rows_b, 1, cols_b])
    blocks = a_expanded * b_expanded

    # Collapse (row, row) and (col, col) axis pairs into the final matrix.
    return torch.reshape(blocks, [rows_a * rows_b, cols_a * cols_b])


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  

In [None]:
class Pretraining_model(nn.Module):
    """Masked-LM backbone plus a square linear projection of the first-token state.

    The backbone is loaded with ``AutoModelForMaskedLM`` so that a single
    forward pass yields both the MLM loss and the hidden states used for the
    projected embedding.
    """

    def __init__(self, model_name):
        """
        model_name: Name or path of the checkpoint passed to ``from_pretrained``.
        """
        super().__init__()
        self.bert = AutoModelForMaskedLM.from_pretrained(model_name)
        width = self.bert.config.hidden_size
        # Projection keeps the hidden size unchanged (hidden_size -> hidden_size).
        self.head = nn.Linear(width, width)

    def forward(self, input_ids, attention_mask, labels):
        """
        input_ids: Token IDs fed to the backbone.
        attention_mask: Attention mask fed to the backbone.
        labels: MLM labels forwarded to the backbone for its loss.
        Returns:
            (projected first-token state of the last hidden layer, backbone loss).
        """
        mlm_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True,
        )
        last_layer = mlm_out.hidden_states[-1]
        first_token_state = last_layer[:, 0, :]
        return self.head(first_token_state), mlm_out.loss

In [None]:
# Supervised Contrastive-Masked Pretraining (SCMP)

In [None]:
# Supervised contrastive + masked-LM pretraining.
#
# Every inner iteration draws two articles per class. The first article of each
# class forms the "anchor" half of the batch and the second forms the
# "positive" half, so row i of both halves belongs to the same class. The
# contrastive term is a cross-entropy over negative squared distances whose
# target for positive i is anchor i; the MLM loss of the same forward pass is
# added on top.
model_path = "/home2/ss23aga/distilbert-base-multilingual-cased"
model = Pretraining_model(model_path).to(device)
# NOTE(review): the weights come from a local directory while the tokenizer is
# resolved by hub name -- confirm both refer to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

train_text_list = train_texts.tolist()
train_labels_array = np.array(train_labels)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)

num_classes = 8
mlm_probability = 0.3
target_labels = torch.arange(num_classes, dtype=torch.int64, device=device)

# Loop-invariant: positions of the training samples of every class.
class_to_indices = {
    class_idx: np.flatnonzero(train_labels_array == class_idx)
    for class_idx in range(num_classes)
}
under_populated = [
    class_idx for class_idx, members in class_to_indices.items() if members.size < 2
]
if under_populated:
    raise ValueError(
        f"Each class needs at least 2 training samples; too few for classes {under_populated}"
    )

num_epochs = 3
num_inner_iterations = 4

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad()

    for _ in range(num_inner_iterations):
        anchor_indices = []
        positive_indices = []
        selected_class_order = np.random.choice(num_classes, num_classes, replace=False)

        for class_idx in selected_class_order:
            members = class_to_indices[int(class_idx)]
            first, second = np.random.choice(members.size, 2, replace=False)
            anchor_indices.append(int(members[first]))
            positive_indices.append(int(members[second]))

        # Anchors first, positives second: position i and position
        # num_classes + i of the batch share a class.
        sample_indices = anchor_indices + positive_indices

        train_text_subset = [train_text_list[i] for i in sample_indices]
        encoding = tokenizer(
            train_text_subset,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        input_ids = encoding["input_ids"].clone()
        attention_mask = encoding["attention_mask"]
        labels = input_ids.clone()

        # Never mask special tokens (padding, [CLS], [SEP], ...): the
        # first-token state is the embedding used by the contrastive term.
        special_positions = torch.zeros_like(input_ids, dtype=torch.bool)
        for special_id in tokenizer.all_special_ids:
            special_positions |= input_ids == special_id

        probability_matrix = torch.full(labels.shape, mlm_probability)
        masked_indices = torch.bernoulli(probability_matrix).bool() & ~special_positions
        labels[~masked_indices] = -100  # positions ignored by the MLM loss
        input_ids[masked_indices] = tokenizer.mask_token_id

        model_outputs, mlm_loss = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device)
        )

        anchors = model_outputs[0:num_classes]
        positives = model_outputs[num_classes:2 * num_classes]

        # logits[i, j] = -||positives[i] - anchors[j]||^2; the correct "class"
        # for row i is column i (same-class anchor).
        pairwise_diff = positives.unsqueeze(1) - anchors.unsqueeze(0)
        logits = -pairwise_diff.pow(2).sum(dim=-1)

        iteration_loss = loss_fn(logits, target_labels) + mlm_loss
        # Backpropagate per iteration: gradients accumulate exactly as if the
        # summed loss were backpropagated once, but each graph is freed early.
        iteration_loss.backward()
        total_loss += iteration_loss.item()

    optimizer.step()
    print(f"Epoch {epoch} - Loss: {total_loss}")

# Only the backbone is saved; the projection head is not reused downstream.
model.bert.save_pretrained("pretrained_distilbert_scmp")


In [None]:
# Fine-tuning with a joint loss function (classification cross-entropy + MLM loss)

In [None]:
 class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Each item is a dict with the padded/truncated token IDs, the matching
    attention mask and the label as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """
        texts: Indexable collection of text strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable tokenizer (Hugging Face style).
        max_length: Length every sequence is padded/truncated to.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and bundle it with its label."""
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )
        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can stack items itself.
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

class ClassifierModel(nn.Module):
    """Masked-LM backbone with a linear classification head on the first-token state."""

    def __init__(self, pretrained_model_name, num_labels):
        """
        pretrained_model_name: Name or path of the pretrained checkpoint.
        num_labels: Number of target classes.
        """
        super().__init__()
        self.bert = AutoModelForMaskedLM.from_pretrained(pretrained_model_name)
        hidden_size = self.bert.config.hidden_size
        self.head = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels):
        """
        input_ids: Tokenized input IDs.
        attention_mask: Attention mask for inputs.
        labels: Labels for the masked language modeling (MLM) task.
        Returns:
            logits: Classification logits.
            mlm_loss: Loss computed by the MLM head.
        """
        backbone_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True,
        )
        # Classify from position 0 of the last hidden layer.
        last_layer = backbone_out.hidden_states[-1]
        logits = self.head(last_layer[:, 0, :])
        mlm_loss = backbone_out.loss
        return logits, mlm_loss

def _resolve_mlm_tokenizer(dataloader, tokenizer):
    """Pick the tokenizer whose mask/special IDs drive the MLM masking."""
    if tokenizer is not None:
        return tokenizer
    # Prefer the tokenizer attached to the dataset that produced the batches.
    dataset_tokenizer = getattr(getattr(dataloader, "dataset", None), "tokenizer", None)
    if dataset_tokenizer is not None:
        return dataset_tokenizer
    # Last resort, kept for backward compatibility: a module-level `tokenizer`.
    global_tokenizer = globals().get("tokenizer")
    if global_tokenizer is None:
        raise ValueError(
            "No tokenizer available: pass tokenizer=... or use a dataset with a .tokenizer attribute"
        )
    return global_tokenizer


def _mask_for_mlm(input_ids, tokenizer, mlm_probability):
    """Return (masked copy of input_ids, MLM labels) without touching input_ids."""
    # Special tokens (padding, [CLS], [SEP], ...) are never masked; the
    # first-token state is what the classification head reads.
    special_positions = torch.zeros_like(input_ids, dtype=torch.bool)
    for special_id in tokenizer.all_special_ids:
        special_positions |= input_ids == special_id

    probability_matrix = torch.full(
        input_ids.shape, mlm_probability, device=input_ids.device
    )
    masked_indices = torch.bernoulli(probability_matrix).bool() & ~special_positions

    mlm_labels = input_ids.clone()
    mlm_labels[~masked_indices] = -100  # -100 is ignored in loss computation
    masked_input_ids = input_ids.clone()
    masked_input_ids[masked_indices] = tokenizer.mask_token_id
    return masked_input_ids, mlm_labels


def train_model(model, dataloader, optimizer, loss_fn, device, tokenizer=None, mlm_probability=0.3):
    """
    Trains the model for one epoch on the joint classification + MLM loss.

    model: The classification model; called with input_ids, attention_mask and
        labels, and expected to return (logits, mlm_loss).
    dataloader: Iterable of batches with "input_ids", "attention_mask", "label".
    optimizer: Optimizer.
    loss_fn: Loss function for classification.
    device: Device ("cuda" or "cpu").
    tokenizer: Tokenizer providing mask_token_id and all_special_ids. When
        omitted, dataloader.dataset.tokenizer is used, then a module-level
        `tokenizer` if one exists.
    mlm_probability: Probability of masking each non-special token.

    Returns:
        Average training loss.

    Raises:
        ValueError: If no tokenizer can be found.
    """
    mlm_tokenizer = _resolve_mlm_tokenizer(dataloader, tokenizer)

    model.train()
    total_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()

        # Masking is done on the batch's own device; tensors are moved to
        # `device` only when handed to the model.
        masked_input_ids, mlm_labels = _mask_for_mlm(
            batch["input_ids"], mlm_tokenizer, mlm_probability
        )
        classification_labels = batch["label"].to(device)

        logits, mlm_loss = model(
            input_ids=masked_input_ids.to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=mlm_labels.to(device)
        )
        loss = loss_fn(logits, classification_labels) + mlm_loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

 
def evaluate_model(model, dataloader, device, loss_fn):
    """
    Evaluates the model on a validation set.

    Inputs are fed to the model unmasked and without MLM labels, so the
    classification loss and predictions are deterministic for a given model;
    the model's MLM loss output is ignored.

    model: The classification model; called with input_ids, attention_mask and
        labels=None, and expected to return (logits, mlm_loss).
    dataloader: Iterable of batches with "input_ids", "attention_mask", "label".
    device: Device ("cuda" or "cpu").
    loss_fn: Loss function for classification.

    Returns:
        Average loss, list of predictions, and list of true labels.
    """
    model.eval()
    total_loss = 0.0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            classification_labels = batch["label"].to(device)

            # No random masking at evaluation time: corrupting the inputs
            # would make the reported metrics vary between runs.
            logits, _ = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=None
            )
            batch_loss = loss_fn(logits, classification_labels)
            total_loss += batch_loss.item()

            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.cpu().numpy())
            true_labels.extend(classification_labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    return avg_loss, predictions, true_labels

 
# Fine-tuning driver: builds the datasets, fine-tunes the checkpoint saved by
# the pretraining step, validates periodically and writes the collected
# metrics to disk.
if __name__ == "__main__":

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_text_list = train_texts.tolist()
    val_text_list = test_texts.tolist()
    train_labels_array = np.array(train_labels)
    val_labels_array = np.array(test_labels)

    # The tokenizer comes from the base checkpoint because the pretraining
    # step saved only the model weights.
    base_model_checkpoint = "distilbert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(base_model_checkpoint)

    model_checkpoint = "pretrained_distilbert_scmp"

    train_dataset = TextDataset(train_text_list, train_labels_array, tokenizer)
    val_dataset = TextDataset(val_text_list, val_labels_array, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=16)

    num_labels = 8
    model = ClassifierModel(model_checkpoint, num_labels=num_labels).to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_fn = nn.CrossEntropyLoss()

    num_epochs = 400
    eval_every = 10  # validate after every `eval_every` training epochs
    metrics_history = []

    for epoch in range(num_epochs):
        train_loss = train_model(model, train_dataloader, optimizer, loss_fn, device)
        # Epochs are reported 1-based, matching the validation log and the
        # "epoch" value stored in metrics_history.
        print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss:.4f}")

        if (epoch + 1) % eval_every == 0:
            val_loss, val_predictions, val_true_labels = evaluate_model(model, val_dataloader, device, loss_fn)
            val_accuracy = accuracy_score(val_true_labels, val_predictions)
            class_report = classification_report(val_true_labels, val_predictions, output_dict=True)

            metrics_history.append({
                "epoch": epoch + 1,
                "val_loss": val_loss,
                "val_accuracy": val_accuracy,
                "classification_report": class_report
            })

            print(f"Epoch {epoch + 1}: Validation Loss = {val_loss:.4f}, Validation Accuracy = {val_accuracy:.4f}")
            print(classification_report(val_true_labels, val_predictions))

    # Scalar metrics go to CSV; the nested per-class reports go to JSON.
    metrics_df = pd.DataFrame([
        {k: v for k, v in entry.items() if k != "classification_report"}
        for entry in metrics_history
    ])
    metrics_df.to_csv("metrics_history.csv", index=False)

    with open("classification_reports.json", "w") as f:
        json.dump(
            [entry["classification_report"] for entry in metrics_history],
            f,
            indent=4
        )


In [None]:
# Plot the validation loss and validation accuracy curves

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Read back the metrics written by the fine-tuning run.
df = pd.read_csv("metrics_history.csv")

# Compact serif styling for small, paper-sized figures.
plt.rcParams.update({
    "font.family": "serif",
    "font.size": 10,
    "axes.labelsize": 10,
    "axes.titlesize": 11,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "legend.fontsize": 9,
    "lines.linewidth": 1.5,
    "lines.markersize": 5
})


def _plot_validation_curve(metrics, column, marker, legend_label, y_label, title, pdf_path):
    """Draw one metric column against the epoch column, save it as PDF and show it."""
    plt.figure(figsize=(4, 2.5))
    plt.plot(metrics["epoch"], metrics[column], marker=marker, linestyle="-", label=legend_label)
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True, linestyle="--", linewidth=0.5)
    plt.tight_layout()
    plt.savefig(pdf_path, bbox_inches="tight")
    plt.show()


# Validation loss first, then validation accuracy.
_plot_validation_curve(
    df, "val_loss", "o", "Validation Loss",
    "Loss", "Validation Loss Curve", "validation_loss_curve.pdf",
)
_plot_validation_curve(
    df, "val_accuracy", "s", "Validation Accuracy",
    "Accuracy", "Validation Accuracy Curve", "validation_accuracy_curve.pdf",
)
