In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from collections import Counter
import random
import time
import ast

In [2]:
from transformers import set_seed


# Set seed for reproducibility
set_seed(164)

# Set the seed for general torch operations
torch.manual_seed(164)
# Set the seed for CUDA torch operations (ones that happen on the GPU)
torch.cuda.manual_seed(164)

In [3]:
class CustomBERTWithTFIDF(nn.Module):
    def __init__(self, bert_model, num_labels, tfidf_dim=521):
        super().__init__()
        self.bert = bert_model

        # Get BERT hidden size
        self.bert_hidden_size = self.bert.config.hidden_size

        # Layers for processing BERT output
        self.bert_dropout = nn.Dropout(0.1)
        self.bert_fc = nn.Linear(self.bert_hidden_size, 256)

        # Layers for processing TF-IDF features
        self.tfidf_fc1 = nn.Linear(tfidf_dim, 128)
        self.tfidf_fc2 = nn.Linear(128, 5)  # New layer to reduce to 5 dimensions
        self.tfidf_dropout = nn.Dropout(0.1)

        # Combined layers
        self.combined_fc = nn.Linear(256 + 5, 64)  # Updated input size (256 + 5)
        self.output_layer = nn.Linear(64, num_labels)

        # Activation functions
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask, tfidf_features):
        # Process BERT inputs
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.last_hidden_state[:, 0, :]  # Use [CLS] token
        bert_features = self.bert_dropout(pooled_output)
        bert_features = self.relu(self.bert_fc(bert_features))

        # Process TF-IDF features
        tfidf_features = self.relu(self.tfidf_fc1(tfidf_features))
        tfidf_features = self.relu(self.tfidf_fc2(tfidf_features))  # Reduce to 5 dimensions
        tfidf_features = self.tfidf_dropout(tfidf_features)

        # Combine features
        combined = torch.cat((bert_features, tfidf_features), dim=1)
        combined = self.relu(self.combined_fc(combined))

        # Final output
        output = self.output_layer(combined)
        return output

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['content']
        label = self.data.iloc[index]['labels_encoded']

        # Convert string representation of list to actual list and then to tensor
        tfidf_features = ast.literal_eval(self.data.iloc[index]['tfidf_reduced'])
        tfidf_tensor = torch.tensor(tfidf_features, dtype=torch.float)

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'tfidf_features': tfidf_tensor,
            'labels': torch.tensor(label)
        }

# Initialize model and move to device
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-MLM-only")
base_model = AutoModel.from_pretrained("ai4bharat/IndicBERTv2-MLM-only")
model = CustomBERTWithTFIDF(base_model, num_labels=7)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

CustomBERTWithTFIDF(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(250000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [4]:
train_data = pd.read_csv("/content/PS_train_tfidf_reduced.csv")
test_data = pd.read_csv("/content/PS_test_tfidf_reduced.csv")
label_encoder = LabelEncoder()
train_data['labels_encoded'] = label_encoder.fit_transform(train_data['labels'])
test_data['labels_encoded'] = label_encoder.transform(test_data['labels'])

class_counts = Counter(train_data['labels_encoded'])
total_entries = len(train_data)
weights = {label: total_entries / (class_counts[label]) for label in class_counts}
print(weights)

weights = torch.tensor([weights[label] for label in range(len(class_counts))], dtype=torch.float).to(device)

# Create data loaders
# train_dataset = CustomDataset(train_data, tokenizer)
test_dataset = CustomDataset(test_data, tokenizer)
# train_dataloader = DataLoader(train_dataset, batch_size=64)
test_dataloader = DataLoader(test_dataset, batch_size=128)

# Training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss(weight=weights)

{1: 6.832025117739404, 6: 10.563106796116505, 3: 3.197648787656135, 4: 7.568695652173913, 5: 5.508860759493671, 0: 10.719211822660098, 2: 25.450292397660817}


In [5]:
def create_dataloaders(data, tokenizer, batch_size, seed):
    # Shuffle the DataFrame with the seed
    shuffled_data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
    dataset = CustomDataset(shuffled_data, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)  # No shuffle in DataLoader
    return dataloader

# Training loop
for epoch in range(10):
    epoch_start_time = time.time()

    # Determine batch size based on epoch
    if epoch < 2:
        batch_size = 16
    elif epoch < 4:
        batch_size = 32
    elif epoch < 6:
        batch_size = 48
    else:
        batch_size = 64

    # Create new train dataloader with incrementing seed
    current_seed = 100 + epoch
    train_dataloader = create_dataloaders(train_data, tokenizer, batch_size, current_seed)

    # Training phase
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tfidf_features = batch['tfidf_features'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask, tfidf_features)
        loss = loss_fn(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Evaluation phase
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            tfidf_features = batch['tfidf_features'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask, tfidf_features)
            predicted_labels = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(predicted_labels)
            true_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    report = classification_report(true_labels, predictions, digits=5)
    conf_matrix = confusion_matrix(true_labels, predictions)

    # Calculate epoch duration
    epoch_duration = time.time() - epoch_start_time

    # Print results
    print(f"Epoch {epoch+1}:")
    print(f"Batch Size: {batch_size}")
    print(f"Seed: {current_seed}")
    print(f"Average Loss: {total_loss/len(train_dataloader):.4f}")
    print(f"Total Epoch Time: {epoch_duration:.2f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1_score:.4f}")
    print("\nClassification Report:\n", report)
    print("\nConfusion Matrix:\n", conf_matrix)
    print("\n" + "="*50 + "\n")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1:
Batch Size: 16
Seed: 100
Average Loss: 1.8489
Total Epoch Time: 218.50 seconds
Accuracy: 0.1838
Precision: 0.2876
Recall: 0.1838
F1-score: 0.0841

Classification Report:
               precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000        46
           1    0.00000   0.00000   0.00000        70
           2    0.63889   0.92000   0.75410        25
           3    0.50000   0.00585   0.01156       171
           4    0.14868   0.97333   0.25795        75
           5    0.33333   0.01887   0.03571       106
           6    0.16667   0.01961   0.03509        51

    accuracy                        0.18382       544
   macro avg    0.25537   0.27681   0.15634       544
weighted avg    0.28760   0.18382   0.08410       544


Confusion Matrix:
 [[  0   0   2   1  42   0   1]
 [  0   0   3   0  66   1   0]
 [  0   0  23   0   2   0   0]
 [  0   2   4   1 162   0   2]
 [  0   1   0   0  73   1   0]
 [  0   0   4   0  98   2   2]
 [  0   0   0   0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2:
Batch Size: 16
Seed: 101
Average Loss: 1.6790
Total Epoch Time: 214.36 seconds
Accuracy: 0.2537
Precision: 0.1929
Recall: 0.2537
F1-score: 0.1749

Classification Report:
               precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000        46
           1    0.12782   0.24286   0.16749        70
           2    0.54545   0.96000   0.69565        25
           3    0.20000   0.00585   0.01136       171
           4    0.24204   0.50667   0.32759        75
           5    0.28293   0.54717   0.37299       106
           6    0.00000   0.00000   0.00000        51

    accuracy                        0.25368       544
   macro avg    0.19975   0.32322   0.22501       544
weighted avg    0.19288   0.25368   0.17493       544


Confusion Matrix:
 [[ 0 12  2  0 10 22  0]
 [ 0 17  4  1 21 27  0]
 [ 0  0 24  0  0  1  0]
 [ 0 48  6  1 46 70  0]
 [ 0 25  0  1 38 11  0]
 [ 0 22  5  0 21 58  0]
 [ 0  9  3  2 21 16  0]]


Epoch 3:
Batch Size: 32
Seed: 10

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4:
Batch Size: 32
Seed: 103
Average Loss: 1.6506
Total Epoch Time: 195.28 seconds
Accuracy: 0.2206
Precision: 0.1299
Recall: 0.2206
F1-score: 0.1432

Classification Report:
               precision    recall  f1-score   support

           0    0.07692   0.02174   0.03390        46
           1    0.07895   0.04286   0.05556        70
           2    0.95455   0.84000   0.89362        25
           3    0.00000   0.00000   0.00000       171
           4    0.22078   0.22667   0.22368        75
           5    0.20000   0.73585   0.31452       106
           6    0.00000   0.00000   0.00000        51

    accuracy                        0.22059       544
   macro avg    0.21874   0.26673   0.21732       544
weighted avg    0.12994   0.22059   0.14321       544


Confusion Matrix:
 [[  1   2   0   2   9  32   0]
 [  2   3   1   0  12  52   0]
 [  0   2  21   0   0   2   0]
 [  6  12   0   0  16 137   0]
 [  0   5   0   1  17  52   0]
 [  3  11   0   0  14  78   0]
 [  1   3   0   1