In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertModel, get_scheduler
from torch.optim import AdamW
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import re
from google.colab import drive

# Mount Google Drive at /content/drive so the paths below resolve inside
# the Colab runtime.
drive.mount('/content/drive')

# Directory on Drive where the per-epoch training checkpoints are written and
# later searched for a checkpoint to resume from. exist_ok=True makes
# re-running this cell harmless when the directory already exists.
checkpoint_dir = '/content/drive/MyDrive/bert_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Build the training CSV in chunks: one `text` column (the four HTML text
# fields joined by single spaces) followed by the integer label columns.
merged_path = '/content/drive/MyDrive/bert_checkpoints/merged.csv'
text_cols = ['html_title', 'h1', 'h2', 'p']

# Always rebuild from scratch: the loop below appends, so a stale file from a
# previous run would otherwise end up with duplicated rows.
if os.path.exists(merged_path):
    os.remove(merged_path)

chunksize = 10000
# dtype=str forces the text fields to be parsed as strings. Without it pandas
# infers dtypes per chunk, and a chunk whose column happens to look numeric
# (e.g. titles like "2024") makes ' '.join raise TypeError on non-str values.
reader = pd.read_csv(
    '/content/drive/MyDrive/test_model_data.csv',
    chunksize=chunksize,
    dtype={col: str for col in text_cols},
)
for chunk_idx, chunk in enumerate(reader):
    # Missing fields become empty strings so they do not break the join.
    chunk['text'] = chunk[text_cols].fillna('').agg(' '.join, axis=1)

    # Every column that is not one of the text fields is treated as a label;
    # missing labels are written as 0.
    label_cols = chunk.columns.difference(text_cols + ['text'])
    chunk[label_cols] = chunk[label_cols].fillna(0).astype(int)

    # The file was removed above, so only the first chunk writes the header.
    chunk[['text'] + label_cols.tolist()].to_csv(
        merged_path, mode='a', index=False, header=(chunk_idx == 0)
    )

In [3]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for multi-label training.

    Args:
        texts: Indexable sequence of raw texts; each item is passed through ``str``.
        labels: Indexable sequence of per-example label vectors, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=max_len, return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length each example is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its model inputs and float labels.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch axis removed) and ``'labels'`` (a
            float tensor built from ``labels[idx]``).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        return {
            # squeeze(0) removes only the batch axis added by return_tensors='pt'.
            # A bare squeeze() would also drop the sequence axis when it has
            # size 1, producing a 0-d tensor.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [4]:
class BertMultiLabelClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer with one output per label.

    ``forward`` returns raw logits; no sigmoid is applied here.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=3444):
        super().__init__()
        encoder = BertModel.from_pretrained(model_name)
        # Enable gradient checkpointing on the encoder.
        encoder.gradient_checkpointing_enable()
        self.bert = encoder
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits computed from the encoder's pooled output."""
        pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        return self.classifier(self.dropout(pooled))


In [5]:
# Cell 5: Tokenizer, Model & Optimizer Setup (no full dataset load)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
num_labels = 3444  # Set your correct number of labels here
model = BertMultiLabelClassifier(num_labels=num_labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
# Multi-label objective applied to the model's raw logits.
loss_fn = nn.BCEWithLogitsLoss()
epochs = 10

# Load from checkpoint if available
best_f1 = 0.0
start_epoch = 0

# Checkpoints are saved as checkpoint_epoch_<N>.pt. The pattern must use
# single backslashes inside the raw string: r'\\d' matches a literal backslash
# followed by 'd', which no real file name contains, so nothing would ever be
# found and training would silently restart from epoch 0.
checkpoint_pattern = re.compile(r'checkpoint_epoch_(\d+)\.pt')
checkpoint_numbers = {}
for file_name in os.listdir(checkpoint_dir):
    # fullmatch ignores look-alikes such as 'checkpoint_epoch_3.pt.bak'.
    name_match = checkpoint_pattern.fullmatch(file_name)
    if name_match:
        checkpoint_numbers[file_name] = int(name_match.group(1))
checkpoint_files = list(checkpoint_numbers)

if checkpoint_files:
    # Pick the highest epoch number numerically (so 10 sorts after 9).
    latest_checkpoint = max(checkpoint_files, key=checkpoint_numbers.get)
    checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
    # NOTE: torch.load unpickles the file; only load checkpoints you wrote yourself.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The checkpoint stores the zero-based index of the epoch that finished,
    # so training continues with the next one.
    start_epoch = checkpoint['epoch'] + 1
    print(f"Resumed from checkpoint: {latest_checkpoint} (epoch {start_epoch})")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Cell 6: Training Loop with Chunked Loading
train_chunksize = 10000
train_batch_size = 4

# Count the optimizer steps in one epoch with a cheap pass that reads only the
# `text` column. -(-n // b) is ceil(n / b): the DataLoader keeps the final
# partial batch of every chunk.
steps_per_epoch = sum(
    -(-len(text_chunk) // train_batch_size)
    for text_chunk in pd.read_csv(merged_path, usecols=['text'], chunksize=train_chunksize)
)
total_steps = max(1, steps_per_epoch * epochs)  # max() guards an empty data file
completed_steps = steps_per_epoch * start_epoch  # non-zero only when resuming

# One linear decay from the base learning rate to zero across the WHOLE run.
# Building a fresh scheduler for every chunk (as before) drove the learning
# rate to zero at the end of each 10k-row chunk and then snapped it back to the
# base value, so the tail of every chunk was effectively not trained on.
# `completed_steps` shifts the schedule so a resumed run continues where the
# interrupted one stopped.
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lambda step: max(0.0, (total_steps - completed_steps - step) / total_steps),
)

for epoch in range(start_epoch, epochs):
    model.train()
    total_loss = 0

    # Stream the merged CSV so only one chunk of rows is in memory at a time.
    for chunk in pd.read_csv(merged_path, chunksize=train_chunksize):
        texts = chunk['text'].tolist()
        # Every column except `text` is a label column.
        chunk_labels = chunk.drop(columns=['text']).values

        train_dataset = CustomDataset(texts, chunk_labels, tokenizer, max_len=512)
        train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, num_workers=0)

        loop = tqdm(train_loader, leave=True)
        loop.set_description(f"Epoch {epoch+1}")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Separate name: the chunk-level label matrix must not be shadowed.
            batch_labels = batch['labels'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(logits, batch_labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Read the scalar once per step and reuse it.
            step_loss = loss.item()
            total_loss += step_loss
            loop.set_postfix(loss=step_loss)

        # Drop the chunk's objects before loading the next chunk.
        del train_dataset, train_loader, texts, chunk_labels
        torch.cuda.empty_cache()

    # total_loss is the sum of the per-step losses over the epoch.
    print(f"Epoch {epoch+1}, Training Loss: {total_loss:.4f}")

    # Save checkpoint
    # The file name uses the 1-based epoch number; the stored 'epoch' value is
    # the 0-based index that the resume logic increments.
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}.pt')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")


Epoch 1:  92%|█████████▏| 2292/2500 [19:53<01:49,  1.90it/s, loss=0.0105]