In [4]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Run-wide configuration
MAX_LEN = 128     # pad/truncate length handed to the tokenizer via BertDataset
BATCH_SIZE = 16   # examples per DataLoader batch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

# Custom Dataset
class BertDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of raw texts; each entry is passed through ``str()``.
        labels: Indexable collection of labels, aligned index-for-index with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of examples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the tokenizer's
            tensors after ``flatten()``) and ``'label'`` (a ``torch.long`` tensor).
        """
        raw_text = str(self.texts[idx])
        raw_label = self.labels[idx]

        # The tokenizer is asked for PyTorch tensors, padded/truncated to max_len.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # flatten() collapses each returned tensor to 1-D so the DataLoader can stack them.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(raw_label, dtype=torch.long)
        return sample

# Teacher Model (BERT)
class TeacherModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        num_classes: Number of logits the classifier produces per example.
    """

    def __init__(self, num_classes):
        super().__init__()
        # The attribute names below (bert / dropout / classifier) become the
        # state_dict key prefixes, so they must not change if saved weights
        # are to load into this module.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and classify it.

        Returns:
            tuple ``(logits, features)`` where ``features`` is element 1 of the
            BERT output after dropout, and ``logits`` is the classifier applied
            to those features.
        """
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 is BERT's pooled, sequence-level vector (the original
        # comment describes it as the [CLS] token representation).
        features = self.dropout(encoder_outputs[1])
        return self.classifier(features), features

def precompute_bert_outputs(model, data_loader, device, save_dir):
    """Run ``model`` over ``data_loader`` in eval mode and save its outputs as ``.npy`` files.

    For every batch the model is called as ``model(input_ids, attention_mask)``
    and must return ``(logits, features)``. All batches are concatenated along
    axis 0 and written to ``save_dir`` as ``bert_logits.npy``,
    ``bert_features.npy`` and ``labels.npy``. Rows appear in the order the
    loader yields them, so the loader must not shuffle if the saved rows are
    to line up with the source data.

    Args:
        model: Module returning ``(logits, features)``; switched to eval mode here.
        data_loader: Iterable of dict batches with keys ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` (tensors).
        device: Device the model inputs are moved to.
        save_dir: Directory for the output files; created if missing.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    logit_chunks = []
    feature_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Precomputing BERT outputs'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits, features = model(input_ids, attention_mask)

            logit_chunks.append(logits.cpu().numpy())
            feature_chunks.append(features.cpu().numpy())
            # Labels are only stored, never fed to the model, so they are not
            # copied to `device` and back.
            label_chunks.append(batch['label'].cpu().numpy())

            # Deliberately no per-batch torch.cuda.empty_cache(): the batch
            # tensors are released to PyTorch's caching allocator when they go
            # out of scope, and emptying the cache every step only forces the
            # next batch to re-request that memory from the driver.

    if not logit_chunks:
        # np.concatenate([]) would raise ValueError anyway; fail with a clear
        # message instead, and before creating any output directory.
        raise ValueError(
            f"data_loader yielded no batches; nothing to save to {save_dir}"
        )

    # Concatenate all batches
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_features = np.concatenate(feature_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Save precomputed outputs
    np.save(os.path.join(save_dir, 'bert_logits.npy'), all_logits)
    np.save(os.path.join(save_dir, 'bert_features.npy'), all_features)
    np.save(os.path.join(save_dir, 'labels.npy'), all_labels)

    print(f"Saved precomputed outputs to {save_dir}")
    print(f"Logits shape: {all_logits.shape}")
    print(f"Features shape: {all_features.shape}")
    print(f"Labels shape: {all_labels.shape}")



cuda


In [5]:
# Load data
train_df = pd.read_csv('/kaggle/input/news-dataset/final_news_train.csv')
test_df = pd.read_csv('/kaggle/input/news-dataset/final_news_test.csv')

# Split train into train and validation (10% held out, stratified by label,
# fixed seed so the split is reproducible)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'].values,
    train_df['label'].values,
    test_size=0.1,
    random_state=42,
    stratify=train_df['label'].values
)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create datasets
train_dataset = BertDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

val_dataset = BertDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

test_dataset = BertDataset(
    texts=test_df['text'].values,
    labels=test_df['label'].values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# Create dataloaders.
# shuffle=False is spelled out on purpose: row i of every saved .npy file must
# correspond to example i of its split, so the iteration order has to be the
# dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

# Initialize teacher model
teacher_model = TeacherModel(num_classes=4).to(DEVICE)

# Load the fine-tuned classifier weights.
# map_location=DEVICE lets a checkpoint that was saved from a GPU session load
# on whatever device this session has (including CPU-only).
teacher_state = torch.load(
    '/kaggle/input/bert_classifier/pytorch/default/1/bert_classifier.pth',
    map_location=DEVICE
)
teacher_model.load_state_dict(teacher_state)
print("Teacher model loaded successfully!")

# Precompute and save BERT outputs for each split:
# (label used in the progress message, output sub-directory, loader)
split_jobs = (
    ('training', 'train', train_loader),
    ('validation', 'val', val_loader),
    ('test', 'test', test_loader),
)
for split_label, split_subdir, split_loader in split_jobs:
    print(f"\nPrecomputing BERT outputs for {split_label} data...")
    precompute_bert_outputs(
        teacher_model,
        split_loader,
        DEVICE,
        f'/kaggle/working/precomputed_bert/{split_subdir}'
    )



Teacher model loaded successfully!

Precomputing BERT outputs for training data...


Precomputing BERT outputs:  60%|██████    | 7731/12823 [16:53<11:07,  7.63it/s]


KeyboardInterrupt: 

In [6]:
# Bundle everything under /kaggle/working/precomputed_bert into a single
# archive (/kaggle/working/output.zip).
# NOTE(review): the recorded output of the preceding cell ends in
# KeyboardInterrupt during the training split, so the train/ files archived
# here may be left over from an earlier run — confirm before relying on them.
!zip -r /kaggle/working/output.zip /kaggle/working/precomputed_bert

  adding: kaggle/working/precomputed_bert/ (stored 0%)
  adding: kaggle/working/precomputed_bert/test/ (stored 0%)
  adding: kaggle/working/precomputed_bert/test/bert_logits.npy (deflated 8%)
  adding: kaggle/working/precomputed_bert/test/bert_features.npy (deflated 8%)
  adding: kaggle/working/precomputed_bert/test/labels.npy (deflated 94%)
  adding: kaggle/working/precomputed_bert/train/ (stored 0%)
  adding: kaggle/working/precomputed_bert/train/bert_logits.npy (deflated 8%)
  adding: kaggle/working/precomputed_bert/train/bert_features.npy (deflated 8%)
  adding: kaggle/working/precomputed_bert/train/labels.npy (deflated 94%)
  adding: kaggle/working/precomputed_bert/val/ (stored 0%)
  adding: kaggle/working/precomputed_bert/val/bert_logits.npy (deflated 8%)
  adding: kaggle/working/precomputed_bert/val/bert_features.npy (deflated 8%)
  adding: kaggle/working/precomputed_bert/val/labels.npy (deflated 94%)
