In [None]:
%pip install scikit-learn torch --quiet

In [None]:
pip install pickle-mixin

In [None]:
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn
import pickle
import numpy as np
import os
import pandas as pd

# Read the pickled preprocessing bundle back from disk.
with open('dataset/preprocessed_data.pkl', 'rb') as f:
    preprocessed_data = pickle.load(f)

# Unpack the individual artefacts the rest of the notebook works with:
# the two data frames, the fitted label encoder and the CV index splits.
train_df, test_df, le, cv_splits = (
    preprocessed_data[key]
    for key in ('train_df', 'test_df', 'label_encoder', 'cv_splits')
)

# Sequence length used for padding/truncation, and the BERT tokenizer.
max_length = 256
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [8]:
from torch.utils.data import Dataset

class IABDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Every item is a dict holding ``input_ids`` and ``attention_mask`` (the
    tokenizer output flattened to 1-D; the tokenizer is asked to pad/truncate to
    ``max_length``) and, when labels were supplied, a scalar ``labels`` tensor of
    dtype ``torch.long``.

    Args:
        texts: Sequence/iterable of input strings.
        labels: Sequence/iterable of integer class ids aligned with ``texts``,
            or ``None`` for unlabeled (inference) data.
        tokenizer: Object exposing an ``encode_plus`` method with the keyword
            arguments used in ``__getitem__``.
        max_length: Target sequence length handed to the tokenizer.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise to plain lists so __getitem__ always indexes by position.
        # A pandas Series with a non-default index would otherwise be looked up
        # by label, raising KeyError or silently returning the wrong row.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return its tensors."""
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch axis; flatten so the DataLoader
        # can add the real batch axis when collating.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        # Labels are omitted entirely for inference datasets.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

In [9]:
def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs, checkpoint_path=None):
    """Train ``model`` epoch by epoch, validating and checkpointing after each one.

    Batches from both loaders are dicts with ``input_ids``, ``attention_mask``
    and ``labels`` tensors. The model is called as
    ``model(input_ids, attention_mask=..., labels=...)`` and must return an
    object exposing ``.loss`` and ``.logits``.

    Side effects (all written to the current working directory):
        * ``best_model.pth`` — the model ``state_dict`` whenever validation
          accuracy improves on the best seen so far in this call (or in the
          resumed checkpoint).
        * ``checkpoint_epoch_{epoch}.pth`` — model/optimizer/scheduler state,
          the zero-based epoch number and the best accuracy, after every epoch.

    Args:
        model: The network to optimise (already placed on ``device``).
        train_loader: Iterable of training batches; must support ``len``.
        val_loader: Iterable of validation batches; must support ``len``.
        optimizer: Optimizer bound to ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped once per optimizer step.
        device: ``torch.device`` (or device string) the batches are moved to.
        num_epochs: Total number of epochs (resumed runs continue up to this).
        checkpoint_path: Optional path of a checkpoint written by this function;
            if it exists, training resumes from the epoch after the stored one.

    Returns:
        The best validation accuracy observed (0 if it never exceeded 0).
    """
    # Mixed precision is only meaningful on CUDA; enabling it elsewhere just
    # emits warnings before PyTorch disables it.
    use_amp = torch.device(device).type == 'cuda'
    scaler = GradScaler(enabled=use_amp)
    best_accuracy = 0
    start_epoch = 0

    if checkpoint_path and os.path.exists(checkpoint_path):
        # map_location lets a checkpoint saved on one device resume on another.
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        best_accuracy = checkpoint['best_accuracy']
        print(f"Resuming training from epoch {start_epoch}")

    for epoch in range(start_epoch, num_epochs):
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            # GradScaler skips optimizer.step() when it finds inf/NaN gradients
            # and then lowers its scale. Only advance the LR schedule when the
            # optimizer really stepped, so the two stay in lock-step.
            scale_before = scaler.get_scale()
            scaler.update()
            if scaler.get_scale() >= scale_before:
                scheduler.step()

            train_loss += loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss / max(len(train_loader), 1)}')

        # Validation
        model.eval()
        val_loss = 0
        predictions = []
        true_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                preds = outputs.logits.argmax(dim=1)
                predictions.extend(preds.cpu().tolist())
                true_labels.extend(labels.cpu().tolist())

        val_loss /= max(len(val_loader), 1)
        correct = sum(p == t for p, t in zip(predictions, true_labels))
        # With no validation samples there is nothing to score; report 0.
        accuracy = correct / len(predictions) if predictions else 0.0
        print(f'Validation Loss: {val_loss}, Accuracy: {accuracy}')

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_model.pth')

        # Full training state, so a later call can resume via checkpoint_path.
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'best_accuracy': best_accuracy
        }, f'checkpoint_epoch_{epoch}.pth')

    return best_accuracy

def _build_optimizer_and_scheduler(model, steps_per_epoch, num_epochs, learning_rate):
    """Create the AdamW optimizer and the linear warmup/decay schedule for one training run."""
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    total_steps = steps_per_epoch * num_epochs
    # The first 10% of all optimizer steps are used for warmup.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps // 10,
        num_training_steps=total_steps,
    )
    return optimizer, scheduler


def run_training(num_epochs, batch_size=16, learning_rate=1e-5, model_name='bert-large-uncased'):
    """Cross-validate, fit a final model on all training data and write ``submission.csv``.

    Relies on notebook-level globals: ``train_df`` (columns ``processed_text``
    and ``encoded_target``), ``test_df`` (column ``processed_text`` plus an
    ``index`` or ``Index`` id column), ``le`` (label encoder), ``cv_splits``
    (sequence of ``(train_idx, val_idx)`` pairs) and ``max_length``; and on the
    ``IABDataset`` class and ``train_model`` function defined in this notebook.

    Steps:
        1. For every fold in ``cv_splits`` a fresh model is fine-tuned with
           ``train_model`` and its best validation accuracy is recorded.
        2. A fresh model is fine-tuned on the whole of ``train_df``.
        3. ``best_model.pth`` (written by ``train_model``) is loaded and used to
           predict ``test_df``; predictions are saved to ``submission.csv`` with
           columns ``index`` and ``target``.

    Args:
        num_epochs: Epochs per training run (each fold and the final fit).
        batch_size: Training batch size; evaluation uses twice this value.
        learning_rate: Peak learning rate for AdamW.
        model_name: Hugging Face model id used for tokenizer and model weights.

    Raises:
        ValueError: If ``cv_splits`` is empty.
        KeyError: If ``test_df`` has neither an ``index`` nor an ``Index`` column.
    """
    if len(cv_splits) == 0:
        # Without at least one fold there is no validation loader for the final
        # fit (the original code hit a NameError at that point).
        raise ValueError("cv_splits is empty; at least one (train_idx, val_idx) fold is required")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Tokenizer matching the chosen checkpoint (shadows the notebook-level one).
    tokenizer = BertTokenizer.from_pretrained(model_name)

    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv_splits):
        print(f"Fold {fold + 1}/{len(cv_splits)}")

        train_texts = train_df['processed_text'].iloc[train_idx].tolist()
        train_labels = train_df['encoded_target'].iloc[train_idx].tolist()
        val_texts = train_df['processed_text'].iloc[val_idx].tolist()
        val_labels = train_df['encoded_target'].iloc[val_idx].tolist()

        train_dataset = IABDataset(train_texts, train_labels, tokenizer, max_length)
        val_dataset = IABDataset(val_texts, val_labels, tokenizer, max_length)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size*2, shuffle=False)

        # Every fold starts again from the pretrained weights.
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_))
        model.to(device)

        optimizer, scheduler = _build_optimizer_and_scheduler(
            model, len(train_loader), num_epochs, learning_rate
        )

        best_accuracy = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
        cv_scores.append(best_accuracy)

        # Release this fold's model and optimizer state before the next model is
        # created, so two copies never sit on the GPU at once.
        del model, optimizer, scheduler
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV score: {np.mean(cv_scores)}")

    # Train on full dataset
    full_dataset = IABDataset(train_df['processed_text'].tolist(), train_df['encoded_target'].tolist(), tokenizer, max_length)
    full_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)

    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_))
    model.to(device)

    optimizer, scheduler = _build_optimizer_and_scheduler(
        model, len(full_loader), num_epochs, learning_rate
    )

    # NOTE(review): the final fit is validated on the last fold's val_loader,
    # whose rows are part of the full training set here. The accuracy printed
    # during this run (and the epoch chosen for best_model.pth) is therefore
    # optimistic — consider a dedicated hold-out set.
    train_model(model, full_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)

    # Load best model and predict on test set. map_location keeps the load
    # working regardless of which device wrote the file.
    model.load_state_dict(torch.load('best_model.pth', map_location=device))
    model.eval()

    test_dataset = IABDataset(test_df['processed_text'].tolist(), None, tokenizer, max_length)
    test_loader = DataLoader(test_dataset, batch_size=batch_size*2, shuffle=False)

    predictions = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Predicting on test set'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())

    # Accept either spelling of the id column, like predict_on_test_set does.
    id_column = next((name for name in ('index', 'Index') if name in test_df.columns), None)
    if id_column is None:
        raise KeyError("test_df must contain an 'index' or 'Index' column")

    submission = pd.DataFrame({
        'index': test_df[id_column],
        'target': le.inverse_transform(predictions)
    })
    submission.to_csv('submission.csv', index=False)

    print("Submission file created.")

In [None]:
# Launch cross-validation, the final fit and test-set prediction with the tuned settings.
run_training(
    num_epochs=5,
    batch_size=16,
    learning_rate=2e-5,
    model_name='bert-large-uncased',
)

In [10]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd

def predict_on_test_set(model, test_df, tokenizer, le, batch_size=32, max_length=256, device='cuda'):
    """Predict a label for every row of ``test_df`` and write ``submission.csv``.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...)`` and
            returning an object with ``.logits``.
        test_df: DataFrame with a ``processed_text`` column. An ``Index`` (or,
            failing that, ``index``) column supplies the row ids; if neither
            exists, ids of the form ``Article_<row label>`` are generated.
        tokenizer: Tokenizer handed to ``IABDataset``.
        le: Label encoder whose ``inverse_transform`` maps class ids to labels.
        batch_size: Number of rows per inference batch.
        max_length: Sequence length handed to ``IABDataset``.
        device: Device to run on. If a CUDA device is requested but CUDA is not
            available, inference falls back to the CPU instead of crashing.

    Returns:
        The submission DataFrame with columns ``target`` and ``Index``; the same
        data is written to ``submission.csv`` with every field quoted.
    """
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        print("CUDA is not available; running inference on CPU instead.")
        device = torch.device('cpu')

    model.eval()
    model.to(device)

    # Create test dataset (no labels at inference time)
    test_dataset = IABDataset(test_df['processed_text'].tolist(), None, tokenizer, max_length)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    predictions = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Predicting on test set'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Highest-scoring class per row.
            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())

    # Convert numeric predictions back to original labels
    predicted_labels = le.inverse_transform(predictions)

    # Pick the id column: 'Index' wins over 'index'; otherwise synthesise ids
    # from the frame's row labels.
    id_name = next((name for name in ('Index', 'index') if name in test_df.columns), None)
    if id_name is not None:
        index_column = test_df[id_name]
    else:
        index_column = [f"Article_{i}" for i in test_df.index]

    submission = pd.DataFrame({
        'target': predicted_labels,
        'Index': index_column
    })

    # Save submission to CSV
    submission.to_csv('submission.csv', index=False, quoting=1)  # quoting=1 ensures all fields are quoted
    print("Submission file created: submission.csv")

    return submission

# best_model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=len(le.classes_))
# best_model.load_state_dict(torch.load('best_model.pth'))
# 
# submission = predict_on_test_set(best_model, test_df, tokenizer, le)

In [None]:
# Rebuild the architecture, then restore the fine-tuned weights written during training.
best_model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=len(le.classes_))
# map_location='cpu' lets weights saved on a GPU machine load on any machine;
# predict_on_test_set moves the model to the inference device afterwards.
best_model.load_state_dict(torch.load('best_model.pth', map_location='cpu'))

# Make predictions on whichever device is actually available.
inference_device = 'cuda' if torch.cuda.is_available() else 'cpu'
submission = predict_on_test_set(best_model, test_df, tokenizer, le, device=inference_device)