In [None]:
# Step 1: Install Required Libraries
!pip install transformers torch_xla

# Step 2: Imports
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import GradScaler, autocast
from sklearn import metrics
import torch_xla
import torch_xla.core.xla_model as xm

# Step 3: Load Data
# Both CSV files are decoded as ISO-8859-1 instead of pandas' default UTF-8.
train_data = pd.read_csv(
    '/kaggle/input/trial-fibe1/dataset/train.csv',
    encoding='ISO-8859-1',
)
test_data = pd.read_csv(
    '/kaggle/input/trial-fibe1/dataset/test.csv',
    encoding='ISO-8859-1',
)
# Features are the raw 'text' column, labels the 'target' column.
X, y = train_data['text'], train_data['target']

# Step 4: Split Data
# 80/20 train/validation split, stratified on the labels, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 5: Tokenization
# Pretrained 'roberta-base' tokenizer, shared by every tokenization call below.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_and_clean(texts):
    """Tokenize a list of raw strings with the module-level ``tokenizer``.

    Args:
        texts: List of strings to encode.

    Returns:
        The tokenizer's encoding for ``texts``. Sequences are truncated to at
        most 512 tokens and padded to the longest sequence in ``texts``;
        callers read the token ids from the ``'input_ids'`` entry.
    """
    # The token ids are returned exactly as the tokenizer produced them.
    # `clean_up_tokenization_spaces` is a boolean option that affects how ids
    # are decoded back to text; it is not a function that can be applied to a
    # list of ids, so no post-processing step belongs here.
    return tokenizer(texts, truncation=True, padding=True, max_length=512)

# Tokenize the training split first, then the validation split.
train_encodings, val_encodings = (
    tokenize_and_clean(list(split)) for split in (X_train, X_val)
)

# Fit the label mapping on the training labels only and reuse it for validation.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

# Only the token ids are turned into tensors; they are stored as int64.
train_inputs, val_inputs = (
    torch.tensor(enc['input_ids'], dtype=torch.long)
    for enc in (train_encodings, val_encodings)
)

# Step 6: Create Custom Dataset
class NewsDataset(Dataset):
    """Dataset that pairs each entry of ``inputs`` with the label at the same index.

    Args:
        inputs: Indexable collection of samples.
        labels: Indexable collection of labels; its length is the dataset length.
    """

    def __init__(self, inputs, labels):
        # Both collections are stored as given (no copy, no conversion).
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        """Return the number of labels, which defines the number of samples."""
        sample_count = len(self.labels)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at position ``idx``."""
        sample = self.inputs[idx]
        target = self.labels[idx]
        return sample, target

# Wrap the token-id tensors and encoded labels of each split in a dataset.
train_dataset = NewsDataset(inputs=train_inputs, labels=y_train_encoded)
val_dataset = NewsDataset(inputs=val_inputs, labels=y_val_encoded)

# Step 7: DataLoader creation with multiple worker processes
# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=12,  # Use multiple workers
)
val_loader = DataLoader(
    val_dataset,
    batch_size=8,
    num_workers=12,
)

# Step 8: Load Model
device = xm.xla_device()
# The classification head gets one output per class known to the label encoder.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(le.classes_),
)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Step 9: Checkpoint Functions
# Checkpoints are written below this directory; it is created if missing.
checkpoint_dir = '/kaggle/working/checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

def save_checkpoint(model, optimizer, epoch, loss, filename='checkpoint.pth'):
    """Write a resumable training checkpoint into ``checkpoint_dir``.

    Args:
        model: Module whose ``state_dict()`` is stored under ``'model_state_dict'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under
            ``'optimizer_state_dict'``.
        epoch: Epoch value stored under ``'epoch'``.
        loss: Loss value stored under ``'loss'``.
        filename: Name of the file created inside the module-level
            ``checkpoint_dir``.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss,
    }
    # The model and optimizer state live on an XLA device. xm.save moves the
    # tensors to CPU before serialising, so the file can be read back with a
    # plain torch.load on a machine without that device.
    xm.save(checkpoint, os.path.join(checkpoint_dir, filename))

# Load the checkpoint if available (load on CPU first)
checkpoint_path = '/kaggle/input/epoch11/checkpoint_epoch_1 (1).pth'

if os.path.exists(checkpoint_path):
    # NOTE(review): torch.load unpickles the file, so only point
    # checkpoint_path at checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    # strict=False tolerates key mismatches; report them so a checkpoint that
    # does not actually match this model is not loaded silently.
    load_result = model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f"Checkpoint key mismatch - missing: {load_result.missing_keys}, "
            f"unexpected: {load_result.unexpected_keys}"
        )
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The stored value is the index of the last finished epoch; continue after it.
    start_epoch = checkpoint['epoch'] + 1
    print(f"Loaded checkpoint from {checkpoint_path}. Resuming at epoch index {start_epoch}.")
else:
    print(f"No checkpoint found at {checkpoint_path}. Starting from scratch.")
    start_epoch = 0  # Start from epoch 0

num_epochs = 1  # Change this to the number of additional epochs you want to run
scaler = GradScaler()
accumulation_steps = 4  # Adjust as needed

# Step 10: Training Loop
# Runs `num_epochs` epochs starting at `start_epoch`, stepping the optimizer
# once per `accumulation_steps` micro-batches and saving a checkpoint per epoch.
# NOTE(review): autocast/GradScaler are imported from torch.cuda.amp while the
# model lives on an XLA device - confirm they have any effect in this setup.
num_batches = len(train_loader)
for epoch in range(start_epoch, start_epoch + num_epochs):
    model.train()
    total_loss = 0
    # Start the epoch with clean gradients. Inside the batch loop gradients
    # are cleared only after an optimizer step, so they accumulate across
    # micro-batches.
    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        # Rebuild the attention mask from the pad id so padded positions are
        # ignored by the model instead of being attended like real tokens.
        attention_mask = (input_ids != tokenizer.pad_token_id).long()

        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # Scale the loss so the summed gradients average over the
            # micro-batches of one accumulation window.
            loss = outputs.loss / accumulation_steps

        # Track the unscaled loss so the epoch figure is a true per-batch mean.
        total_loss += outputs.loss.item()

        # Backward pass
        scaler.scale(loss).backward()

        # Step after every full accumulation window, and once more on the last
        # batch so a trailing partial window is not discarded (its gradient is
        # still scaled by 1 / accumulation_steps).
        is_last_batch = (i + 1) == num_batches
        if (i + 1) % accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            xm.mark_step()  # Ensure proper synchronization

    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

    # Save checkpoint every epoch
    save_checkpoint(model, optimizer, epoch, avg_loss, filename=f'checkpoint_epoch_{epoch + 1}.pth')

# Step 11: Model Evaluation
# Computes plain accuracy over the validation loader with gradients disabled.
model.eval()
total_accuracy = 0  # running count of correct predictions
total_samples = 0
with torch.no_grad():
    for batch in val_loader:
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Rebuild the attention mask from the pad id so padded positions are
        # ignored by the model instead of being attended like real tokens.
        attention_mask = (inputs != tokenizer.pad_token_id).long()

        outputs = model(inputs, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        total_accuracy += (predictions == labels).sum().item()
        total_samples += labels.size(0)

# Report 0.0 instead of raising ZeroDivisionError on an empty validation set.
val_accuracy = total_accuracy / total_samples if total_samples else 0.0
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Step 12: Predictions on Test Data in smaller chunks to avoid memory issues
batch_size = 8  # You can adjust this as needed
test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True, max_length=512)
# Keep the full tensors on the host; only the current batch is moved to the
# device inside the loop, which is what keeps device memory bounded.
test_inputs = torch.tensor(test_encodings['input_ids'], dtype=torch.long)
test_masks = torch.tensor(test_encodings['attention_mask'], dtype=torch.long)

model.eval()
test_predictions = []  # predicted class indices, in test_data row order
with torch.no_grad():
    for i in range(0, len(test_inputs), batch_size):
        batch = test_inputs[i:i + batch_size].to(device)
        # Pass the tokenizer's attention mask so padding is not attended.
        batch_mask = test_masks[i:i + batch_size].to(device)
        outputs = model(batch, attention_mask=batch_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())  # Move to CPU and extend list

# Step 13: Prepare Submission
# The model predicts LabelEncoder indices; map them back to the original
# target values before writing the submission.
test_labels = le.inverse_transform(test_predictions)
submission = pd.DataFrame({'Index': test_data['Index'], 'target': test_labels})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

# Step 14: Calculate and Print F1 Score
# Collects validation predictions and scores them against y_val_encoded.
# val_loader does not shuffle, so predictions come out in the same order as
# the encoded validation labels.
model.eval()
y_val_pred = []
with torch.no_grad():
    for batch in val_loader:
        inputs, _ = batch  # labels come from y_val_encoded below
        inputs = inputs.to(device)
        # Rebuild the attention mask from the pad id so padded positions are
        # ignored by the model instead of being attended like real tokens.
        attention_mask = (inputs != tokenizer.pad_token_id).long()

        outputs = model(inputs, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        y_val_pred.extend(predictions.cpu().numpy())

# Weighted average: per-class F1 weighted by each class's support.
score = metrics.f1_score(y_val_encoded, y_val_pred, average='weighted')
print(f"F1 Score: {score:.2f}")
