In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
import os
# Show what is inside the RoBERTa input directory (displayed as the cell output).
roberta_dir = '/kaggle/input/roberta/roberta'
os.listdir(roberta_dir)

In [None]:
# Load the competition CSVs; malformed rows in the training file are skipped.
test_df = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
train_df = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv', on_bad_lines='skip')

# Training rows without text cannot be learned from, so they are dropped.
train_df = train_df.dropna(subset=['full_text'])
train_df['full_text'] = train_df['full_text'].astype(str)

# Test rows are never dropped: each essay_id must get a prediction, otherwise
# the submission would be missing rows. Missing text becomes an empty string.
test_df['full_text'] = test_df['full_text'].fillna('').astype(str)

# Non-numeric scores are coerced to NaN and the affected training rows removed.
train_df['score'] = pd.to_numeric(train_df['score'], errors='coerce')
train_df = train_df.dropna(subset=['score'])

train_texts = train_df['full_text'].tolist()
train_labels = train_df['score'].tolist()
test_texts = test_df['full_text'].tolist()

# Hold out 10% of the training essays for validation (fixed seed for a repeatable split).
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# The tokenizer and the model weights are both loaded from the same local directory.
tokenizer_dir = '/kaggle/input/roberta/roberta'
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_dir)

# Identical tokenization settings for every split: truncate/pad, capped at 512 tokens.
encode_options = {'truncation': True, 'padding': True, 'max_length': 512}
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

# num_labels=1 gives a single output logit, which is trained as a regression score below.
model = RobertaForSequenceClassification.from_pretrained(
    tokenizer_dir,
    num_labels=1,
)

In [None]:
class EssayDataset(Dataset):
    """Map-style dataset over tokenizer output, with optional per-essay labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    must contain an ``'input_ids'`` entry (used to determine the length).
    ``labels`` is an optional sequence of numeric targets aligned with the
    encodings; when it is ``None`` the items carry no ``'labels'`` key.
    """

    def __init__(self, encodings, labels=None):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])

        if self.labels is None:
            return sample

        # Float target with shape (1,), i.e. one value per example.
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        sample['labels'] = target.unsqueeze(0)
        return sample

In [None]:
# Wrap each tokenized split; the test split is built without labels.
train_dataset = EssayDataset(encodings=train_encodings, labels=train_labels)
val_dataset = EssayDataset(encodings=val_encodings, labels=val_labels)
test_dataset = EssayDataset(encodings=test_encodings)

# All loaders share the same batching options; only the training loader shuffles.
loader_options = dict(batch_size=8, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)
model.to(device)

# NOTE(review): AdamW is the one imported from transformers at the top of the
# notebook; confirm it is still available in the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5)

# One scheduler step per training batch, assuming 5 epochs (train_model's default).
total_steps = 5 * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Mean squared error between the predicted and the true essay score.
loss_fn = nn.MSELoss()

In [None]:
def train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=5, patience=3):
    """Fine-tune ``model`` with early stopping on the validation loss.

    Every epoch performs one pass over ``train_loader`` (stepping ``optimizer``
    and ``scheduler`` once per batch) followed by one gradient-free pass over
    ``val_loader``. Training stops early once ``patience`` consecutive epochs
    fail to produce a new best average validation loss.

    The weights of the best epoch are snapshotted and loaded back into
    ``model`` before returning, so later inference does not silently use the
    weights of a later, worse epoch.

    Relies on the notebook-level globals ``device`` and ``loss_fn``.

    Args:
        model: module called as ``model(**batch)``; the result must expose ``.logits``.
        train_loader: sized iterable of dict batches of tensors, each with a ``'labels'`` entry.
        val_loader: sized iterable of dict batches in the same format, used for validation.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped after every optimizer step.
        epochs: maximum number of epochs to run.
        patience: number of consecutive non-improving epochs tolerated before stopping.

    Returns:
        None. ``model`` is modified in place; after at least one epoch it is
        left in evaluation mode.
    """
    best_loss = float('inf')
    best_state = None
    patience_counter = 0

    for epoch in range(epochs):
        # Set the mode explicitly each epoch so it never depends on how the
        # previous epoch ended.
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**inputs)
            # Flatten both sides to 1-D so the shapes always line up,
            # including for a final batch that holds a single example.
            loss = loss_fn(outputs.logits.reshape(-1), inputs['labels'].reshape(-1))
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss}")

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = {key: val.to(device) for key, val in batch.items()}
                outputs = model(**inputs)
                loss = loss_fn(outputs.logits.reshape(-1), inputs['labels'].reshape(-1))
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {avg_val_loss}")

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            patience_counter = 0
            # Keep an independent CPU copy of the best weights; clone() is needed
            # because .cpu() is a no-op for tensors that already live on the CPU.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping triggered")
            break

    # Roll back to the best epoch (skipped if no epoch ever improved on +inf).
    if best_state is not None:
        model.load_state_dict(best_state)

In [None]:
# Run fine-tuning with train_model's default epochs and patience.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
)


In [None]:
# Put the model in evaluation mode and collect one raw output row per test essay.
model.eval()
test_predictions = []

with torch.no_grad():
    for batch in test_loader:
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**on_device).logits
        test_predictions.extend(logits.cpu().numpy())

# Round each raw output to the nearest integer and clamp it into the 1..6 range.
clamped_scores = []
for row in test_predictions:
    rounded = round(row[0])
    clamped_scores.append(max(1, min(6, rounded)))
test_predictions = clamped_scores

# Pair every essay id with its predicted score and write the submission file.
submission_columns = {
    'essay_id': test_df['essay_id'],
    'score': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)