In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
# `_` receives the sub-directory list yielded by os.walk, which is not used here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join directory and file name so the printed value is a complete path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm

# Locations of the training CSV, the fine-tuned model and the cached token tensors
dataset_path = '/kaggle/input/mit-dataset/final_train.csv'
model_save_path = '/kaggle/input/saved-model-roberta'
tokenized_data_path = '/kaggle/input/token-vals/tokenized_data.pt'

# Make sure the model directory is present before later cells use it.
# NOTE(review): the notebook header describes "../input/" as read-only, so this
# creation (and the tokenizer save below) presumably fails when the directory is
# absent — confirm and consider a writable location.
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

# Tokenizer: fetch 'roberta-base' and store it at tokenizer_path when no copy
# exists there yet; otherwise load the stored copy.
tokenizer_path = '/kaggle/input/tokenizer-2'
if not os.path.exists(tokenizer_path):
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenizer.save_pretrained(tokenizer_path)
else:
    tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)


In [None]:
# Function to tokenize and encode the dataset using RoBERTa tokenizer
def roberta_encode(texts, tokenizer, max_len=512):
    """Encode every text with ``tokenizer`` and stack the per-text tensors.

    Each text is passed on its own to ``tokenizer.encode_plus`` with special
    tokens enabled, ``padding='max_length'``, truncation on and PyTorch
    tensors requested.

    Args:
        texts: Iterable of texts to encode (wrapped in a tqdm progress bar).
        tokenizer: Object exposing ``encode_plus``.
        max_len: Value forwarded as ``max_length`` to ``encode_plus``.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-text ``'input_ids'``
        and ``'attention_mask'`` tensors concatenated along dimension 0.
    """
    # Options shared by every encode_plus call
    encode_options = dict(
        max_length=max_len,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    encodings = [
        tokenizer.encode_plus(text, **encode_options)
        for text in tqdm(texts, desc="Tokenizing")
    ]

    ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return ids, masks

# Obtain the encoded train/validation tensors: build them from the CSV when no
# cached file exists at tokenized_data_path, otherwise read the cached file.
if not os.path.exists(tokenized_data_path):
    df = pd.read_csv(dataset_path)
    X, y = df['text'], df['label']

    # 80/20 split with a fixed random_state
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_encoded, X_train_mask = roberta_encode(X_train, tokenizer)
    X_val_encoded, X_val_mask = roberta_encode(X_val, tokenizer)

    y_train_tensor = torch.tensor(y_train.values)
    y_val_tensor = torch.tensor(y_val.values)

    # Save the tokenized data under the same keys the cached branch reads back
    torch.save(
        dict(
            train_input_ids=X_train_encoded,
            train_attention_masks=X_train_mask,
            train_labels=y_train_tensor,
            val_input_ids=X_val_encoded,
            val_attention_masks=X_val_mask,
            val_labels=y_val_tensor,
        ),
        tokenized_data_path,
    )
else:
    # NOTE(review): torch.load unpickles the file — only point this at trusted data.
    tokenized_data = torch.load(tokenized_data_path)
    X_train_encoded = tokenized_data['train_input_ids']
    X_train_mask = tokenized_data['train_attention_masks']
    y_train_tensor = tokenized_data['train_labels']
    X_val_encoded = tokenized_data['val_input_ids']
    X_val_mask = tokenized_data['val_attention_masks']
    y_val_tensor = tokenized_data['val_labels']

# Wrap the tensors as datasets
train_dataset = TensorDataset(X_train_encoded, X_train_mask, y_train_tensor)
val_dataset = TensorDataset(X_val_encoded, X_val_mask, y_val_tensor)

# 'balanced' class weights, looked up per training label, drive sampling with replacement
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train_tensor.numpy()),
    y=y_train_tensor.numpy(),
)
weights = torch.tensor(class_weights, dtype=torch.float)
train_sampler = WeightedRandomSampler(
    weights=weights[y_train_tensor.long()],
    num_samples=len(y_train_tensor),
    replacement=True,
)

# Training batches come from the weighted sampler; validation is read in order
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=16)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.cuda.amp as amp  # For mixed-precision training

# Function to save the model
def save_model(model, model_dir):
    """Write ``model`` into ``model_dir`` via ``model.save_pretrained``.

    The directory (including any missing parents) is created first when
    nothing exists at ``model_dir`` yet.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        model_dir: Destination directory path.
    """
    target_missing = not os.path.exists(model_dir)
    if target_missing:
        # Build the whole directory chain before handing the path to the model
        os.makedirs(model_dir)
    model.save_pretrained(model_dir)

# Function to initialize or load the model
def initialize_model(model_dir):
    """Return a 2-label RoBERTa classifier, preferring the copy stored in ``model_dir``.

    When ``model_dir`` exists and lists a ``config.json`` entry, the model is
    loaded from that directory. Otherwise 'roberta-base' is loaded and
    immediately written to ``model_dir`` through ``save_model``.

    Args:
        model_dir: Directory to load the model from / save the fresh model to.

    Returns:
        The loaded ``RobertaForSequenceClassification`` instance.
    """
    checkpoint_available = os.path.exists(model_dir) and "config.json" in os.listdir(model_dir)
    if not checkpoint_available:
        fresh_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
        save_model(fresh_model, model_dir)
        return fresh_model
    return RobertaForSequenceClassification.from_pretrained(model_dir, num_labels=2)

# Build the classifier through initialize_model
model = initialize_model(model_save_path)

# Gradient scaler for the mixed-precision training loop
scaler = amp.GradScaler()

# Optimizer plus a linear schedule without warmup, sized for 4 passes over the training loader
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=4 * len(train_dataloader),  # Adjust epochs as necessary
)

# Function to train the model with mixed-precision
def train(model, train_dataloader, val_dataloader, optimizer, scheduler, scaler, device, model_dir, epochs=4):
    """Fine-tune ``model`` with mixed precision and save the best checkpoint.

    After every epoch the model is scored with ``evaluate`` on
    ``val_dataloader``; whenever the validation accuracy beats the best value
    seen so far, the model is written to ``model_dir`` via ``save_model``.
    The best accuracy is printed once after the last epoch.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``loss``.
        train_dataloader: Iterable of batches; items 0, 1 and 2 of each batch are
            used as input ids, attention mask and labels.
        val_dataloader: Loader handed to ``evaluate`` after each epoch.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: Learning-rate scheduler stepped once per batch.
        scaler: Gradient scaler used to scale the loss and step the optimizer.
        device: Device the batch tensors are moved to.
        model_dir: Directory passed to ``save_model`` for the best checkpoint.
        epochs: Number of passes over ``train_dataloader``.
    """
    best_val_accuracy = float('-inf')

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} - Training")

        for batch in train_progress_bar:
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)

            model.zero_grad()

            with amp.autocast():
                outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            train_progress_bar.set_postfix({'Loss': batch_loss})

        avg_train_loss = total_loss / len(train_dataloader)
        avg_val_accuracy, avg_val_loss = evaluate(model, val_dataloader, device)

        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f}")

        # Check and save the best model. This must run inside the epoch loop so
        # every epoch is compared, and before the final summary is printed.
        if avg_val_accuracy > best_val_accuracy:
            best_val_accuracy = avg_val_accuracy
            save_model(model, model_dir)

    print(f"Best Validation Accuracy: {best_val_accuracy}")

# Function for evaluation
def evaluate(model, dataloader, device):
    """Compute the accuracy and mean batch loss of ``model`` over ``dataloader``.

    Accuracy is counted over all samples with plain tensor operations (no
    external metric helper is needed), so a shorter final batch is not
    over-weighted.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``loss`` and ``logits``.
        dataloader: Iterable of batches; items 0, 1 and 2 of each batch are used
            as input ids, attention mask and labels.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_val_accuracy, avg_val_loss)``: correctly predicted samples
        divided by all samples, and the summed batch losses divided by the
        number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    correct = 0
    seen = 0
    num_batches = 0
    val_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)

            with amp.autocast():
                outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
                loss = outputs.loss
                logits = outputs.logits

            val_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)
            correct += (predictions == batch_labels).sum().item()
            seen += batch_labels.numel()
            num_batches += 1

    if num_batches == 0 or seen == 0:
        raise ValueError("evaluate() received a dataloader that yielded no samples")

    avg_val_loss = val_loss / num_batches
    avg_val_accuracy = correct / seen
    return avg_val_accuracy, avg_val_loss


In [None]:
# Function to tokenize and encode the dataset using RoBERTa tokenizer
def roberta_encode(texts, tokenizer, max_len=512):
    """Encode every text with ``tokenizer`` and stack the per-text tensors.

    Each text is passed on its own to ``tokenizer.encode_plus`` with special
    tokens enabled, ``padding='max_length'``, truncation on and PyTorch
    tensors requested.

    Args:
        texts: Iterable of texts to encode (wrapped in a tqdm progress bar).
        tokenizer: Object exposing ``encode_plus``.
        max_len: Value forwarded as ``max_length`` to ``encode_plus``.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-text ``'input_ids'``
        and ``'attention_mask'`` tensors concatenated along dimension 0.
    """
    # Options shared by every encode_plus call
    encode_options = dict(
        max_length=max_len,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    encodings = [
        tokenizer.encode_plus(text, **encode_options)
        for text in tqdm(texts, desc="Tokenizing")
    ]

    ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return ids, masks

# Obtain the encoded train/validation tensors: build them from the CSV when no
# cached file exists at tokenized_data_path, otherwise read the cached file.
if not os.path.exists(tokenized_data_path):
    df = pd.read_csv(dataset_path)
    X, y = df['text'], df['label']

    # 80/20 split with a fixed random_state
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_encoded, X_train_mask = roberta_encode(X_train, tokenizer)
    X_val_encoded, X_val_mask = roberta_encode(X_val, tokenizer)

    y_train_tensor = torch.tensor(y_train.values)
    y_val_tensor = torch.tensor(y_val.values)

    # Save the tokenized data under the same keys the cached branch reads back
    torch.save(
        dict(
            train_input_ids=X_train_encoded,
            train_attention_masks=X_train_mask,
            train_labels=y_train_tensor,
            val_input_ids=X_val_encoded,
            val_attention_masks=X_val_mask,
            val_labels=y_val_tensor,
        ),
        tokenized_data_path,
    )
else:
    # NOTE(review): torch.load unpickles the file — only point this at trusted data.
    tokenized_data = torch.load(tokenized_data_path)
    X_train_encoded = tokenized_data['train_input_ids']
    X_train_mask = tokenized_data['train_attention_masks']
    y_train_tensor = tokenized_data['train_labels']
    X_val_encoded = tokenized_data['val_input_ids']
    X_val_mask = tokenized_data['val_attention_masks']
    y_val_tensor = tokenized_data['val_labels']

# Wrap the tensors as datasets
train_dataset = TensorDataset(X_train_encoded, X_train_mask, y_train_tensor)
val_dataset = TensorDataset(X_val_encoded, X_val_mask, y_val_tensor)

# 'balanced' class weights, looked up per training label, drive sampling with replacement
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train_tensor.numpy()),
    y=y_train_tensor.numpy(),
)
weights = torch.tensor(class_weights, dtype=torch.float)
train_sampler = WeightedRandomSampler(
    weights=weights[y_train_tensor.long()],
    num_samples=len(y_train_tensor),
    replacement=True,
)

# Training batches come from the weighted sampler; validation is read in order
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=16)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.cuda.amp as amp  # For mixed-precision training

# Function to initialize or load the model
def initialize_model(model_path):
    """Load the 2-label RoBERTa classifier from ``model_path`` or from 'roberta-base'.

    ``model_path`` is used only when it is a directory that contains a
    ``config.json`` file. A missing directory — or one that exists but holds
    no saved model, such as a freshly created empty directory — falls back to
    the pretrained 'roberta-base' checkpoint instead of being handed to
    ``from_pretrained``.

    Args:
        model_path: Directory that may contain a previously saved model.

    Returns:
        The loaded ``RobertaForSequenceClassification`` instance.
    """
    has_saved_model = os.path.isdir(model_path) and os.path.isfile(os.path.join(model_path, "config.json"))
    source = model_path if has_saved_model else 'roberta-base'
    return RobertaForSequenceClassification.from_pretrained(source, num_labels=2)  # Adjust num_labels based on your task

# Build the classifier through initialize_model
model = initialize_model(model_save_path)

# Gradient scaler for the mixed-precision training loop
scaler = amp.GradScaler()

# Optimizer plus a linear schedule without warmup, sized for 4 passes over the training loader
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=4 * len(train_dataloader),  # Adjust epochs if necessary
)

# Function to train the model with mixed-precision
def train(model, train_dataloader, val_dataloader, optimizer, scheduler, scaler, device, model_save_path, epochs):
    """Run the mixed-precision training loop and keep the best-scoring model.

    Each epoch trains over ``train_dataloader``, then calls ``evaluate`` on
    ``val_dataloader``. When the validation accuracy exceeds the best value so
    far, ``model.save_pretrained(model_save_path)`` is called. The best
    accuracy is printed after the final epoch.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``loss``.
        train_dataloader: Iterable of batches; items 0, 1 and 2 of each batch are
            used as input ids, attention mask and labels.
        val_dataloader: Loader handed to ``evaluate`` after each epoch.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: Learning-rate scheduler stepped once per batch.
        scaler: Gradient scaler used to scale the loss and step the optimizer.
        device: Device the batch tensors are moved to.
        model_save_path: Destination passed to ``model.save_pretrained``.
        epochs: Number of passes over ``train_dataloader``.
    """
    best_val_accuracy = float('-inf')

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} - Training")

        for batch in progress:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            model.zero_grad()

            # Forward pass under autocast; only the loss is needed for training
            with amp.autocast():
                loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss

            # Scaled backward pass, optimizer step through the scaler, then LR schedule
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            step_loss = loss.item()
            running_loss += step_loss
            progress.set_postfix({'Loss': step_loss})

        avg_train_loss = running_loss / len(train_dataloader)
        avg_val_accuracy, avg_val_loss = evaluate(model, val_dataloader, device)

        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f}")

        improved = avg_val_accuracy > best_val_accuracy
        if improved:
            best_val_accuracy = avg_val_accuracy
            model.save_pretrained(model_save_path)

    print(f"Best Validation Accuracy: {best_val_accuracy}")

# Function for evaluation
def evaluate(model, dataloader, device):
    """Compute the accuracy and mean batch loss of ``model`` over ``dataloader``.

    Accuracy is counted over all samples with plain tensor operations (no
    external metric helper is needed), so a shorter final batch is not
    over-weighted.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``loss`` and ``logits``.
        dataloader: Iterable of batches; items 0, 1 and 2 of each batch are used
            as input ids, attention mask and labels.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_val_accuracy, avg_val_loss)``: correctly predicted samples
        divided by all samples, and the summed batch losses divided by the
        number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    correct = 0
    seen = 0
    num_batches = 0
    val_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)

            with amp.autocast():
                outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
                loss = outputs.loss
                logits = outputs.logits

            val_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)
            correct += (predictions == batch_labels).sum().item()
            seen += batch_labels.numel()
            num_batches += 1

    if num_batches == 0 or seen == 0:
        raise ValueError("evaluate() received a dataloader that yielded no samples")

    avg_val_loss = val_loss / num_batches
    avg_val_accuracy = correct / seen
    return avg_val_accuracy, avg_val_loss


In [None]:
# Function to encode test data
def encode_test_data(test_texts, tokenizer, max_len=512):
    """Encode the test texts with ``tokenizer`` and stack the per-text tensors.

    Uses the same ``encode_plus`` options as ``roberta_encode``; the sequence
    length is configurable so that test-time encoding can be kept in step
    with the length used for training.

    Args:
        test_texts: Iterable of texts to encode (wrapped in a tqdm progress bar).
        tokenizer: Object exposing ``encode_plus``.
        max_len: Value forwarded as ``max_length`` to ``encode_plus``
            (defaults to 512, the previously hard-coded value).

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-text ``'input_ids'``
        and ``'attention_mask'`` tensors concatenated along dimension 0.
    """
    input_ids = []
    attention_masks = []

    for text in tqdm(test_texts, desc="Encoding Test Data"):
        encoded = tokenizer.encode_plus(
            text,
            max_length=max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Function to create submission file
def create_submission_file(model, dataloader, submission_file_path, ids):
    """Predict class-1 probabilities for every batch and write them to a CSV.

    The model is switched to eval mode and moved to CUDA when available
    (CPU otherwise). The output file has the columns ``id`` and ``generated``
    and no index column.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``logits``.
        dataloader: Iterable of batches; items 0 and 1 of each batch are used as
            input ids and attention mask.
        submission_file_path: Path of the CSV file to write.
        ids: Values for the ``id`` column, one per predicted sample.
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Predictions"):
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)

            with amp.autocast():
                outputs = model(batch_input_ids, attention_mask=batch_input_mask)

            # Logits produced under autocast may be reduced precision; upcast before
            # the softmax so the submitted probabilities keep float32 resolution.
            logits = outputs.logits.float()
            probs = torch.nn.functional.softmax(logits, dim=-1)
            predictions.extend(probs[:, 1].detach().cpu().numpy())  # Adjust as per the specific task

    submission_df = pd.DataFrame({'id': ids, 'generated': predictions})
    submission_df.to_csv(submission_file_path, index=False)

# Input CSV with the test essays and output location of the submission file
test_data_path = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'  # Adjust path as needed
submission_file_path = '/kaggle/working/submission.csv'

# Load the test data and prepare it for prediction
test_data = pd.read_csv(test_data_path)
X_test_encoded, X_test_mask = encode_test_data(test_data['text'], tokenizer)  # Adjust column name as needed
test_dataset = TensorDataset(X_test_encoded, X_test_mask)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Generate the predictions and write them to the submission file
create_submission_file(
    model=model,
    dataloader=test_dataloader,
    submission_file_path=submission_file_path,
    ids=test_data['id'],  # Adjust column name as needed
)
