In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print each one
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tokenized-d/tokenized_data.pt
/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/maindata1/final_train.csv
/kaggle/input/maindata1/final_test.csv


In [None]:
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm


def optimal_max_len(texts, tokenizer, percentile=0.95):
    """Return the token length that covers the given percentile of `texts`.

    Args:
        texts: Iterable of strings to measure.
        tokenizer: Object exposing `encode(text, add_special_tokens=True)` that returns
            a sequence of token ids.
        percentile: Coverage level. Values in [0, 1] are treated as fractions
            (0.95 means the 95th percentile); larger values are taken as already
            being on numpy's 0-100 scale.

    Returns:
        The percentile of the token lengths, truncated to an int.

    Raises:
        ValueError: If `texts` is empty.
    """
    token_lengths = [len(tokenizer.encode(text, add_special_tokens=True)) for text in texts]
    if not token_lengths:
        raise ValueError("texts must contain at least one item")
    # np.percentile expects q on a 0-100 scale; passing the fraction 0.95 straight
    # through would return (almost) the shortest length instead of the 95th percentile.
    q = percentile * 100 if 0 <= percentile <= 1 else percentile
    return int(np.percentile(token_lengths, q))

# Report the sequence length that covers most of the training texts.
# The result is stored under its own name so the `optimal_max_len` function is not
# overwritten (bert_encode calls it when max_len is None), and the call is skipped when
# `X_train` / `tokenizer` have not been created yet at this point of the cell.
if 'X_train' in globals() and 'tokenizer' in globals():
    optimal_len_value = optimal_max_len(X_train, tokenizer)
    print(f"Optimal maximum length: {optimal_len_value}")


# Tokenization function using BERT tokenizer
def bert_encode(texts, tokenizer, max_len=None):
    """Tokenize `texts` into padded input-id and attention-mask tensors.

    Args:
        texts: Iterable of strings. It is materialized into a list so that it can be
            traversed twice (once to infer the length, once to encode).
        tokenizer: Tokenizer exposing `encode_plus`; also passed to `optimal_max_len`
            when `max_len` is None.
        max_len: Fixed sequence length used for padding/truncation. When None it is
            inferred with `optimal_max_len` and capped at `tokenizer.model_max_length`
            (when the tokenizer exposes one) so the model's position limit is not exceeded.

    Returns:
        Tuple `(input_ids, attention_masks)`, each the per-text tensors returned by the
        tokenizer concatenated along dim 0. For empty input both are `(0, max_len or 0)`
        long tensors.
    """
    texts = list(texts)
    if not texts:
        # torch.cat cannot concatenate an empty list; return well-formed empty tensors.
        width = max_len or 0
        return torch.empty((0, width), dtype=torch.long), torch.empty((0, width), dtype=torch.long)

    if max_len is None:
        max_len = optimal_max_len(texts, tokenizer)
        model_limit = getattr(tokenizer, 'model_max_length', None)
        if isinstance(model_limit, int) and max_len > model_limit:
            max_len = model_limit

    input_ids = []
    attention_masks = []

    for text in tqdm(texts, desc="Tokenizing"):
        encoded = tokenizer.encode_plus(
            text,
            max_length=max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Improved training function with BERT model
def train(model, train_dataloader, val_dataloader, optimizer, scheduler, model_save_dir, epochs=4):
    """Fine-tune `model` and checkpoint the weights with the best validation accuracy.

    Every batch from both dataloaders is indexed as (input_ids, attention_mask, labels).
    After each epoch the model is evaluated on `val_dataloader`; whenever the validation
    accuracy improves, the state dict is written to `<model_save_dir>/best_model.pth`.

    Args:
        model: Module called as `model(input_ids, attention_mask=..., labels=...)` whose
            output exposes `.loss` and `.logits`.
        train_dataloader: Iterable of training batches (must support `len`).
        val_dataloader: Iterable of validation batches (must support `len`).
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        model_save_dir: Directory for the checkpoint; created if missing.
        epochs: Number of passes over `train_dataloader`.

    Returns:
        None. Per-epoch metrics and the best validation accuracy are printed.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    best_val_accuracy = float('-inf')

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} - Training")

        for batch in train_progress_bar:
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)

            model.zero_grad()
            outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer update
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            train_progress_bar.set_postfix({'Loss': loss.item()})

        # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError
        avg_train_loss = total_loss / max(len(train_dataloader), 1)

        model.eval()
        val_loss = 0
        correct_predictions = 0
        seen_examples = 0
        val_progress_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} - Validation")

        with torch.no_grad():
            for batch in val_progress_bar:
                batch_input_ids = batch[0].to(device)
                batch_input_mask = batch[1].to(device)
                batch_labels = batch[2].to(device)

                outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
                loss = outputs.loss
                val_loss += loss.item()

                predictions = torch.argmax(outputs.logits, dim=-1)
                # Count correct examples rather than averaging per-batch accuracies, so a
                # smaller final batch does not get the same weight as a full one.
                correct_predictions += (predictions == batch_labels).sum().item()
                seen_examples += batch_labels.size(0)

                val_progress_bar.set_postfix({'Loss': loss.item()})

        avg_val_loss = val_loss / max(len(val_dataloader), 1)
        avg_val_accuracy = correct_predictions / max(seen_examples, 1)
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f}")

        if avg_val_accuracy > best_val_accuracy:
            best_val_accuracy = avg_val_accuracy
            # Make sure the save directory exists, then persist the best weights so far
            os.makedirs(model_save_dir, exist_ok=True)
            torch.save(model.state_dict(), os.path.join(model_save_dir, 'best_model.pth'))

    print(f"Best Validation Accuracy: {best_val_accuracy}")



# Function to load model weights
def load_best_model(model_path):
    """Build a `bert-base-uncased` sequence classifier and load saved weights into it.

    Args:
        model_path: Path to a state dict file written with `torch.save(model.state_dict(), ...)`.

    Returns:
        The model with the loaded weights, switched to evaluation mode.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    # map_location='cpu' lets a checkpoint saved on a GPU be restored on a CPU-only
    # machine; callers move the model to their device afterwards.
    state_dict = torch.load(model_path, map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Paths for datasets and models
dataset_path = '/kaggle/input/maindata1/final_train.csv'
tokenized_data_path = '/kaggle/input/tokenized-d/tokenized_data.pt'
model_save_path = '/kaggle/working'
# /kaggle/input is read-only, so freshly tokenized data is cached in the working directory
tokenized_cache_path = os.path.join(model_save_path, 'tokenized_data.pt')

# Load the tokenizer from the working-directory cache, or download and cache it
tokenizer_path = '/kaggle/working/tokenizer'
if os.path.exists(tokenizer_path):
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.save_pretrained(tokenizer_path)

model_path = f"{model_save_path}/best_model.pth"

# best_model.pth is a raw state dict written by torch.save, not a save_pretrained
# directory, so the architecture is created first and the weights are loaded into it.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location='cpu'))

# Load pre-tokenized data if available (input dataset first, then the working-dir cache);
# otherwise tokenize the raw CSV and cache the result.
existing_tokenized_path = next(
    (path for path in (tokenized_data_path, tokenized_cache_path) if os.path.exists(path)),
    None,
)
if existing_tokenized_path is not None:
    data = torch.load(existing_tokenized_path)
    X_train_encoded, X_train_mask, y_train_tensor = data['train_input_ids'], data['train_attention_masks'], data['train_labels']
    X_val_encoded, X_val_mask, y_val_tensor = data['val_input_ids'], data['val_attention_masks'], data['val_labels']
else:
    train_essays = pd.read_csv(dataset_path)
    X = train_essays['text']
    y = train_essays['label']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_encoded, X_train_mask = bert_encode(X_train, tokenizer)
    X_val_encoded, X_val_mask = bert_encode(X_val, tokenizer)

    y_train_tensor = torch.tensor(y_train.values)
    y_val_tensor = torch.tensor(y_val.values)

    torch.save({
        'train_input_ids': X_train_encoded,
        'train_attention_masks': X_train_mask,
        'train_labels': y_train_tensor,
        'val_input_ids': X_val_encoded,
        'val_attention_masks': X_val_mask,
        'val_labels': y_val_tensor
    }, tokenized_cache_path)

# Handling class imbalance: "balanced" class weights become per-sample sampling weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_tensor), y=y_train_tensor.numpy())
weights = torch.tensor(class_weights, dtype=torch.float)
train_sampler = WeightedRandomSampler(weights=weights[y_train_tensor.long()], num_samples=len(y_train_tensor), replacement=True)

train_dataset = TensorDataset(X_train_encoded, X_train_mask, y_train_tensor)
val_dataset = TensorDataset(X_val_encoded, X_val_mask, y_val_tensor)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=16)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)

# Initialize optimizer and scheduler (the schedule is sized for 4 passes over the training data)
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 4)

# Function to save tokenized data
def save_tokenized_data(input_ids, attention_masks, labels, file_path):
    """Persist tokenized inputs and labels to `file_path` with `torch.save`.

    The file holds a dict with the keys 'input_ids', 'attention_masks' and 'labels'.

    Args:
        input_ids: Object stored under 'input_ids'.
        attention_masks: Object stored under 'attention_masks'.
        labels: Object stored under 'labels'.
        file_path: Destination file. Missing parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname, and os.makedirs('') raises FileNotFoundError
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save({
        'input_ids': input_ids,
        'attention_masks': attention_masks,
        'labels': labels
    }, file_path)

# Function to load tokenized data
def load_tokenized_data(file_path):
    """Read a file written with `torch.save` and return its tokenized contents.

    The stored object is indexed with the keys 'input_ids', 'attention_masks' and
    'labels'; their values are returned as a tuple in that order.
    """
    payload = torch.load(file_path)
    return tuple(payload[key] for key in ('input_ids', 'attention_masks', 'labels'))

# Check if saved model and tokenized data exist, and load them
tokenized_train_path = '/kaggle/working/tokenized_train_data.pt'
tokenized_val_path = '/kaggle/working/tokenized_val_data.pt'

model_path = '/kaggle/working/best_model.pth'

# Reuse a saved checkpoint when present; otherwise train from the freshly built loaders
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()
else:
    print("Model file not found. Training the model.")
    # Train the model (train() writes best_model.pth into the given directory)
    train(model, train_dataloader, val_dataloader, optimizer, scheduler, '/kaggle/working', epochs=1)
    # Only save the final weights when no checkpoint was written, so the best
    # checkpoint is never overwritten by the last-epoch weights.
    if not os.path.exists(model_path):
        torch.save(model.state_dict(), model_path)

# Load the per-split tokenized caches, or create them from the tensors already in memory
if os.path.exists(tokenized_train_path) and os.path.exists(tokenized_val_path):
    X_train_encoded, X_train_mask, y_train_tensor = load_tokenized_data(tokenized_train_path)
    X_val_encoded, X_val_mask, y_val_tensor = load_tokenized_data(tokenized_val_path)
else:
    # The encoded tensors were produced (or loaded) earlier in this cell. The raw
    # X_train / y_train only exist when tokenization ran in this session, so the
    # in-memory tensors are cached instead of re-tokenizing.
    save_tokenized_data(X_train_encoded, X_train_mask, y_train_tensor, tokenized_train_path)
    save_tokenized_data(X_val_encoded, X_val_mask, y_val_tensor, tokenized_val_path)

# Load the best model for inference (load_best_model takes only the checkpoint path
# and returns the model, so the result has to be kept)
model = load_best_model(model_path)

# Rebuild the class-balanced sampler and the loaders from the (possibly reloaded) tensors
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_tensor), y=y_train_tensor.numpy())
weights = torch.tensor(class_weights, dtype=torch.float)
train_sampler = WeightedRandomSampler(weights=weights[y_train_tensor.long()], num_samples=len(y_train_tensor), replacement=True)

train_dataset = TensorDataset(X_train_encoded, X_train_mask, y_train_tensor)
val_dataset = TensorDataset(X_val_encoded, X_val_mask, y_val_tensor)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=16)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)


# Tokenize the competition test essays; the zero labels only fill the third dataset slot
test_data_path = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
test_data = pd.read_csv(test_data_path)
X_test_encoded, X_test_mask = bert_encode(test_data['text'], tokenizer)
dummy_labels = torch.zeros(len(X_test_encoded))

test_dataset = TensorDataset(X_test_encoded, X_test_mask, dummy_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16)


def create_submission_file(model, dataloader, submission_file_path, test_data):
    """Score every batch in `dataloader` and write an id/generated CSV.

    For each batch the first two elements are used as input ids and attention mask.
    The softmax value at index 1 of the model's logits is stored in the 'generated'
    column next to `test_data['id']`, and the frame is written to
    `submission_file_path` without the index.
    """
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    model.to(target_device)

    predictions = []
    for input_ids, input_mask, *_unused in tqdm(dataloader, desc="Generating Predictions"):
        # Inference only: no gradients are needed for the forward pass
        with torch.no_grad():
            logits = model(
                input_ids.to(target_device),
                attention_mask=input_mask.to(target_device),
            ).logits
        index_one_probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1]
        predictions.extend(index_one_probs.detach().cpu().numpy())

    pd.DataFrame(
        {'id': test_data['id'], 'generated': predictions}
    ).to_csv(submission_file_path, index=False)

# Write the submission CSV from the model's predictions on the test essays
create_submission_file(
    model=model,
    dataloader=test_dataloader,
    submission_file_path='/kaggle/working/submission.csv',
    test_data=test_data,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model file not found. Training the model.


Epoch 1/1 - Training:  75%|███████▍  | 12976/17349 [3:02:47<1:01:41,  1.18it/s, Loss=1.13e-5] 

## BERT with overfitting mitigation and reduced task complexity

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.utils.class_weight import compute_class_weight

# Locations of the training CSV and of the tokenizer cached in the working directory
dataset_path = '/kaggle/input/maindata1/final_train.csv'
tokenizer_path = '/kaggle/working/tokenizer'

# Reuse the cached tokenizer when it exists; otherwise fetch it and cache it
tokenizer_is_cached = os.path.exists(tokenizer_path)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path if tokenizer_is_cached else 'bert-base-uncased')
if not tokenizer_is_cached:
    tokenizer.save_pretrained(tokenizer_path)

# Read the essays and hold out 20% of them for validation (fixed seed)
train_essays = pd.read_csv(dataset_path)
X, y = train_essays['text'], train_essays['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Tokenization function using BERT tokenizer
def bert_encode(texts, tokenizer, max_len=128):  # Reduced max_len for smaller datasets
    """Encode each text with `tokenizer.encode_plus`, padded/truncated to `max_len`.

    Returns a tuple `(input_ids, attention_masks)` where each element is the per-text
    tensors concatenated along dim 0.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            max_length=max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in texts
    ]
    stacked_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    stacked_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return stacked_ids, stacked_masks

# Encode the training split first, then the validation split, with the same settings
(X_train_encoded, X_train_mask), (X_val_encoded, X_val_mask) = (
    bert_encode(split_texts, tokenizer) for split_texts in (X_train, X_val)
)

# Turn the label Series into tensors
y_train_tensor, y_val_tensor = (
    torch.tensor(split_labels.values) for split_labels in (y_train, y_val)
)

# Bundle ids, masks and labels per split
train_dataset = TensorDataset(X_train_encoded, X_train_mask, y_train_tensor)
val_dataset = TensorDataset(X_val_encoded, X_val_mask, y_val_tensor)


In [None]:
from torch.utils.data import DataLoader, SequentialSampler, WeightedRandomSampler
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import os

# Class imbalance: "balanced" class weights are looked up per sample and used as
# sampling weights (with replacement) for the training loader
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train_tensor),
    y=y_train_tensor.numpy(),
)
weights = torch.tensor(class_weights, dtype=torch.float)
per_sample_weights = weights[y_train_tensor.long()]
train_sampler = WeightedRandomSampler(
    weights=per_sample_weights,
    num_samples=len(y_train_tensor),
    replacement=True,
)

# Training batches come from the weighted sampler; validation is read in order
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=16)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)

# BERT classifier with one output per distinct training label
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(np.unique(y_train_tensor)),
)

# Checkpoint location
model_save_path = '/kaggle/working/best_model.pth'

# Without a checkpoint, prepare the optimizer and a linear schedule for training;
# with one, restore the saved weights instead
if not os.path.exists(model_save_path):
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)  # Small learning rate for fine-tuning
    total_steps = len(train_dataloader) * 4
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
else:
    model.load_state_dict(torch.load(model_save_path))


In [None]:
import torch.nn as nn
from tqdm import tqdm
import copy

def train(model, train_dataloader, val_dataloader, optimizer, scheduler, model_save_dir, epochs=4, patience=2):
    """Train `model` with early stopping on the mean validation loss.

    Each batch is indexed as (input_ids, attention_mask, labels). After every epoch the
    mean validation loss is computed; when it improves, the weights are copied in memory
    and written to `<model_save_dir>/best_model.pth`. Training stops once `patience`
    consecutive epochs bring no improvement. The best weights seen are loaded back into
    `model`, which is then returned.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    def batch_loss(batch):
        # Move the three batch tensors to the device and return the model's loss
        input_ids, input_mask, labels = (batch[i].to(device) for i in range(3))
        return model(input_ids, attention_mask=input_mask, labels=labels).loss

    lowest_val_loss = float('inf')
    stale_epochs = 0  # epochs in a row without a better validation loss
    best_weights = copy.deepcopy(model.state_dict())

    for epoch in range(epochs):
        # Training pass
        model.train()
        running_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            model.zero_grad()
            loss = batch_loss(batch)
            running_loss += loss.item()

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            scheduler.step()
        avg_train_loss = running_loss / len(train_dataloader)

        # Validation pass
        model.eval()
        with torch.no_grad():
            summed_val_loss = sum(
                batch_loss(batch).item()
                for batch in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} - Validation")
            )
        avg_val_loss = summed_val_loss / len(val_dataloader)
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

        if avg_val_loss < lowest_val_loss:
            # New best: reset the counter, keep a copy of the weights and checkpoint them
            lowest_val_loss = avg_val_loss
            stale_epochs = 0
            best_weights = copy.deepcopy(model.state_dict())
            if not os.path.exists(model_save_dir):
                os.makedirs(model_save_dir)
            torch.save(model.state_dict(), os.path.join(model_save_dir, 'best_model.pth'))
            continue

        stale_epochs += 1
        if stale_epochs >= patience:
            print("Early stopping triggered.")
            break

    # Hand back the model carrying the best weights seen
    model.load_state_dict(best_weights)
    return model

# Train only when no checkpoint exists yet. When one exists, `model` already had the
# checkpoint weights loaded, so it is reused; this keeps `trained_model` defined for the
# submission step in both cases.
if not os.path.exists(model_save_path):
    trained_model = train(model, train_dataloader, val_dataloader, optimizer, scheduler, '/kaggle/working', epochs=4, patience=2)
else:
    trained_model = model


In [None]:
def create_submission_file(model, dataloader, submission_file_path, test_data=None):
    """Score every batch in `dataloader` and write an id/generated CSV.

    Args:
        model: Module called as `model(input_ids, attention_mask=...)` whose output
            exposes `.logits`.
        dataloader: Iterable of batches; elements 0 and 1 of each batch are used as
            input ids and attention mask.
        submission_file_path: Destination CSV path (written without the index).
        test_data: Object whose `['id']` column provides the ids, in dataloader order.
            When omitted, the module-level `test_data` is used, which keeps existing
            three-argument calls working.

    Raises:
        NameError: If `test_data` is omitted and no module-level `test_data` exists.
    """
    if test_data is None:
        # Backward-compatible fallback to the notebook-global frame
        try:
            test_data = globals()['test_data']
        except KeyError:
            raise NameError("name 'test_data' is not defined") from None

    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Predictions"):
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)

            outputs = model(batch_input_ids, attention_mask=batch_input_mask)
            logits = outputs.logits
            # The softmax value at index 1 is what goes into the 'generated' column
            probs = torch.nn.functional.softmax(logits, dim=-1)
            predictions.extend(probs[:, 1].detach().cpu().numpy())

    submission_df = pd.DataFrame({'id': test_data['id'], 'generated': predictions})
    submission_df.to_csv(submission_file_path, index=False)

# Read the competition test essays
test_data_path = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
test_data = pd.read_csv(test_data_path)

# Encode the essay texts; the all-zero labels only fill the third slot of the dataset
X_test_encoded, X_test_mask = bert_encode(test_data['text'], tokenizer)
dummy_labels = torch.zeros(len(X_test_encoded))

# Wrap the tensors in a dataset and iterate over it in batches of 16
test_dataset = TensorDataset(X_test_encoded, X_test_mask, dummy_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Write the predictions to the submission CSV
submission_file_path = '/kaggle/working/submission.csv'
create_submission_file(
    model=trained_model,
    dataloader=test_dataloader,
    submission_file_path=submission_file_path,
)
