In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/maindata1/final_train.csv
/kaggle/input/maindata1/final_test.csv
/kaggle/input/saved-model/config.json
/kaggle/input/saved-model/tokenizer_config.json
/kaggle/input/saved-model/model.safetensors
/kaggle/input/saved-model/special_tokens_map.json
/kaggle/input/saved-model/vocab.txt
/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/saved-values/tokenized_val_data.pt
/kaggle/input/saved-values/tokenized_train_data.pt


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from tqdm import tqdm

# Pre-download DistilBert model and tokenizer - Run this part with Internet "on"
if not os.path.exists('/kaggle/input/saved-model'):
    tokenizer = DistilBertTokenizer.from_pretrained('saved_model')
    model = DistilBertForSequenceClassification.from_pretrained('saved_model')
    tokenizer.save_pretrained('/kaggle/working/distilbert-base-uncased')
    model.save_pretrained('/kaggle/working/distilbert-base-uncased')

# Function to save tokenized data
def save_tokenized_data(input_ids, attention_masks, labels, file_path):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    torch.save({
        'input_ids': input_ids,
        'attention_masks': attention_masks,
        'labels': labels
    }, file_path)

# Function to load tokenized data
def load_tokenized_data(file_path):
    data = torch.load(file_path)
    return data['input_ids'], data['attention_masks'], data['labels']

# Modified bert_encode function with progress tracking
def bert_encode(texts, tokenizer, max_len=512):
    input_ids = []
    attention_masks = []

    for text in tqdm(texts, desc="Tokenizing"):
        encoded = tokenizer.encode_plus(
            text, 
            max_length=max_len, 
            add_special_tokens=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Function for training the model
def train(model, train_dataloader, val_dataloader, optimizer, save_path, epochs=4):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} - Training")
        for step, batch in enumerate(train_progress_bar):
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)

            model.zero_grad()

            outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            train_progress_bar.set_postfix({'Loss': loss.item()})

        avg_train_loss = total_loss / len(train_dataloader)

        model.eval()
        val_accuracy = []
        val_loss = 0
        val_progress_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} - Validation")

        with torch.no_grad():
            for batch in val_progress_bar:
                batch_input_ids = batch[0].to(device)
                batch_input_mask = batch[1].to(device)
                batch_labels = batch[2].to(device)

                outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
                loss = outputs.loss
                val_loss += loss.item()

                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                accuracy = accuracy_score(batch_labels.cpu().numpy(), predictions.cpu().numpy())
                val_accuracy.append(accuracy)

                val_progress_bar.set_postfix({'Loss': loss.item()})

        avg_val_loss = val_loss / len(val_dataloader)
        avg_val_accuracy = sum(val_accuracy) / len(val_accuracy)

        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f}")

    torch.save(model.state_dict(), f"{save_path}/model_final.pth")

# Function to load model weights
def load_weights(model, weights_path):
    model.load_state_dict(torch.load(weights_path))
    model.eval()

# Paths for datasets
dataset_path = '/kaggle/input/maindata1/final_train.csv'
tokenized_train_path = '/kaggle/input/saved-values/tokenized_train_data.pt'
tokenized_val_path = '/kaggle/input/saved-values/tokenized_val_data.pt'

# Tokenizer and model loading from the saved files
tokenizer = DistilBertTokenizer.from_pretrained('/kaggle/input/saved-model')
model = DistilBertForSequenceClassification.from_pretrained('/kaggle/input/saved-model')

# Check if tokenized data exists
if not os.path.exists(tokenized_train_path) or not os.path.exists(tokenized_val_path):
    print("Tokenizing...")
    train_essays = pd.read_csv(dataset_path)
    X = train_essays['text']
    y = train_essays['label']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_encoded, X_train_mask = bert_encode(X_train, tokenizer)
    X_val_encoded, X_val_mask = bert_encode(X_val, tokenizer)

    y_train_tensor = torch.tensor(y_train.values)
    y_val_tensor = torch.tensor(y_val.values)

    save_tokenized_data(X_train_encoded, X_train_mask, y_train_tensor, tokenized_train_path)
    save_tokenized_data(X_val_encoded, X_val_mask, y_val_tensor, tokenized_val_path)
else:
    print("Loading tokenized data...")
    X_train_encoded, X_train_mask, y_train_tensor = load_tokenized_data(tokenized_train_path)
    X_val_encoded, X_val_mask, y_val_tensor = load_tokenized_data(tokenized_val_path)

class_weights = compute_class_weight('balanced', classes=np.unique(y_train_tensor), y=y_train_tensor.numpy())
weights = torch.tensor(class_weights, dtype=torch.float)
class_weights = weights[y_train_tensor.long()]
train_sampler = WeightedRandomSampler(weights=class_weights, num_samples=len(class_weights), replacement=True)

train_dataset = TensorDataset(X_train_encoded, X_train_mask, y_train_tensor)
val_dataset = TensorDataset(X_val_encoded, X_val_mask, y_val_tensor)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=16)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)

optimizer = AdamW(model.parameters(), lr=2e-5)

train(model, train_dataloader, val_dataloader, optimizer, "/kaggle/working", epochs=1)

# Test Data Processing and Submission File Creation
test_data_path = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
test_data = pd.read_csv(test_data_path)
X_test_encoded, X_test_mask = bert_encode(test_data['text'], tokenizer)
dummy_labels = torch.zeros(len(X_test_encoded))

test_dataset = TensorDataset(X_test_encoded, X_test_mask, dummy_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16)

def create_submission_file(model, dataloader, submission_file_path, test_data):
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    predictions = []

    for batch in tqdm(dataloader, desc="Generating Predictions"):
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_input_mask)
        logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        predictions.extend(probs[:,1].detach().cpu().numpy())

    submission_df = pd.DataFrame({'id': test_data['id'], 'generated': predictions})
    submission_df.to_csv(submission_file_path, index=False)

create_submission_file(model, test_dataloader, '/kaggle/working/submission.csv', test_data)




Loading tokenized data...


Epoch 1/1 - Training: 100%|██████████| 17349/17349 [2:05:36<00:00,  2.30it/s, Loss=0.000771]
Epoch 1/1 - Validation: 100%|██████████| 4338/4338 [10:22<00:00,  6.97it/s, Loss=0.000178]


Epoch 1/1 - Training Loss: 0.0137, Validation Loss: 0.0040, Validation Accuracy: 0.9990


Tokenizing:   0%|          | 0/3 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Tokenizing: 100%|██████████| 3/3 [00:00<00:00, 472.65it/s]
Generating Predictions: 100%|██████████| 1/1 [00:00<00:00, 34.21it/s]
