In [1]:
import pandas as pd
import os

train_dir = './data/train'

ref_df = pd.read_csv('./data/train.csv')

# Build plain records first and create the DataFrame once at the end:
# calling pd.concat inside the loop copies the whole frame on every
# iteration and leaves `labels` with an object dtype.
records = []
for row in ref_df.itertuples(index=False):
    # itertuples keeps each column's own dtype; int() guards the `:04d`
    # format against ids that were parsed as floats.
    article_dir = os.path.join(train_dir, f'article_{int(row.id):04d}')

    # Read both candidate texts of the article, file_1 first, then file_2.
    texts = []
    for file_name in ('file_1.txt', 'file_2.txt'):
        with open(os.path.join(article_dir, file_name), 'r', encoding='utf-8') as f:
            texts.append(f.read().strip())

    # Label convention: 0 = the text marked as real by `real_text_id`,
    # 1 = the other text of the pair. Any `real_text_id` other than 1 is
    # treated as "file_2 is the real one".
    real_index = 0 if row.real_text_id == 1 else 1
    for index, text in enumerate(texts):
        records.append({'text': text, 'labels': 0 if index == real_index else 1})

# Rows 2k and 2k+1 always belong to the same article.
train_df = pd.DataFrame(records, columns=['text', 'labels'])

train_df.head()

Unnamed: 0,text,labels
0,The VIRSA (Visible Infrared Survey Telescope A...,0
1,The China relay network has released a signifi...,1
2,China\nThe goal of this project involves achie...,1
3,The project aims to achieve an accuracy level ...,0
4,Scientists can learn about how galaxies form a...,0


In [20]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel


class DesklibAIDetectionModel(PreTrainedModel):
    """Transformer backbone with masked mean pooling and a single-logit head.

    The backbone is built from ``config`` via ``AutoModel.from_config`` and a
    ``Linear(hidden_size, 1)`` classifier is applied to the mean of the
    non-padding token embeddings.
    """

    config_class = AutoConfig

    def __init__(self, config):
        """Build the backbone and the classifier head from ``config``."""
        super().__init__(config)
        # Initialize the base transformer model.
        self.model = AutoModel.from_config(config)
        # Define a classifier head producing one logit per example.
        self.classifier = nn.Linear(config.hidden_size, 1)
        # Initialize weights (handled by PreTrainedModel)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the backbone, mean-pool the token embeddings and classify.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Optional mask with the same shape as
                ``input_ids``; when omitted every position is treated as a
                real token.
            labels: Optional binary targets, one per example. When given, a
                ``BCEWithLogitsLoss`` is computed against the logits.

        Returns:
            A dict with ``"logits"`` of shape (batch, 1) and, when ``labels``
            is provided, ``"loss"``.
        """
        # The signature allows attention_mask=None, but the pooling below
        # needs a concrete mask; fall back to "attend to everything".
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Forward pass through the transformer
        outputs = self.model(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        # Mean pooling over the positions selected by the attention mask.
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        )
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
        # Clamp so a fully masked row cannot cause a division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        pooled_output = sum_embeddings / sum_mask

        # Classifier
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            # Flatten both sides so (batch,) and (batch, 1) labels both work.
            loss = loss_fct(logits.view(-1), labels.float().view(-1))

        output = {"logits": logits}
        if loss is not None:
            output["loss"] = loss
        return output

In [4]:
from transformers import AutoTokenizer
model_name = "microsoft/deberta-v3-large"

config = AutoConfig.from_pretrained(model_name)
# The class defined in this notebook is DesklibAIDetectionModel; the name
# previously used here was never defined and raised a NameError.
model = DesklibAIDetectionModel(config)
# The constructor builds the backbone with AutoModel.from_config, which only
# creates the architecture; load the pretrained weights into it explicitly.
# The classifier head stays randomly initialised and must be trained.
model.model = AutoModel.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint shared by the tokenizer and the classifier below.
model_name = "distilbert/distilroberta-base"

# Tokenizer first, then the model with a fresh two-class classification head.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both the tokenizer and the classifier come from the same hub checkpoint.
# NOTE(review): later cells read class index 0 as "real/human" text; confirm
# that this checkpoint's own label order matches before using it untrained.
detector_checkpoint = "openai-community/roberta-base-openai-detector"

tokenizer = AutoTokenizer.from_pretrained(detector_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(detector_checkpoint)

In [3]:
from datasets import Dataset

# Wrap the training frame as a Hugging Face dataset and hold out 10% of the
# rows for validation (fixed seed so the split is repeatable).
# NOTE(review): the split is row-wise, so the two texts of one article can
# land on different sides of it.
dataset = Dataset.from_pandas(train_df)
data = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, val_dataset = data['train'], data['test']

def preprocess(batch):
    """Tokenize the ``'text'`` column of ``batch`` with the global tokenizer.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch that is passed in.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': 512,
    }
    texts = batch['text']
    return tokenizer(texts, **tokenizer_options)
# Tokenize both splits in batched mode, training split first.
train_dataset, val_dataset = (
    split.map(preprocess, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

In [5]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np
import torch
from torch.nn import functional as F

# Hyper-parameters for the fine-tuning run, grouped by concern and then
# handed to TrainingArguments in one go.
training_config = {
    # Where checkpoints and logs are written.
    "output_dir": "./debert-kaggle",
    "logging_dir": "./logs",
    "logging_steps": 10,
    "report_to": "none",
    # Schedule and batch sizes.
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best validation loss.
    "eval_strategy": "epoch",
    "eval_steps": None,
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    # Optimiser settings.
    "learning_rate": 3e-5,
    "weight_decay": 0.01,
    "lr_scheduler_type": "cosine",
}
training_args = TrainingArguments(**training_config)

def compute_metrics(eval_pred):
    """Compute binary classification metrics from raw model logits.

    Supports both head layouts used in this notebook:

    * two logits per example (softmax; class 1 is the positive class), and
    * a single logit per example (sigmoid gives the class-1 probability).

    Args:
        eval_pred: A ``(logits, labels)`` pair as passed by ``Trainer``.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``aucroc``. ``aucroc`` is ``nan`` when ``labels`` contains a single
        class, because ROC AUC is undefined in that case.
    """
    logits, labels = eval_pred
    logits = torch.tensor(logits)
    labels = np.asarray(labels).reshape(-1).astype(int)

    if logits.ndim == 1 or logits.shape[-1] == 1:
        # Single-logit head: sigmoid yields P(class 1) directly.
        probs_class1 = torch.sigmoid(logits.reshape(-1)).detach().cpu().numpy()
        preds = (probs_class1 >= 0.5).astype(int)
    else:
        # Two-logit head: softmax over the class axis.
        probs = F.softmax(logits, dim=1).detach().cpu().numpy()
        preds = np.argmax(probs, axis=1)
        probs_class1 = probs[:, 1]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    # roc_auc_score raises when only one class is present, which would abort
    # the whole evaluation on a small validation split.
    if np.unique(labels).size > 1:
        auc = roc_auc_score(labels, probs_class1)
    else:
        auc = float('nan')

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'aucroc' : auc
    }

# Training and validation splits produced by the tokenisation step.
trainer_datasets = {
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
}

# Wire model, arguments, data and the metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    **trainer_datasets,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Run fine-tuning with the Trainer configured above; the returned training
# summary is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Aucroc
1,0.733,0.753852,0.631579,0.666667,0.7,0.636364,0.659091
2,0.6635,0.800013,0.736842,0.736842,0.875,0.636364,0.681818
3,0.6309,1.216903,0.736842,0.736842,0.875,0.636364,0.761364


TrainOutput(global_step=129, training_loss=0.7982987204725428, metrics={'train_runtime': 59.5071, 'train_samples_per_second': 8.621, 'train_steps_per_second': 2.168, 'total_flos': 134975971399680.0, 'train_loss': 0.7982987204725428, 'epoch': 3.0})

In [7]:
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os

test_dir = "./data/test"

# Run on whatever device the model already lives on instead of assuming a
# GPU, and make sure dropout is disabled for inference.
device = next(model.parameters()).device
model.eval()

# Take the article ids from the directory names themselves so that stray
# entries (hidden files, checkpoints, ...) cannot shift or break the ids.
article_prefix = "article_"
articles = sorted(
    (int(name[len(article_prefix):]), name)
    for name in os.listdir(test_dir)
    if name.startswith(article_prefix) and name[len(article_prefix):].isdigit()
)

# Collect plain records and build the frame once; concatenating inside the
# loop copies the whole frame on every iteration.
records = []
for article_id, article_name in articles:
    article_dir = os.path.join(test_dir, article_name)

    # Score file_1 and file_2 independently; index 0 holds file_1's score.
    human_probs = []
    for file_name in ("file_1.txt", "file_2.txt"):
        with open(os.path.join(article_dir, file_name), "r", encoding="utf-8") as f:
            text = f.read().strip()

        inputs = tokenizer(
            text, padding=True, truncation=True, max_length=512, return_tensors="pt"
        ).to(device)
        inputs.pop("token_type_ids", None)

        with torch.no_grad():
            logits = model(**inputs)["logits"]

        if logits.shape[-1] == 1:
            # Single-logit head: the logit scores class 1, so sigmoid(-logit)
            # is the probability of class 0 (the real text).
            human_prob = torch.sigmoid(-logits)[0, 0].item()
        else:
            # Two-logit head: class 0 is the real text.
            human_prob = F.softmax(logits, dim=1)[0, 0].item()
        human_probs.append(human_prob)

    # Ties go to file 2, matching the strict comparison.
    real_text_id = 1 if human_probs[0] > human_probs[1] else 2
    records.append({"id": article_id, "real_text_id": real_text_id})

test_df = pd.DataFrame(records, columns=["id", "real_text_id"])


print(test_df.head())

  id real_text_id
0  0            1
1  1            2
2  2            1
3  3            2
4  4            2


In [8]:
# Write the predictions as the submission file, without the index column.
test_df.to_csv("submission_roberta_openai.csv", index=False)