In [13]:
# Hugging Face checkpoints to fine-tune, one ensemble member each.
# The position in this list becomes the index used in "submission_{idx}.csv".
model_names = [
    # base-sized encoders
    "distilbert-base-uncased",
    "roberta-base",
    "microsoft/deberta-v3-base",
    # large-sized encoders
    "bert-large-uncased",
    "roberta-large",
    "microsoft/deberta-v3-large",
]

In [10]:
import pandas as pd
import os

train_dir = "./data/train"

ref_df = pd.read_csv("./data/train.csv")


def _read_text(path):
    """Return the UTF-8 contents of ``path`` with surrounding whitespace stripped."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read().strip()


# Collect one record per text file and build the frame once at the end;
# concatenating onto a DataFrame inside the loop copies every previous row
# on each iteration.
train_records = []
for article_id, real_text_id in zip(ref_df["id"], ref_df["real_text_id"]):
    article_dir = os.path.join(train_dir, f"article_{int(article_id):04d}")

    file_1_text = _read_text(os.path.join(article_dir, "file_1.txt"))
    file_2_text = _read_text(os.path.join(article_dir, "file_2.txt"))

    # file_1 gets label 0 when real_text_id == 1; for any other value file_2
    # gets label 0. The two files of an article always get opposite labels.
    file_1_label = 0 if real_text_id == 1 else 1
    file_2_label = 1 - file_1_label

    train_records.append({"text": file_1_text, "labels": file_1_label})
    train_records.append({"text": file_2_text, "labels": file_2_label})

# Rows keep the original order: file_1 then file_2 for each article in train.csv.
train_df = pd.DataFrame(train_records, columns=["text", "labels"])

In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import torch.nn.functional as F
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np
import os
import pandas as pd

def prepare_model_and_tokenizer(model_name):
    """Load the tokenizer and a two-label classification model for a checkpoint.

    Args:
        model_name: checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is created with ``num_labels=2``.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2),
    )


def compute_metrics(eval_pred):
    """Compute binary-classification metrics from a ``(logits, labels)`` pair.

    Logits are turned into probabilities with a softmax over axis 1; the
    predicted class is the arg-max, and the class-1 probability is used as
    the score for ROC AUC.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``aucroc``.
    """
    logits, labels = eval_pred

    class_probs = F.softmax(torch.tensor(logits), dim=1)
    class_probs = class_probs.detach().cpu().numpy()
    predicted = np.argmax(class_probs, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average="binary"
    )

    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1,
        "precision": precision,
        "recall": recall,
        # ROC AUC is computed from the probability of class 1, not the hard prediction
        "aucroc": roc_auc_score(labels, class_probs[:, 1]),
    }

def _human_probability(model, tokenizer, text, device):
    """Return the softmax probability the model assigns to class 0 for ``text``.

    Class 0 is the column this notebook treats as the "human" probability.
    """
    inputs = tokenizer(
        text, padding=True, truncation=True, max_length=512, return_tensors="pt"
    ).to(device)
    # token_type_ids are dropped before the forward pass, when the tokenizer emits them
    inputs.pop("token_type_ids", None)

    with torch.no_grad():
        logits = model(**inputs)["logits"]

    return float(F.softmax(logits, dim=1)[0, 0])


def run_test(idx, model, tokenizer):
    """Score every test article pair and write ``submission_{idx}.csv``.

    For each ``./data/test/article_NNNN`` directory the class-0 probability of
    ``file_1.txt`` and ``file_2.txt`` is computed; ``real_text_id`` is 1 when
    file 1 scores strictly higher and 2 otherwise.

    Args:
        idx: index used in the output file name.
        model: sequence-classification model returning ``logits``; it is
            switched to eval mode so dropout is disabled during scoring.
        tokenizer: tokenizer matching ``model``.
    """
    test_dir = "./data/test"
    prefix = "article_"

    # Only real article directories are scored. Counting every entry of
    # os.listdir() breaks as soon as a stray file (e.g. ".DS_Store") is present
    # or the ids are not contiguous from zero.
    articles = sorted(
        (int(name[len(prefix):]), name)
        for name in os.listdir(test_dir)
        if name.startswith(prefix)
        and name[len(prefix):].isdecimal()
        and os.path.isdir(os.path.join(test_dir, name))
    )

    # Run on whatever device the model already lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()

    rows = []
    for article_id, dir_name in articles:
        article_dir = os.path.join(test_dir, dir_name)

        with open(os.path.join(article_dir, "file_1.txt"), "r", encoding="utf-8") as f:
            file_1_text = f.read().strip()
        with open(os.path.join(article_dir, "file_2.txt"), "r", encoding="utf-8") as f:
            file_2_text = f.read().strip()

        human_prob_file1 = _human_probability(model, tokenizer, file_1_text, device)
        human_prob_file2 = _human_probability(model, tokenizer, file_2_text, device)

        rows.append(
            {
                "id": article_id,
                "real_text_id": 1 if human_prob_file1 > human_prob_file2 else 2,
                "human_prob_file1": human_prob_file1,
                "human_prob_file2": human_prob_file2,
            }
        )

    # Build the frame and write the CSV once, after all articles are scored.
    test_df = pd.DataFrame(
        rows, columns=["id", "real_text_id", "human_prob_file1", "human_prob_file2"]
    )
    test_df.to_csv(f"submission_{idx}.csv")

def train_model_and_test(idx, model_name):
    """Fine-tune one checkpoint on ``train_df`` and write its test predictions.

    The global ``train_df`` is split 90/10 (seed 42) into train and validation
    sets, tokenised with truncation at 512 tokens, and trained for three epochs
    with evaluation and checkpointing after each epoch. The arguments ask the
    trainer to reload the checkpoint with the best ``eval_loss`` at the end.
    Afterwards ``run_test`` is called with the trained model.

    Args:
        idx: index forwarded to ``run_test`` for the output file name.
        model_name: checkpoint identifier to fine-tune.
    """
    tokenizer, model = prepare_model_and_tokenizer(model_name)

    def preprocess(batch):
        # tokenise a batch of raw texts; padding is applied per mapped batch
        return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

    splits = Dataset.from_pandas(train_df).train_test_split(test_size=0.1, seed=42)
    train_dataset = splits["train"].map(preprocess, batched=True)
    val_dataset = splits["test"].map(preprocess, batched=True)

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./{model_name}-kaggle",
            num_train_epochs=3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            eval_strategy="epoch",
            eval_steps=None,
            save_strategy="epoch",
            logging_dir="./logs",
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            learning_rate=3e-5,
            weight_decay=0.01,
            lr_scheduler_type="cosine",
            report_to="none",
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    run_test(idx, model, tokenizer)

In [18]:
# Fine-tune each checkpoint in turn; the list position becomes the
# index in the "submission_{idx}.csv" file name.
for idx, model_name in enumerate(model_names):
    train_model_and_test(idx, model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Aucroc
1,0.3512,0.550647,0.736842,0.736842,0.875,0.636364,0.772727
2,0.3363,0.716975,0.736842,0.736842,0.875,0.636364,0.875
3,0.3259,0.789641,0.736842,0.736842,0.875,0.636364,0.863636


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Aucroc
1,0.5389,0.510267,0.736842,0.736842,0.875,0.636364,0.772727
2,0.4756,0.844009,0.736842,0.736842,0.875,0.636364,0.829545
3,0.4043,1.208025,0.736842,0.736842,0.875,0.636364,0.909091


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Aucroc
1,0.5376,0.530011,0.736842,0.736842,0.875,0.636364,0.806818
2,0.547,0.556788,0.736842,0.736842,0.875,0.636364,0.875
3,0.4723,0.630077,0.736842,0.736842,0.875,0.636364,0.784091


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Aucroc
1,0.336,0.817163,0.473684,0.615385,0.533333,0.727273,0.636364
2,0.2775,0.821242,0.736842,0.736842,0.875,0.636364,0.795455
3,0.3765,0.914039,0.736842,0.736842,0.875,0.636364,0.772727


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Aucroc
1,0.5073,0.789666,0.684211,0.7,0.777778,0.636364,0.659091
2,0.5088,0.755896,0.789474,0.777778,1.0,0.636364,0.806818
3,0.6974,1.643301,0.736842,0.736842,0.875,0.636364,0.840909


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Aucroc
1,0.4284,0.518875,0.736842,0.736842,0.875,0.636364,0.715909
2,0.3621,1.068425,0.736842,0.736842,0.875,0.636364,0.670455
3,0.4291,1.345046,0.736842,0.736842,0.875,0.636364,0.704545


In [16]:
import pandas as pd
from collections import Counter
import glob

# Per-model prediction files written by run_test, in sorted file-name order.
csv_files = sorted(glob.glob("submission_*.csv"))
if not csv_files:
    raise FileNotFoundError(
        "No submission_*.csv files found; run the training cell first."
    )

dfs = [pd.read_csv(file) for file in csv_files]

# Predictions are combined row by row below, so every file must list the same
# article ids in the same order.
reference_ids = dfs[0]["id"].tolist()
for file, df in zip(csv_files[1:], dfs[1:]):
    if df["id"].tolist() != reference_ids:
        raise ValueError(f"{file} does not list the same ids as {csv_files[0]}")

final_df = pd.DataFrame({"id": dfs[0]["id"]})

def majority_vote(values):
    """Return the most frequent item in ``values``.

    When several items share the highest count, the one encountered first
    wins. Raises ``IndexError`` if ``values`` is empty.
    """
    top = Counter(values).most_common(1)
    winner, _count = top[0]
    return winner


# One column of real_text_id predictions per model, one row per article.
predictions = pd.concat([df["real_text_id"] for df in dfs], axis=1)

voted = predictions.apply(majority_vote, axis=1)

# With an even number of models a vote can split evenly, and majority_vote
# then returns whichever label the first file happened to predict. Break such
# ties with the models' averaged class-0 probabilities (same rule as run_test:
# file 1 wins only if its probability is strictly higher).
tied = (predictions == 1).sum(axis=1) == (predictions == 2).sum(axis=1)
has_probabilities = all(
    {"human_prob_file1", "human_prob_file2"} <= set(df.columns) for df in dfs
)
if tied.any() and has_probabilities:
    mean_prob_file1 = pd.concat([df["human_prob_file1"] for df in dfs], axis=1).mean(axis=1)
    mean_prob_file2 = pd.concat([df["human_prob_file2"] for df in dfs], axis=1).mean(axis=1)
    by_probability = (mean_prob_file1 > mean_prob_file2).map({True: 1, False: 2})
    voted = voted.where(~tied, by_probability)

final_df["real_text_id"] = voted

final_df.to_csv("final_submission.csv", index=False)

print("Voting complete. Saved to final_submission.csv")

Voting complete. Saved to final_submission.csv


In [None]:
# Soft-voting ensemble: average each model's class-0 ("human") probability per
# file and pick the file with the higher mean.
csv_files = sorted(glob.glob("submission_*.csv"))
if not csv_files:
    raise FileNotFoundError(
        "No submission_*.csv files found; run the training cell first."
    )
dfs = [pd.read_csv(file) for file in csv_files]

final_df = pd.DataFrame()
final_df["id"] = dfs[0]["id"]

prob_file1 = pd.concat([df["human_prob_file1"] for df in dfs], axis=1).mean(axis=1)
prob_file2 = pd.concat([df["human_prob_file2"] for df in dfs], axis=1).mean(axis=1)

final_df["human_prob_file1_avg"] = prob_file1
final_df["human_prob_file2_avg"] = prob_file2
# real_text_id is 1 when file 1 has the strictly higher human probability and
# 2 otherwise, matching run_test. The previous expression,
# (prob_file1 > prob_file2).astype(int) + 1, produced the opposite label.
final_df["real_text_id"] = (prob_file1 > prob_file2).map({True: 1, False: 2})

final_df[["id", "real_text_id"]].to_csv("ensemble_submission.csv", index=False)

print("Ensemble complete! Final file saved as ensemble_submission.csv")

# NOTE: this ensemble was previously noted as "worse than voting", but that
# comparison was made with the inverted label mapping; re-evaluate after the fix.

Ensemble complete! Final file saved as ensemble_submission.csv
