In [6]:
from datasets import load_dataset, concatenate_datasets

# Load the `artem9k/ai-text-detection-pile` dataset by its repo id.
data = load_dataset("artem9k/ai-text-detection-pile")

# Print the dataset summary (splits, column names, row counts).
print(data)

DatasetDict({
    train: Dataset({
        features: ['source', 'id', 'text'],
        num_rows: 1392522
    })
})


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Single source of truth for the base checkpoint, so the tokenizer and the
# model are always loaded from the same repo id and cannot drift apart.
BASE_CHECKPOINT = "distilbert/distilroberta-base"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
# num_labels=2: the preprocessing step in this notebook emits labels 0 and 1.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_CHECKPOINT, num_labels=2
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Display the model's module tree to inspect the architecture.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [27]:
# Number of rows from index 1,240,000 to the end of the train split. The same
# count (152522) is used below as the size of the slice taken from the start.
len(data["train"]) - 1240000

152522

In [7]:
def preprocess_fn(batch):
    """Tokenise a batch of texts and attach a binary label to each example.

    Args:
        batch: Mapping with a "source" entry and a "text" entry, each holding
            one item per example (this function is passed to ``Dataset.map``
            with ``batched=True`` in this notebook).

    Returns:
        The tokenizer's output with an added "label" entry: 1 where the
        example's source is exactly "human", 0 for anything else.
    """
    # Labels are derived purely from the `source` column.
    is_human = [int(origin == "human") for origin in batch["source"]]

    # Every example is truncated/padded to a fixed length of 256 tokens.
    encoded = tokenizer(
        batch["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    encoded["label"] = is_human
    return encoded
train_data = data["train"]

# Take two equally sized slices (152,522 rows each): the head of the split and
# everything from row 1,240,000 onward.
# NOTE(review): the variable names imply the head rows are human-written and
# the tail rows AI-generated -- confirm the split is ordered by `source`.
# The actual labels come from the `source` column in preprocess_fn either way.
data_human = train_data.select(range(152522))
data_ai = train_data.select(range(1240000, len(train_data)))

# Join both slices and shuffle them with a fixed seed.
merged_data = concatenate_datasets([data_human, data_ai]).shuffle(seed=42)

# Tokenise and label in batches, then drop the columns that are no longer needed
# (the raw `text` column is kept).
preprocessed_dataset = merged_data.map(preprocess_fn, batched=True).remove_columns(
    ["id", "source"]
)

Map:   0%|          | 0/305044 [00:00<?, ? examples/s]

In [37]:
# Sanity check: total rows after merging the two 152,522-row slices.
len(preprocessed_dataset)

305044

In [13]:
import random

# Inspect one random example. randrange excludes its upper bound, so the index
# is always valid; randint(0, len(ds)) is inclusive on both ends and could
# return len(ds), which raises IndexError.
num = random.randrange(len(preprocessed_dataset))
preprocessed_dataset[num]

{'text': 'Benign T-wave Inversion from HQMedEd on Vimeo . There are many etiologies of T-wave inversion.\xa0 We are most worried about ischemic T-wave inversion.\xa0 Wellens\' syndrome is particularly dangerous, as it signifies an unstable critical LAD stenosis.\xa0 I have several posts on this; here is one that shows the entire evolution . Another etiology is "Benign T-wave Inversion", which has long been recognized. I first saw it described in Chou\'s textbook.\xa0 It is a normal variant associated with early repolarization.\xa0 K. Wang recently studied it. \xa0 He reviewed ECGs from all 11,424 patients who had at least one recorded during 2007 at Hennepin County Medical Center (where I work) and set aside the 101 cases of benign T-wave inversion.\xa0 97 were black.\xa0 3.7% of black men and\xa0 1% of black women had this finding.\xa0 1 of 5099 white patients had it.\xa0 Aside from an 8.8% incidence (9 of 109) black males aged 17-19, it was evenly distributed by age group. I have rev

In [14]:
# Two-stage split of the full preprocessed dataset (no subsampling happens here):
#   1) hold out 20% of all rows as the test set;
#   2) carve 10% of the remaining 80% off as the validation set.
# Net proportions are therefore ~72% train / ~8% val / 20% test (not 80/10/10).
split = preprocessed_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_val = split["train"].train_test_split(test_size=0.1, shuffle=True, seed=42)

train_dataset = train_val["train"]
test_dataset = split["test"]
val_dataset = train_val["test"]

In [10]:
from datasets import DatasetDict

# Bundle the three splits under the names "train", "val" and "test" so they can
# be handled (and published) as one dataset object.
actual_data = DatasetDict(
    {"train": train_dataset, "val": val_dataset, "test": test_dataset}
)

In [11]:
# Publish all three splits to the `optimization-hashira/ai-text-detection-dataset`
# repo (presumably requires an authenticated Hugging Face session -- confirm).
actual_data.push_to_hub(repo_id="optimization-hashira/ai-text-detection-dataset")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  28%|##7       | 66.6MB /  240MB            

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  28%|##7       | 66.7MB /  241MB            

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  28%|##7       | 66.7MB /  240MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|#########9| 79.8MB / 80.2MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  33%|###3      | 66.7MB /  199MB            

CommitInfo(commit_url='https://huggingface.co/datasets/optimization-hashira/ai-text-detection-dataset/commit/953ecda8458d2b2ebc4f5e752f40f311ecffdeee', commit_message='Upload dataset', commit_description='', oid='953ecda8458d2b2ebc4f5e752f40f311ecffdeee', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/optimization-hashira/ai-text-detection-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='optimization-hashira/ai-text-detection-dataset'), pr_revision=None, pr_num=None)

In [24]:
# Persist the full preprocessed dataset (before the train/val/test split) to
# the local ./dataset directory.
preprocessed_dataset.save_to_disk(dataset_path="./dataset")

Saving the dataset (0/4 shards):   0%|          | 0/305044 [00:00<?, ? examples/s]

In [15]:
# Fraction of the preprocessed dataset that landed in each split,
# displayed in the order (train, test, val).
tuple(
    len(part) / len(preprocessed_dataset)
    for part in (train_dataset, test_dataset, val_dataset)
)

(0.7199977708133909, 0.20000065564312033, 0.0800015735434888)

In [1]:
# pred metrics
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(pred):
    """Turn raw model outputs into classification metrics.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, one row per example).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
        Precision, recall and F1 are computed with ``average="binary"``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score in each row.
    y_pred = np.argmax(pred.predictions, axis=1)

    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    # The fourth element (support) is not reported.
    scores["f1"] = prf[2]
    scores["precision"] = prf[0]
    scores["recall"] = prf[1]
    return scores

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the classifier and its tokenizer from the
# `optimization-hashira/roberta-ai-text-detector` repo. This rebinds `model`
# and `tokenizer`, replacing the base-checkpoint objects created earlier in
# the notebook.
model_path = "optimization-hashira/roberta-ai-text-detector"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
from datasets import load_dataset

# Reload the published dataset instead of rebuilding it from the raw pile.
dataset_path = "optimization-hashira/ai-text-detection-dataset"
dataset = load_dataset(dataset_path)

# Unpack the three named splits into the variables the training cells expect.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "val", "test")
)

In [None]:
from transformers import TrainingArguments, Trainer
# Training/evaluation configuration: evaluate and checkpoint once per epoch,
# and at the end restore the checkpoint with the highest validation F1.
training_args = TrainingArguments(
    output_dir="./checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=50,
    report_to="wandb",
    run_name="ai_text_detector"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer (it triggered the warning this cell
    # used to emit); `processing_class=` is the supported replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Training is left disabled; presumably because `model` is loaded from an
# already fine-tuned repo earlier in the notebook -- confirm before re-enabling.
# trainer.train()

  trainer = Trainer(


In [8]:
from huggingface_hub import HfApi

api = HfApi()
# api.create_repo(repo_id="optimization-hashira/roberta-ai-text-detector", private=True)
# api.upload_folder(
#     folder_path=model_path, repo_id="optimization-hashira/roberta-ai-text-detector",commit_message="first commit"
# )

# exist_ok=True keeps this cell re-runnable: without it create_repo raises once
# the dataset repo already exists (e.g. after a previous run or push_to_hub).
api.create_repo(
    repo_id="optimization-hashira/ai-text-detection-dataset",
    repo_type="dataset",
    private=True,
    exist_ok=True,
)
# NOTE(review): `folder_path` must be a local directory, but another cell in
# this notebook binds `dataset_path` to the Hub repo id string -- confirm which
# local folder is meant before running this from a fresh kernel.
api.upload_folder(
    repo_id="optimization-hashira/ai-text-detection-dataset",
    folder_path=dataset_path,
    repo_type="dataset",
    commit_message="first commit",
)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...lit/train/data-00002-of-00003.arrow:   2%|2         | 11.0MB /  468MB            

  ...lit/train/data-00001-of-00003.arrow:   0%|          | 1.82MB /  469MB            

  ...split/val/data-00000-of-00001.arrow:   1%|          |  867kB /  157MB            

  ...lit/train/data-00000-of-00003.arrow:   0%|          |  969kB /  469MB            

  ...plit/test/data-00000-of-00001.arrow:  15%|#4        | 58.3MB /  390MB            

CommitInfo(commit_url='https://huggingface.co/datasets/optimization-hashira/ai-text-detection-dataset/commit/d921c228355189b5a5762afd773ae81adaff8248', commit_message='first commit', commit_description='', oid='d921c228355189b5a5762afd773ae81adaff8248', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/optimization-hashira/ai-text-detection-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='optimization-hashira/ai-text-detection-dataset'), pr_revision=None, pr_num=None)

In [None]:
# Evaluate on the test split; the trainer was configured with `compute_metrics`,
# which supplies accuracy, F1, precision and recall.
results = trainer.evaluate(test_dataset)
print(results)

In [None]:
# Metrics dict in the shape printed by the evaluation cell; presumably pasted
# from a test-set run of `trainer.evaluate` -- confirm before citing the numbers.
{
    "eval_loss": 0.07999948412179947,
    "eval_model_preparation_time": 0.002,
    "eval_accuracy": 0.9896080906095822,
    "eval_f1": 0.989469488090888,
    "eval_precision": 0.9990943548116593,
    "eval_recall": 0.980028295989208,
    "eval_runtime": 399.3875,
    "eval_samples_per_second": 152.756,
    "eval_steps_per_second": 9.55,
}