In [36]:
import pandas as pd
import numpy as np

# Fix NumPy's global RNG so NumPy-based randomness is repeatable across runs.
SEED = 42
np.random.seed(SEED)

In [37]:
# Read the three input tables from the datasets directory:
#   train_df      - essays with a `label` column used for training
#   test_df       - essays to be scored
#   submission_df - the sample submission file, filled in at the very end
train_df = pd.read_csv("../datasets/train_v2_drcat_02.csv")
test_df = pd.read_csv('../datasets/test_essays.csv')
submission_df = pd.read_csv('../datasets/sample_submission.csv')

In [38]:
# Keep only the essay text and its label; the metadata columns are not used
# further down. errors='ignore' makes the cell safe to re-run: because it
# overwrites `train_df`, a second execution would otherwise raise KeyError
# since the columns are already gone.
train_df = train_df.drop(
    columns=['prompt_name', 'source', 'RDizzl3_seven'],
    errors='ignore',
)
train_df

Unnamed: 0,text,label
0,Phones\n\nModern humans today are always on th...,0
1,This essay will explain if drivers should or s...,0
2,Driving while the use of cellular devices\n\nT...,0
3,Phones & Driving\n\nDrivers should not be able...,0
4,Cell Phone Operation While Driving\n\nThe abil...,0
...,...,...
43479,There has been a fuss about the Elector Colleg...,0
43480,Limiting car usage has many advantages. Such a...,0
43481,There's a new trend that has been developing f...,0
43482,As we all know cars are a big part of our soci...,0


In [14]:
# `prompt_id` is not used further down, so drop it. errors='ignore' keeps the
# cell re-runnable: it overwrites `test_df`, so without it a second execution
# raises KeyError because the column no longer exists.
test_df = test_df.drop(columns='prompt_id', errors='ignore')
test_df

Unnamed: 0,id,text
0,0000aaaa,Aaa bbb ccc.
1,1111bbbb,Bbb ccc ddd.
2,2222cccc,CCC ddd eee.


# Step 1. Text Preprocessing

In [10]:
# Expand the binary `label` column into two float target columns, one per
# entry of LABELS: label == 1 -> generated, label == 0 -> human.
# Vectorized comparisons replace the per-row `.apply(lambda ...)` and yield
# the same 1.0 / 0.0 float64 values.
train_df["generated"] = (train_df["label"] == 1).astype(float)
train_df["human"] = (train_df["label"] == 0).astype(float)

# Step 2. Modeling

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation. Stratifying on `label` keeps the
# class proportions the same in both parts; the fixed random_state makes the
# split repeatable.
train, val = train_test_split(
    train_df,
    test_size=0.30,
    random_state=42,
    shuffle=True,
    stratify=train_df["label"],
)

In [14]:
# Persist each split next to the notebook; the trainer class reads these
# files back by name.
for split_frame, split_path in (
    (train, "train.csv"),
    (val, "val.csv"),
    (test_df, "test.csv"),
):
    split_frame.to_csv(split_path)

# Model Training

In [15]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification
from transformers import PreTrainedTokenizerFast
from transformers import Trainer, TrainingArguments
from datasets import load_metric
from transformers import EvalPrediction
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
)

# Target column names. Their order fixes the column order of the label matrix
# and the index <-> name maps handed to the model.
LABELS = ['generated', 'human']
# Two-way lookup tables between class index and class name.
id2label = dict(enumerate(LABELS))
label2id = {name: position for position, name in id2label.items()}

def read_csv_with_labels(filename):
    """Load a labelled CSV and split it into texts and targets.

    Args:
        filename: Path of a CSV that has a ``text`` column plus one column
            for every entry of ``LABELS``.

    Returns:
        tuple: ``(texts, labels)`` where ``texts`` is the ``text`` column as
        a list and ``labels`` is the ``.values`` array of the ``LABELS``
        columns, row-aligned with ``texts``.
    """
    frame = pd.read_csv(filename)
    return frame['text'].tolist(), frame[LABELS].values

def read_csv_without_labels(filename):
    """Load a CSV and return its ``text`` column as a plain list.

    Args:
        filename: Path of a CSV that has a ``text`` column.

    Returns:
        list: The values of the ``text`` column, in file order.
    """
    return pd.read_csv(filename)['text'].tolist()

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and accuracy from multi-label logits.

    Args:
        predictions: Raw model logits of shape (batch_size, num_labels).
        labels: Ground-truth 0/1 indicator array with the same shape.
        threshold: Probability at or above which a label counts as predicted
            when computing F1 and accuracy.

    Returns:
        dict: ``{'f1': ..., 'roc_auc': ..., 'accuracy': ...}``.
    """
    # Sigmoid maps every logit to an independent per-label probability; move
    # to NumPy right away so the rest of the function uses a single array type.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # Hard 0/1 decisions, needed by the threshold-based metrics only.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric, so it is fed the continuous probabilities.
    # Scoring the thresholded 0/1 values (as before) collapses the curve to a
    # single operating point and misreports the model's ranking quality.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def create_tokenizer(texts=None):
    """Train a byte-level BPE tokenizer and wrap it as a fast HF tokenizer.

    Args:
        texts: Optional iterable of strings to train the vocabulary on. When
            omitted, the ``text`` column of the module-level ``test_df`` is
            used, which is the corpus the function trained on before this
            parameter existed.

    Returns:
        PreTrainedTokenizerFast: The trained tokenizer with the
        ``[UNK]``/``[PAD]``/``[CLS]``/``[SEP]``/``[MASK]`` special tokens
        registered.
    """
    if texts is None:
        # A flat list of strings, rather than the previous 2-D object array
        # whose rows were one-element arrays.
        texts = test_df['text'].tolist()

    raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    raw_tokenizer.normalizer = normalizers.NFC()
    raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

    VOCAB_SIZE = 30522
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

    raw_tokenizer.train_from_iterator(texts, trainer=trainer)

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    # Previously the function ended here without a return statement, so every
    # caller received None instead of the tokenizer it had just trained.
    return tokenizer

class LLMDDataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional per-row labels.

    Args:
        encodings: Mapping from field name (must include ``"input_ids"``) to
            a per-row indexable sequence, e.g. the output of a tokenizer call.
        labels: Optional per-row indexable sequence of targets. When omitted,
            items carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of tensors, one per encoding field."""
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }

        # `labels` defaults to None; indexing it unconditionally raised
        # TypeError for unlabelled data, so only attach it when present.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        """Number of rows, taken from the ``input_ids`` field."""
        return len(self.encodings["input_ids"])
    
class LLMDTestDataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings that carries no labels.

    Args:
        encodings: Mapping from field name (must include ``"input_ids"``) to
            a per-row indexable sequence.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Number of rows, taken from the ``input_ids`` field."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of tensors, one per encoding field."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample


class ClassificationTrainer():
    """Wire together tokenizer, datasets, model and a Hugging Face ``Trainer``.

    Constructing an instance reads the train/val/test CSV files, tokenizes
    them, loads the pretrained sequence-classification model and builds
    ``self.trainer``. Call ``self.trainer.train()`` to fit the model and
    ``inference()`` to predict on the test split.
    """

    def __init__(self,
                 pretrained_transformer_name='distilbert-base-cased',
                 dataset_dct=None,
                 warmup_steps=500,
                 num_train_epochs=3,
                 max_length=256,
                 batch_size=64,
                 tokenizer_dir="/tokenizer"):
        """Load data, tokenizer and model, and configure the trainer.

        Args:
            pretrained_transformer_name: Hub name or local path passed to both
                the tokenizer and the model ``from_pretrained`` calls.
            dataset_dct: Mapping with ``'train'``, ``'val'`` and ``'test'``
                keys pointing at CSV files. Defaults to ``train.csv``,
                ``val.csv`` and ``test.csv`` in the working directory.
            warmup_steps: Warm-up steps for the learning-rate scheduler.
            num_train_epochs: Total number of training epochs.
            max_length: Token length that texts are truncated to.
            batch_size: Per-device batch size for both training and evaluation.
            tokenizer_dir: Directory the tokenizer is saved into.
                NOTE(review): the default is an absolute path at the
                filesystem root, unlike the relative ``./results`` and
                ``./logs`` used below; pass a relative directory if the root
                is not writable.
        """
        # Build the default mapping per call instead of sharing one mutable
        # dict object across every instance through the signature.
        if dataset_dct is None:
            dataset_dct = {'train': 'train.csv', 'val': 'val.csv', 'test': 'test.csv'}
        self.max_length = max_length

        train_texts, train_labels = read_csv_with_labels(dataset_dct['train'])
        val_texts, val_labels = read_csv_with_labels(dataset_dct['val'])
        test_texts = read_csv_without_labels(dataset_dct['test'])

        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_transformer_name)
        self.tokenizer.save_pretrained(tokenizer_dir)

        self.train_dataset = LLMDDataset(self._encode(train_texts), train_labels)
        self.val_dataset = LLMDDataset(self._encode(val_texts), val_labels)
        self.test_dataset = LLMDTestDataset(self._encode(test_texts))

        self.model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_transformer_name,
            num_labels=len(LABELS),
            problem_type="multi_label_classification",
            id2label=id2label,
            label2id=label2id,
        )

        # NOTE(review): these metric objects are not used by compute_metrics
        # below (it relies on multi_label_metrics); kept so the attribute
        # remains available.
        self.metric = {metric: load_metric(metric) for metric in ['f1', 'precision', 'recall', 'accuracy']}

        self.training_args = TrainingArguments(
            output_dir='./results',  # output directory
            num_train_epochs=num_train_epochs,  # total number of training epochs
            per_device_train_batch_size=batch_size,  # batch size per device during training
            per_device_eval_batch_size=batch_size,  # batch size for evaluation
            warmup_steps=warmup_steps,  # number of warmup steps for learning rate scheduler
            weight_decay=0.01,  # strength of weight decay
            logging_dir='./logs',  # directory for storing logs
            logging_strategy='epoch',
            evaluation_strategy='epoch',
            save_strategy='epoch',
            save_total_limit=3,
        )

        self.trainer = Trainer(
            model=self.model,  # the instantiated 🤗 Transformers model to be trained
            args=self.training_args,  # training arguments, defined above
            train_dataset=self.train_dataset,  # training dataset
            eval_dataset=self.val_dataset,  # evaluation dataset
            compute_metrics=self.compute_metrics,
        )

    def _encode(self, texts):
        """Tokenize ``texts`` with truncation to ``self.max_length`` and padding enabled."""
        return self.tokenizer(texts, truncation=True, max_length=self.max_length, padding=True)

    def compute_metrics(self, p: EvalPrediction):
        """Metric callback for the ``Trainer``.

        Unwraps the first element when the predictions arrive as a tuple,
        then delegates to ``multi_label_metrics``.

        Returns:
            dict: The metrics produced by ``multi_label_metrics``.
        """
        predictions = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        result = multi_label_metrics(
            predictions=predictions,
            labels=p.label_ids
        )
        return result

    def inference(self):
        """Predict a class index for every row of the test dataset.

        Returns:
            numpy.ndarray: Argmax over the label axis of the model outputs.
            Indices follow ``LABELS``, so 0 means ``'generated'`` and 1 means
            ``'human'``.
        """
        predictions = self.trainer.predict(self.test_dataset, metric_key_prefix="predict").predictions
        predictions = np.argmax(predictions, axis=1)

        return predictions

In [None]:
# Settings for this run, gathered in one place so they are easy to tweak.
trainer_config = dict(
    pretrained_transformer_name='cointegrated/rubert-tiny2',
    dataset_dct={'train': 'train.csv', 'val': 'val.csv', 'test': 'test.csv'},
    warmup_steps=100,
    num_train_epochs=3,
)

# Building the object loads the data, tokenizer and model; training starts
# only with the explicit call on the wrapped Hugging Face trainer.
classification_trainer = ClassificationTrainer(**trainer_config)
classification_trainer.trainer.train()

Gasoline stored in the fuel tank of a vehicle can escape from the vehicle and pollute the environment, even when the vehicle is not running. This occurs because gasoline is volatile and can change from liquid to gas, which can pass into the air. Evaporated gasoline escaping from fuel tanks is a significant source of environmental pollution with volatile organic compounds (VOCs), which can harm the environment and human health. To prevent this leakage of gasoline, modern vehicles are equipped with a canister packed with particles of activated carbon, which captures the gasoline molecules in a maze of carbon molecules. Activated carbon is a charcoal material widely used for the purification of drinking water and natural gas. The adsorption of evaporated gasoline on activated carbon can be compared to the Labyrinth of the Minotaur. The labyrinth passages must be cleaned so that they can adsorb new VOCs the next day. The vehicle’s engine acts as the Minotaur, by feeding on the VOCs. The ne

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ff

Epoch,Training Loss,Validation Loss


In [None]:
# inference() returns one argmax class index per essay. Indices follow LABELS,
# where index 0 is 'generated' and index 1 is 'human'.
predictions = classification_trainer.inference()

# Writing the raw indices into the 'generated' column marked essays predicted
# as *human* (index 1) with 1, i.e. the submission was inverted. Compare
# against the 'generated' index so that 1.0 really means "generated".
submission_df['generated'] = (predictions == label2id['generated']).astype(float)
submission_df.to_csv('../submission.csv', index=False)