In [101]:
import pandas as pd
import numpy as np
import torch

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [102]:
train_df = pd.read_csv('./train_v2_drcat_02.csv')
kf_df = pd.read_csv('./kf_df.csv')
num_rows = kf_df.shape[0]
num_rows

59668

In [103]:
kf_df = kf_df.rename(columns={'prompt_title': 'prompt_name'})
kf_df['label'] = 1
kf_df['source'] = 'kf'
kf_df['RDizzl3_seven'] = False

In [104]:
train_df = pd.concat([train_df, kf_df[train_df.columns].sample(30000, random_state=42)])
train_df

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False
...,...,...,...,...,...
29644,The article discusses the concept of domestica...,1,Are Humans More Like Wolves or Dogs?,kf,False
42301,Background noise can make it difficult to hear...,1,I Can’t Hear Myself Think! How the Brain Deals...,kf,False
46584,"Long ago, there were many different species of...",1,What Would the Child of a Human and a Neandert...,kf,False
52305,"Chemotherapy drugs are used to treat cancer, b...",1,Getting to the Bottom of Cancer Treatment Pain,kf,False


In [105]:
train_df["generated"] = train_df["label"].apply(lambda x: 1.0 if x == 1 else 0.0)
train_df["human"] = train_df["label"].apply(lambda x: 1.0 if x == 0 else 0.0)
train_df

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,generated,human
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False,0.0,1.0
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,0.0,1.0
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False,0.0,1.0
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False,0.0,1.0
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False,0.0,1.0
...,...,...,...,...,...,...,...
29644,The article discusses the concept of domestica...,1,Are Humans More Like Wolves or Dogs?,kf,False,1.0,0.0
42301,Background noise can make it difficult to hear...,1,I Can’t Hear Myself Think! How the Brain Deals...,kf,False,1.0,0.0
46584,"Long ago, there were many different species of...",1,What Would the Child of a Human and a Neandert...,kf,False,1.0,0.0
52305,"Chemotherapy drugs are used to treat cancer, b...",1,Getting to the Bottom of Cancer Treatment Pain,kf,False,1.0,0.0


In [106]:
train, test = train_test_split(train_df, test_size=0.30, random_state=42, shuffle=True, stratify=train_df["label"])
train.to_csv("train.csv")
test.to_csv("test.csv")

In [107]:
train.groupby("label").count()

Unnamed: 0_level_0,text,prompt_name,source,RDizzl3_seven,generated,human
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,19159,19159,19159,19159,19159,19159
1,33248,33248,33248,33248,33248,33248


In [108]:
test.groupby("label").count()

Unnamed: 0_level_0,text,prompt_name,source,RDizzl3_seven,generated,human
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8212,8212,8212,8212,8212,8212
1,14249,14249,14249,14249,14249,14249


In [109]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LABELS = ['generated', 'human']
id2label = {idx:label for idx, label in enumerate(LABELS)}
label2id = {label:idx for idx, label in enumerate(LABELS)}

In [110]:
def read_csv_binary(filename):
    data = pd.read_csv(filename)
    texts = data['text'].tolist()
    labels = data[LABELS].values

    return texts, labels

In [111]:
class LLMDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }

        item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        return len(self.labels)

In [112]:
class My_TextClassifier_Model():
    def __init__(self,
                 pretrained_transformer_name='distilbert-base-cased',
                 dataset_dct={'train':'train.csv', 'test':'test.csv'},
                 warmup_steps=100,
                 num_train_epochs=3):
        max_samples = {
            'train': 10000,
            'val': 10000,
            'test': 10000,
        }
        
        train_texts, train_labels = read_csv_binary(dataset_dct['train'])
        
        if 'test' not in dataset_dct:
            train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size=.1)
        else:
            test_texts, test_labels = read_csv_binary(dataset_dct['test'])
            
        train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.1)
        
        train_texts = train_texts[:max_samples['train']]
        val_texts = val_texts[:max_samples['val']]
        test_texts = test_texts[:max_samples['test']]

        train_labels = train_labels[:max_samples['train']]
        val_labels = val_labels[:max_samples['val']]
        test_labels = test_labels[:max_samples['test']]

        self.tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_transformer_name)
        
        train_encodings = self.tokenizer(train_texts, truncation=True, max_length=256, padding=True)
        val_encodings = self.tokenizer(val_texts, truncation=True, max_length=256, padding=True)
        test_encodings = self.tokenizer(test_texts, truncation=True, max_length=256, padding=True)

        self.train_dataset = LLMDDataset(train_encodings, train_labels)
        self.val_dataset = LLMDDataset(val_encodings, val_labels)
        self.test_dataset = LLMDDataset(test_encodings, test_labels)
        
        self.model = DistilBertForSequenceClassification.from_pretrained(pretrained_transformer_name,
                                                                         num_labels=len(LABELS), 
                                                                         id2label=id2label,
                                                                         label2id=label2id)

        self.training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=warmup_steps,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_strategy='epoch',
            evaluation_strategy='epoch',
            save_strategy='epoch',
            save_total_limit = 3
        )

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            compute_metrics=self.compute_metrics
        )
        
    def compute_metrics(self, p: EvalPrediction):
        y_true = p.label_ids
        
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(torch.Tensor(p.predictions))
        y_pred = np.zeros(probs.shape)
        y_pred[np.where(probs >= 0.5)] = 1
        
        f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
        roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
        accuracy = accuracy_score(y_true, y_pred)
        
        result = {'f1': f1_micro_average,
                  'roc_auc': roc_auc,
                  'accuracy': accuracy}
        return result
    
    def inference(self, predict_dataset=None):
        if predict_dataset is None:
            predict_dataset = self.test_dataset
        predictions = self.trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
        predictions = np.argmax(predictions, axis=1)

        return predictions

In [113]:
classification_trainer = My_TextClassifier_Model(
    pretrained_transformer_name='distilbert-base-cased',
    dataset_dct={'train':'train.csv', 'test': 'test.csv'},
    warmup_steps=100,
    num_train_epochs=3)
classification_trainer.trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1179,0.054882,0.98569,0.98569,0.985499
2,0.0233,0.028,0.992749,0.992749,0.992749
3,0.0086,0.026576,0.995134,0.995135,0.995039


TrainOutput(global_step=3750, training_loss=0.049918607330322266, metrics={'train_runtime': 13715.1375, 'train_samples_per_second': 2.187, 'train_steps_per_second': 0.273, 'total_flos': 1987010979840000.0, 'train_loss': 0.049918607330322266, 'epoch': 3.0})

In [114]:
trained_model = classification_trainer.model
#dummy_model_input = {
#    'input_ids': torch.zeros((1, 256), dtype=torch.long),
#    'attention_mask': torch.ones((1, 256), dtype=torch.long)
#}

#torch.onnx.export(model=trained_model,
#                 args=tuple(dummy_model_input.values()),
#                 f="torch-model.onnx",
#                 input_names=['input_ids', 'attention_mask'],
#                 output_names=['logits'],
#                 dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'}, 
#                               'attention_mask': {0: 'batch_size', 1: 'sequence'}, 
#                               'logits': {0: 'batch_size', 1: 'sequence'}})

In [115]:
metrics = classification_trainer.trainer.evaluate()

classification_trainer.trainer.log_metrics("after_train_eval", metrics)
classification_trainer.trainer.save_metrics("after_train_eval", metrics)

metrics

***** after_train_eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =      0.995
  eval_f1                 =     0.9951
  eval_loss               =     0.0266
  eval_roc_auc            =     0.9951
  eval_runtime            = 0:11:18.58
  eval_samples_per_second =      7.723
  eval_steps_per_second   =      0.967


{'eval_loss': 0.026576348309042833,
 'eval_f1': 0.9951340520942659,
 'eval_roc_auc': 0.9951345163136807,
 'eval_accuracy': 0.9950391146727724,
 'eval_runtime': 678.5884,
 'eval_samples_per_second': 7.723,
 'eval_steps_per_second': 0.967,
 'epoch': 3.0}

In [117]:
preds = classification_trainer.inference()
test = test[:10000]
test["pred_label"] = [id2label[x] for x in preds]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["pred_label"] = [id2label[x] for x in preds]


In [118]:
results = test[["text", "label", "pred_label"]]
results

Unnamed: 0,text,label,pred_label
35115,"[Your Name]\n[Your Address]\n[City, State, ZIP...",1,generated
33807,I completely agree with the statement that a ...,1,generated
35980,First impressions are what I learn from many ...,1,generated
9484,Studying Venus is a worthy pursuit despite the...,0,human
36232,"Hey, so like, I know that first impressions ar...",1,generated
...,...,...,...
15393,The Arctic environment is changing rapidly due...,1,generated
18519,The car can be very helpful in some way. If a ...,0,human
33605,Soilless agriculture is a new type of environm...,1,generated
27856,Introduction\n\nCurfew laws have been implemen...,1,generated


In [119]:
results.to_csv('./result.csv', index=False)

In [121]:
trained_tokenizer = classification_trainer.tokenizer

save_directory = "./saved_model/"
trained_model.save_pretrained(save_directory)
trained_tokenizer.save_pretrained(save_directory)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')