In [1]:
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
import torch
import os
import re
import warnings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
if torch.cuda.is_available():
    device = "cuda"
    current_device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(current_device))
else:
    print("CPU")

NVIDIA GeForce RTX 3060


In [4]:
dataset = load_from_disk("datasets/imdb/")

In [32]:
# first_train = dataset["train"][:15_000]
# first_test = dataset["test"][:3_000]

In [6]:
first_test.keys()

dict_keys(['label', 'title', 'content'])

In [None]:
"distilbert/distilbert-base-uncased"

In [5]:
model_path = "xlnet/xlnet-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet/xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
class DataSet(torch.utils.data.Dataset):
    def __init__(self, data_dict:dict):
        super(DataSet, self).__init__()
        self.text = data_dict["content"]
        self.label = data_dict["label"]
    
    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        encoded_items = tokenizer(self.text[idx],
                                  truncation=True,
                                  max_length=512,
                                  padding="max_length",
                                  return_tensors="pt")
        return {
            "input_ids": encoded_items["input_ids"].flatten(),
            "attention_mask": encoded_items["attention_mask"].flatten(),
            "label": torch.tensor([self.label[idx]])
        }

def get_dataset(data_dict):
    return Dataset(data_dict)

In [6]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    y_true = labels
    y_pred = np.argmax(predictions, axis=1)
    scores = predictions[:,1]

    accuracy_score_ = accuracy_score(y_true, y_pred)
    
    precision_score_0 = precision_score(y_true, y_pred, pos_label=False)
    precision_score_1 = precision_score(y_true, y_pred, pos_label=True)
    
    recall_score_0 = recall_score(y_true, y_pred, pos_label=False)
    recall_score_1 = recall_score(y_true, y_pred, pos_label=True)

    f1_score_0 = f1_score(y_true, y_pred, pos_label=False)
    f1_score_1 = f1_score(y_true, y_pred, pos_label=True)

    roc_auc = roc_auc_score(y_true, scores)

    result = {
        "Accuracy": accuracy_score_,
        "Precision-0": precision_score_0,
        "Recall-0": recall_score_0,
        "F1-0": f1_score_0,
        "Precision-1": precision_score_1,
        "Recall-1": recall_score_1,
        "F1-1": f1_score_1, 
        "ROC-AUC": roc_auc
    }
    
    return {k: round(v*100, 4) for k, v in result.items()}

In [8]:
# train_dataset = DataSet(first_train)
# test_dataset = DataSet(first_test)

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:18<00:00, 1318.32 examples/s]


In [9]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length", max_length=512)

In [10]:
training_arguments = TrainingArguments(
    output_dir="tmp_models/impb_xlnet_finetuned",
    logging_dir="tmp_models/impb_xlnet_finetuned",
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    logging_strategy="steps",
    logging_steps=1000,
    num_train_epochs=3,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=.01,
    dataloader_num_workers=8,
    save_total_limit=6, 
    lr_scheduler_type="linear",
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"], 
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss


In [46]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy,Precision-0,Recall-0,F1-0,Precision-1,Recall-1,F1-1,Roc-auc
1000,0.5831,0.550936,79.788,77.4291,84.088,80.6213,82.5908,75.488,78.8798,87.5164
2000,0.4713,0.420096,82.08,83.1131,80.52,81.796,81.1094,83.64,82.3553,89.7797
3000,0.4008,0.436713,84.22,87.6375,79.68,83.4695,81.3715,88.76,84.9053,90.0915
4000,0.3496,0.491053,85.504,84.3127,87.24,85.7514,86.781,83.768,85.2479,92.5397
5000,0.3229,0.426324,85.344,82.3095,90.04,86.0014,89.0076,80.648,84.6218,93.1098
6000,0.337,0.412179,86.236,84.3534,88.976,86.6031,88.3369,83.496,85.8482,93.6721
7000,0.2749,0.437403,86.452,87.1625,85.496,86.3212,85.7681,87.408,86.5803,93.7091
8000,0.2779,0.474495,86.664,86.1892,87.32,86.7509,87.1514,86.008,86.5759,93.7792
9000,0.2691,0.436916,86.62,86.0973,87.344,86.7162,87.158,85.896,86.5224,93.9723


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=9375, training_loss=0.3617043375651042, metrics={'train_runtime': 3955.5112, 'train_samples_per_second': 18.961, 'train_steps_per_second': 2.37, 'total_flos': 9364642001374848.0, 'train_loss': 0.3617043375651042, 'epoch': 3.0})

In [43]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy,Precision-0,Recall-0,F1-0,Precision-1,Recall-1,F1-1,Roc-auc
1000,0.4975,0.539568,79.624,84.4529,72.616,78.0884,75.9823,86.632,80.9584,85.8815
2000,0.6776,0.694017,50.0,50.0,100.0,66.6667,0.0,0.0,0.0,85.3064
3000,0.6942,0.69325,50.0,0.0,0.0,0.0,50.0,100.0,66.6667,85.3877
4000,0.6935,0.693728,50.0,50.0,100.0,66.6667,0.0,0.0,0.0,84.7431
5000,0.6935,0.693177,50.0,50.0,100.0,66.6667,0.0,0.0,0.0,85.5557
6000,0.6932,0.693641,50.0,0.0,0.0,0.0,50.0,100.0,66.6667,85.7186
7000,0.6914,0.69342,50.0,0.0,0.0,0.0,50.0,100.0,66.6667,55.2018
8000,0.694,0.693238,50.0,0.0,0.0,0.0,50.0,100.0,66.6667,75.8352
9000,0.6933,0.693158,50.0,0.0,0.0,0.0,50.0,100.0,66.6667,23.4117


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=9375, training_loss=0.670715439453125, metrics={'train_runtime': 3964.7303, 'train_samples_per_second': 18.917, 'train_steps_per_second': 2.365, 'total_flos': 9364642001374848.0, 'train_loss': 0.670715439453125, 'epoch': 3.0})

In [13]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision-0,Recall-0,F1-0,Precision-1,Recall-1,F1-1,Roc-auc
1000,0.6185,0.6926,52.1,0.0,0.0,0.0,52.1,100.0,68.5076,63.9931
2000,0.7177,0.710614,52.1,0.0,0.0,0.0,52.1,100.0,68.5076,35.5469
3000,0.7114,0.693444,47.9,47.9,100.0,64.7735,0.0,0.0,0.0,47.4306
4000,0.7111,0.706112,47.9,47.9,100.0,64.7735,0.0,0.0,0.0,34.3804
5000,0.7033,0.695088,47.9,47.9,100.0,64.7735,0.0,0.0,0.0,69.9747


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=5625, training_loss=0.6929692654079861, metrics={'train_runtime': 6261.5589, 'train_samples_per_second': 7.187, 'train_steps_per_second': 0.898, 'total_flos': 1.281961257984e+16, 'train_loss': 0.6929692654079861, 'epoch': 3.0})

In [17]:
trainer.train()

Step,Training Loss,Validation Loss


<transformers.trainer_utils.EvalPrediction object at 0x7f35de1d5000>
[[0.05322411 0.3455544 ]
 [0.05331015 0.34534007]
 [0.05341507 0.34531856]
 ...
 [0.05275582 0.34620592]
 [0.05296462 0.34543565]
 [0.05318213 0.34570983]] [1 1 0 ... 0 1 1]


TypeError: argument 'ids': 'float' object cannot be interpreted as an integer