In [1]:
%pip install datasets
%pip install huggingface
%pip install evaluate

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl.metadata (2.9 kB)
Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Installing collected packages: huggingface
Successfully installed huggingface-0.0.1
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [5]:
import os
import pandas as pd
import evaluate

from datasets import load_dataset
from transformers import GPT2Tokenizer
from transformers import GPT2ForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np

In [6]:
os.environ["WANDB_DISABLED"] = "true"


In [7]:
# Load dataset
dataset = load_dataset("mteb/tweet_sentiment_extraction")
df = pd.DataFrame(dataset['train'])

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/240k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3432 [00:00<?, ? examples/s]

In [8]:
# Show the dataset
df.head()

Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative


In [9]:
# Tokenize the prompt using the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [10]:
# Replace the pad_token with eos_token
tokenizer.pad_token = tokenizer.eos_token

In [11]:
# Function to tokenize each examples
def tokenize_function(examples):
   return tokenizer(examples["text"], padding="max_length", truncation=True)

In [12]:
# Tokenized dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/26732 [00:00<?, ? examples/s]

Map:   0%|          | 0/3432 [00:00<?, ? examples/s]

In [14]:
# Split dataset into TRAIN and EVAL partition
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))


In [15]:
# We select a small fragment of the data for illustration due to resource limitations

small_train_dataset = small_train_dataset.select(range(100))
small_eval_dataset = small_eval_dataset.select(range(100))

In [16]:
# Load the model
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Define the metric for evaluation
metric = evaluate.load("accuracy")

Downloading builder script: 0.00B [00:00, ?B/s]

In [18]:
def compute_metrics(eval_pred):
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   return metric.compute(predictions=predictions, references=labels)


In [19]:
# Training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    report_to=None,
    num_train_epochs=2
)



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [21]:
# Evaluate before training
print("Before Training:", trainer.evaluate())



Before Training: {'eval_loss': 4.199751377105713, 'eval_model_preparation_time': 0.0025, 'eval_accuracy': 0.33, 'eval_runtime': 503.7365, 'eval_samples_per_second': 0.199, 'eval_steps_per_second': 0.199}


In [22]:
# Train
trainer.train()



Step,Training Loss


TrainOutput(global_step=50, training_loss=1.4407411193847657, metrics={'train_runtime': 3718.7887, 'train_samples_per_second': 0.054, 'train_steps_per_second': 0.013, 'total_flos': 104519643955200.0, 'train_loss': 1.4407411193847657, 'epoch': 2.0})

In [23]:
# Evaluate before training
print("Before Training:", trainer.evaluate())



Before Training: {'eval_loss': 1.1260432004928589, 'eval_model_preparation_time': 0.0025, 'eval_accuracy': 0.31, 'eval_runtime': 467.587, 'eval_samples_per_second': 0.214, 'eval_steps_per_second': 0.214, 'epoch': 2.0}
