In [29]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import evaluate
import torch
import numpy as np

In [32]:
torch.cuda.is_available()

True

In [31]:
torch.cuda.empty_cache()

In [4]:
dataset = load_dataset("mteb/tweet_sentiment_extraction")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})

In [6]:
df = pd.DataFrame(dataset['train'])
df.head()

Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative


In [7]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████████████████████████████████████████████████████████| 27481/27481 [00:11<00:00, 2482.37 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 3534/3534 [00:01<00:00, 2327.47 examples/s]


In [19]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 3534
    })
})


In [14]:
model = GPT2ForSequenceClassification.from_pretrained("gpt2",num_labels=3)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
tweets = [
    "I love this movie! It's fantastic.",
    "This is the worst experience I've ever had.",
    "It's okay, not the best but not the worst."
]

In [17]:
def classify_sentiment(tweets):
    inputs = tokenizer(tweets, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return predictions

# Classify the sentiment of the tweets
sentiment_labels = classify_sentiment(tweets)

# Convert the numeric labels to human-readable form
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
sentiments = [label_map[label.item()] for label in sentiment_labels]

for tweet, sentiment in zip(tweets, sentiments):
    print(f"Tweet: {tweet} => Sentiment: {sentiment}")

Tweet: I love this movie! It's fantastic. => Sentiment: Positive
Tweet: This is the worst experience I've ever had. => Sentiment: Positive
Tweet: It's okay, not the best but not the worst. => Sentiment: Positive


In [30]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [33]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    save_strategy='epoch',
    load_best_model_at_end=True
)

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,0.572,0.990268,0.788342
1,0.5224,0.975052,0.783531
2,0.5253,1.09305,0.795416


TrainOutput(global_step=20610, training_loss=0.5262602464138217, metrics={'train_runtime': 15301.0164, 'train_samples_per_second': 5.388, 'train_steps_per_second': 1.347, 'total_flos': 4.308299723833344e+16, 'train_loss': 0.5262602464138217, 'epoch': 2.999890833666897})

In [35]:
trainer.evaluate()

{'eval_loss': 0.9750524759292603,
 'eval_accuracy': 0.7835314091680815,
 'eval_runtime': 210.6233,
 'eval_samples_per_second': 16.779,
 'eval_steps_per_second': 16.779,
 'epoch': 2.999890833666897}

In [37]:
model.save_pretrained("./model1")
tokenizer.save_pretrained("./model1")

('./model1\\tokenizer_config.json',
 './model1\\special_tokens_map.json',
 './model1\\vocab.json',
 './model1\\merges.txt',
 './model1\\added_tokens.json')

In [38]:
loaded_tokenizer = GPT2Tokenizer.from_pretrained("./model1")
loaded_model = GPT2ForSequenceClassification.from_pretrained("./model1")

In [40]:
def classify_sentiment(tweets):
    inputs = loaded_tokenizer(tweets, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return predictions

# Classify the sentiment of the tweets
sentiment_labels = classify_sentiment(tweets)

# Convert the numeric labels to human-readable form
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
sentiments = [label_map[label.item()] for label in sentiment_labels]

for tweet, sentiment in zip(tweets, sentiments):
    print(f"Tweet: {tweet} => Sentiment: {sentiment}")

Tweet: I love this movie! It's fantastic. => Sentiment: Positive
Tweet: This is the worst experience I've ever had. => Sentiment: Negative
Tweet: It's okay, not the best but not the worst. => Sentiment: Neutral
