In [1]:
import torch

In [42]:
# dataset

from datasets import load_dataset, Dataset
import pandas as pd

DATASET_ID = "emad12/stock_tweets_sentiment"
src_col, tgt_col = "tweet", "sentiment"
max_len = 32  # maximum token length; read by tokenize_function when truncating


def _prepare_split(split):
    """Load one split of DATASET_ID and clean it for classification.

    Keeps only the text and label columns, lower-cases the text, and remaps
    the label -1 to 2 so that every label is a non-negative class id.

    Args:
        split: Name of the split to load, e.g. "train" or "test".

    Returns:
        A ``(raw, cleaned)`` tuple: the split exactly as ``load_dataset``
        returned it, and a ``Dataset`` rebuilt from the cleaned two-column
        frame.
    """
    raw = load_dataset(DATASET_ID, split=split)
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the full table (avoids SettingWithCopyWarning).
    frame = raw.to_pandas()[[src_col, tgt_col]].copy()
    frame[src_col] = frame[src_col].str.lower()
    frame[tgt_col] = frame[tgt_col].replace(-1, 2)
    # preserve_index=False keeps the pandas index from becoming an extra column.
    return raw, Dataset.from_pandas(frame, preserve_index=False)


# Same cleaning for both splits; the raw splits stay available under their
# original names.
train_dataset, train_df = _prepare_split("train")
test_dataset, test_df = _prepare_split("test")

In [53]:
from transformers import AutoTokenizer
# Load the tokenizer published under "distilbert-base-uncased", the same
# checkpoint name the model cell loads its weights from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



In [54]:
# Sanity check: tokenize one short sentence and inspect the returned fields.
tokenizer("I am good", truncation=True, padding=True, max_length=32)

{'input_ids': [101, 1045, 2572, 2204, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [35]:
# Show which columns the training split currently exposes.
train_df.column_names

['tweet', 'sentiment']

In [47]:
def tokenize_function(x):
    """Tokenize a batch (or a single example) of text and attach its labels.

    Args:
        x: Mapping holding the text under ``src_col`` and the class id under
            ``tgt_col``. When called through ``map(..., batched=True)`` both
            values are lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry containing the
        class ids exactly as they were stored under ``tgt_col``.
    """
    # Only truncate here. Padding is left to DataCollatorWithPadding, which
    # pads each training batch to its own longest sequence; padding inside a
    # batched map() would instead pad every row to the longest row of the
    # whole map chunk and store those pad tokens in the dataset.
    model_inp = tokenizer(x[src_col], truncation=True, max_length=max_len)
    # Keep the labels as plain integers. Wrapping them in a 32-bit torch
    # tensor is unnecessary (the collator builds the tensors) and int32 is
    # not the integer type PyTorch's cross-entropy loss expects for targets.
    model_inp["labels"] = x[tgt_col]
    return model_inp

In [48]:
# Tokenize both splits in batches. The original columns are removed so only
# the fields returned by tokenize_function remain.
train_df = train_df.map(
    tokenize_function,
    remove_columns=train_df.column_names,
    batched=True,
)
test_df = test_df.map(
    tokenize_function,
    remove_columns=test_df.column_names,
    batched=True,
)

Map:   0%|          | 0/96000 [00:00<?, ? examples/s]

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

In [66]:
# model
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate
import numpy as np
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Class-id <-> name maps stored in the model config. Id 2 is the value the
# dataset cell substitutes for the original -1 label.
ID2LABEL = {0: "NEUTRAL", 1: "POSITIVE", 2: "NEGATIVE"}
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

# Pretrained DistilBERT with a sequence-classification head sized to the
# number of labels, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)
model = model.to(device)
# Training configuration. Evaluation, checkpointing and logging all happen
# once per epoch, so every saved checkpoint has a matching evaluation result.
train_args = TrainingArguments(
    output_dir="sentiment_classification",
    num_train_epochs=10,
    learning_rate=2E-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # The logged run shows validation loss rising after the second epoch, so
    # the last epoch is not the best one. Reload the checkpoint with the
    # lowest evaluation loss when training finishes instead of keeping the
    # final weights. Requires save_strategy == evaluation_strategy (both
    # "epoch" above).
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
# Accuracy metric object from the `evaluate` library; used by compute_metrics.
eval_comp = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predictions are
            assumed to be a 2-D array of per-class scores (one row per
            example) -- confirm against the model output.

    Returns:
        The result of ``eval_comp.compute`` for the arg-max class ids
        against the reference labels.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class for every example.
    predicted_ids = np.argmax(scores, axis=1)
    return eval_comp.compute(predictions=predicted_ids, references=references)
# Wire the model, training arguments, data and metric function together.
trainer = Trainer(
    model=model,
    args=train_args,
    # Pads each batch at collation time using the tokenizer's pad token.
    data_collator=DataCollatorWithPadding(tokenizer),
    train_dataset=train_df,
    eval_dataset=test_df,
    # Hand the tokenizer to the Trainer so it is saved next to every model
    # checkpoint; a checkpoint directory can then be loaded on its own
    # without naming the base tokenizer separately.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
# Fine-tune the model. Per train_args, evaluation, checkpoint saving and
# logging each run once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5452,0.454602,0.815625
2,0.3972,0.431669,0.826958
3,0.3096,0.511717,0.818167
4,0.2341,0.556254,0.82225


KeyboardInterrupt: 

In [83]:
from transformers import pipeline
# Build an inference pipeline from the saved checkpoint directory
# "checkpoint-12000"; the base tokenizer is named explicitly alongside it.
classifier = pipeline(
    "sentiment-analysis",
    model='sentiment_classification/checkpoint-12000',
    tokenizer="distilbert-base-uncased",
)
# Try the classifier on one hedged sentence.
classifier("it is good but we have to be carefull")

[{'label': 'POSITIVE', 'score': 0.7584418654441833}]