In [None]:
!pip install datasets
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [None]:
# Sentiment name -> integer class id used as the training target.
# NOTE(review): this order is assumed to match the id2label mapping shipped
# with the ProsusAI/finbert checkpoint — confirm via finbert.config.id2label.
LABEL2ID = {'positive': 0, 'negative': 1, 'neutral': 2}

# Load the raw CSV and rename its columns to the names used downstream
# ('text' for the sentence, 'label' for the target).
dataset = pd.read_csv('data.csv').rename(columns={"Sentence": "text", "Sentiment": "label"})

# Normalise whitespace/case before mapping so 'Neutral' and 'neutral' are
# treated alike, then fail loudly on anything that is not a known sentiment
# instead of leaving stray strings in the target column.
label_ids = dataset['label'].str.strip().str.lower().map(LABEL2ID)
unknown_labels = dataset.loc[label_ids.isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected sentiment labels in data.csv: {list(unknown_labels)}")

# Explicit integer dtype: the model's loss expects integer class ids.
dataset['label'] = label_ids.astype('int64')
print(dataset)

                                                   text  label
0     The GeoSolutions technology will leverage Bene...      0
1     $ESI on lows, down $1.50 to $2.50 BK a real po...      1
2     For the last quarter of 2010 , Componenta 's n...      0
3     According to the Finnish-Russian Chamber of Co...      2
4     The Swedish buyout firm has sold its remaining...      2
...                                                 ...    ...
5837  RISING costs have forced packaging producer Hu...      1
5838  Nordic Walking was first used as a summer trai...      2
5839  According shipping company Viking Line , the E...      2
5840  In the building and home improvement trade , s...      2
5841  HELSINKI AFX - KCI Konecranes said it has won ...      0

[5842 rows x 2 columns]


In [None]:
# Encode capitalised spellings of the sentiment names with the same ids as the
# lowercase ones.
# NOTE(review): the frame printed by the previous cell already shows integer
# labels, so this looks like a no-op on this data — confirm and drop if so.
capitalized_label_ids = {'Neutral': 2, 'Positive': 0, 'Negative': 1}
dataset['label'] = dataset['label'].replace(capitalized_label_ids)
dataset

Unnamed: 0,text,label
0,The GeoSolutions technology will leverage Bene...,0
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",1
2,"For the last quarter of 2010 , Componenta 's n...",0
3,According to the Finnish-Russian Chamber of Co...,2
4,The Swedish buyout firm has sold its remaining...,2
...,...,...
5837,RISING costs have forced packaging producer Hu...,1
5838,Nordic Walking was first used as a summer trai...,2
5839,"According shipping company Viking Line , the E...",2
5840,"In the building and home improvement trade , s...",2


In [None]:
# (rows, columns) of the frame before de-duplication.
dataset.shape

(5842, 2)

In [None]:
# Per-column count of missing values.
dataset.isna().sum()

text     0
label    0
dtype: int64

In [None]:
# Number of rows that repeat an earlier row in every column (text AND label).
dataset.duplicated().sum()

6

In [None]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
# The frame is modified in place and the original index values are kept, so
# the index is no longer contiguous after this cell.
dataset.drop_duplicates(inplace=True)

In [None]:
# Two-stage stratified split: first hold out 10% of all rows as the test set,
# then hold out 10% of what remains as the validation set. Both stages
# stratify on the label and use the same fixed seed.
HOLDOUT_FRACTION = 0.1
SPLIT_SEED = 42

df_trainval, df_test = train_test_split(
    dataset,
    test_size=HOLDOUT_FRACTION,
    stratify=dataset['label'],
    random_state=SPLIT_SEED,
)
df_train, df_val = train_test_split(
    df_trainval,
    test_size=HOLDOUT_FRACTION,
    stratify=df_trainval['label'],
    random_state=SPLIT_SEED,
)
print(df_train.shape, df_test.shape, df_val.shape)

(4726, 2) (584, 2) (526, 2)


In [None]:
# Load the pretrained checkpoint with a 3-class classification head, plus the
# tokenizer that belongs to the same checkpoint.
BASE_CHECKPOINT = 'ProsusAI/finbert'

finbert = AutoModelForSequenceClassification.from_pretrained(BASE_CHECKPOINT, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def _tokenize_batch(batch):
    """Tokenize the 'text' column of a batch, truncated/padded to 128 tokens."""
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)


def _to_torch_dataset(frame):
    """Turn a pandas split into a tokenized Dataset that yields torch tensors."""
    tokenized = Dataset.from_pandas(frame).map(_tokenize_batch, batched=True)
    # Expose only the model inputs and the target, as torch tensors.
    tokenized.set_format(
        type='torch',
        columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    )
    return tokenized


# Same preprocessing for every split, in train / val / test order.
dataset_train = _to_torch_dataset(df_train)
dataset_val = _to_torch_dataset(df_val)
dataset_test = _to_torch_dataset(df_test)

Map:   0%|          | 0/4726 [00:00<?, ? examples/s]

Map:   0%|          | 0/526 [00:00<?, ? examples/s]

Map:   0%|          | 0/584 [00:00<?, ? examples/s]

In [None]:
!pip install accelerate



In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores with classes along axis 1; if it is a tuple of
            several outputs, the first element is used as the scores.
            ``labels`` holds the integer class id of each row.

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is the fraction of rows
        whose highest-scoring class equals the label, as a plain ``float``.
    """
    predictions, labels = eval_pred
    # Some models hand back (logits, extra_outputs...); only the logits matter.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=1)
    # Fraction of exact matches; computed directly so there is no
    # (y_true, y_pred) argument order to get wrong.
    accuracy = float(np.mean(predicted_ids == np.asarray(labels)))
    return {'accuracy': accuracy}

import inspect

# The per-epoch evaluation switch is spelled `eval_strategy` in recent
# transformers releases and `evaluation_strategy` in older ones. Pick whichever
# keyword the installed TrainingArguments accepts so the cell runs on both.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

args = TrainingArguments(
        output_dir = 'temp/',
        save_strategy = 'epoch',             # must match the evaluation strategy for load_best_model_at_end
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,         # restore the best checkpoint after training
        metric_for_best_model='accuracy',    # key returned by compute_metrics
        **{_eval_strategy_kwarg: 'epoch'},   # evaluate on the validation set every epoch
)

trainer = Trainer(
        model=finbert,                         # the instantiated 🤗 Transformers model to be trained
        args=args,                  # training arguments, defined above
        train_dataset=dataset_train,         # training dataset
        eval_dataset=dataset_val,            # evaluation dataset
        compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.4674,0.468432,0.809886
2,0.3046,0.445821,0.8327
3,0.2254,0.480422,0.809886


TrainOutput(global_step=1773, training_loss=0.31049846298467354, metrics={'train_runtime': 391.5345, 'train_samples_per_second': 36.211, 'train_steps_per_second': 4.528, 'total_flos': 932605509136896.0, 'train_loss': 0.31049846298467354, 'epoch': 3.0})

In [None]:
# Put the model in inference mode, then score the held-out test split with
# the trainer and show the resulting metrics dictionary.
finbert.eval()
test_predictions = trainer.predict(dataset_test)
test_predictions.metrics

{'test_loss': 0.41670456528663635,
 'test_accuracy': 0.8236301369863014,
 'test_runtime': 4.0049,
 'test_samples_per_second': 145.821,
 'test_steps_per_second': 9.239}

In [None]:
# Persist the fine-tuned weights and config.
trainer.save_model('finbert-fine-tuned')
# The Trainer was built without a tokenizer, so save it explicitly; this makes
# the output directory loadable on its own without going back to the Hub.
tokenizer.save_pretrained('finbert-fine-tuned')

In [None]:
# Reload the fine-tuned weights from disk and wrap them in a text
# classification pipeline, on the GPU when one is available.
tuned_finbert = AutoModelForSequenceClassification.from_pretrained('finbert-fine-tuned', num_labels=3)
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
nlp = pipeline(
    'sentiment-analysis',
    model=tuned_finbert,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Score all sentences in batches rather than one call per row, and truncate
# to the same 128-token limit used when tokenizing the training data so long
# inputs cannot overflow the model's maximum sequence length.
# Predictions are produced for every row, including rows the model was trained
# on, so score only held-out rows when judging model quality.
raw_predictions = nlp(
    dataset['text'].tolist(),
    batch_size=32,
    truncation=True,
    max_length=128,
)

# One {'label': ..., 'score': ...} dict per row, aligned to the frame's index.
results = pd.Series(raw_predictions, index=dataset.index)
dataset['pred_Sentiment'] = results.map(lambda prediction: prediction['label'].lower())

In [None]:
# Translate the integer class ids back to sentiment names so the ground truth
# is in the same vocabulary as the pred_Sentiment column.
id_to_sentiment = {0: "positive", 1: "negative", 2: "neutral"}
dataset['label'] = dataset['label'].replace(id_to_sentiment)
dataset

Unnamed: 0,text,label,pred_Sentiment
0,The GeoSolutions technology will leverage Bene...,positive,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,negative
2,"For the last quarter of 2010 , Componenta 's n...",positive,positive
3,According to the Finnish-Russian Chamber of Co...,neutral,neutral
4,The Swedish buyout firm has sold its remaining...,neutral,neutral
...,...,...,...
5837,RISING costs have forced packaging producer Hu...,negative,negative
5838,Nordic Walking was first used as a summer trai...,neutral,neutral
5839,"According shipping company Viking Line , the E...",neutral,neutral
5840,"In the building and home improvement trade , s...",neutral,neutral


In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

# Score only the held-out test rows. The full frame also contains the rows the
# model was trained and validated on, which would inflate the numbers.
# df_test keeps the original index values, so they select the matching rows.
held_out = dataset.loc[df_test.index]

# Fix the row/column order of the matrix explicitly (rows = true class,
# columns = predicted class) and keep it 3x3 even if a class never appears.
SENTIMENT_ORDER = ['negative', 'neutral', 'positive']

cm=confusion_matrix(held_out['label'],held_out['pred_Sentiment'],labels=SENTIMENT_ORDER)
accuracy=accuracy_score(held_out['label'],held_out['pred_Sentiment'])
print(cm)
print(accuracy)

[[ 408  440   12]
 [  93 2992   39]
 [  21   44 1787]]
0.8887936943111721
