In [None]:
!pip install datasets
from datasets import Dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,pipeline



In [None]:
# Read the raw CSV as ISO-8859-1. Column names are supplied explicitly, so the
# first line of the file is treated as data rather than as a header.
dataset = pd.read_csv(
    'all-data.csv',
    names=['label', 'text'],
    encoding='iso-8859-1',
)
dataset.head()

Unnamed: 0,label,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [None]:
# Encode the string sentiment labels as integer class ids.
label_to_id = {'positive': 0, 'negative': 1, 'neutral': 2}
dataset['label'] = dataset['label'].replace(label_to_id)
print(dataset)

      label                                               text
0         2  According to Gran , the company has no plans t...
1         2  Technopolis plans to develop in stages an area...
2         1  The international electronic industry company ...
3         0  With the new production plant the company woul...
4         0  According to the company 's updated strategy f...
...     ...                                                ...
4841      1  LONDON MarketWatch -- Share prices ended lower...
4842      2  Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843      1  Operating profit fell to EUR 35.4 mn from EUR ...
4844      1  Net sales of the Paper segment decreased to EU...
4845      1  Sales in Finland decreased by 10.5 % in Januar...

[4846 rows x 2 columns]


In [None]:
# (rows, columns) of the loaded frame.
dataset.shape

(4846, 2)

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

label    0
text     0
dtype: int64

In [None]:
# Number of rows whose text repeats an earlier row's text.
dataset.duplicated(subset=['text']).sum()

8

In [None]:
# Keep only the first occurrence of each distinct text.
dataset = dataset.drop_duplicates(subset=['text'], keep='first')

In [None]:
# Two-stage stratified split: first hold out 10% of the rows as the test set,
# then carve 10% of the remainder off as the validation set.
df_trainval, df_test = train_test_split(
    dataset,
    test_size=0.1,
    stratify=dataset['label'],
    random_state=42,
)
df_train, df_val = train_test_split(
    df_trainval,
    test_size=0.1,
    stratify=df_trainval['label'],
    random_state=42,
)
print(df_train.shape, df_test.shape, df_val.shape)

(3918, 2) (484, 2) (436, 2)


In [None]:
# The tokenizer and the sequence-classification model are both loaded from the
# same pretrained checkpoint.
checkpoint = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
finbert = AutoModelForSequenceClassification.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def _to_torch_dataset(frame):
    """Turn one DataFrame split into a tokenized, torch-formatted Dataset.

    The 'text' column is tokenized in batches and truncated/padded to exactly
    128 tokens. Only the tokenizer outputs and 'label' are exposed as torch
    tensors; any other column is hidden by the format setting.
    """
    split = Dataset.from_pandas(frame)
    split = split.map(
        lambda batch: tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128),
        batched=True,
    )
    split.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
    return split


# Same preprocessing for all three splits.
dataset_train = _to_torch_dataset(df_train)
dataset_val = _to_torch_dataset(df_val)
dataset_test = _to_torch_dataset(df_test)

Map:   0%|          | 0/3918 [00:00<?, ? examples/s]

Map:   0%|          | 0/436 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

In [None]:
!pip install accelerate



In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into (model outputs, reference labels). The predicted
    class of each row is the arg-max along axis 1 of the model outputs.

    Returns a dict with the single key 'accuracy'.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    # accuracy_score takes (y_true, y_pred); accuracy is symmetric in the two.
    return {'accuracy': accuracy_score(labels, predicted_ids)}

# Configuration of the fine-tuning run; checkpoints are written under 'temp/'.
args = TrainingArguments(
    output_dir='temp/',
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint that
    # scored best on the 'accuracy' metric when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

# Fine-tune `finbert` on the training split, scoring on the validation split.
trainer = Trainer(
    model=finbert,
    args=args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.345717,0.915138
2,0.137700,0.355874,0.919725
3,0.127700,0.36221,0.931193


TrainOutput(global_step=1470, training_loss=0.10098970893288957, metrics={'train_runtime': 361.1066, 'train_samples_per_second': 32.55, 'train_steps_per_second': 4.071, 'total_flos': 773158777993728.0, 'train_loss': 0.10098970893288957, 'epoch': 3.0})

In [None]:
# Put the model in evaluation mode and score the held-out test split; the
# metrics dict of the prediction output is shown as the cell result.
finbert.eval()
test_output = trainer.predict(dataset_test)
test_output.metrics

{'test_loss': 0.5886762738227844,
 'test_accuracy': 0.8884297520661157,
 'test_runtime': 3.6064,
 'test_samples_per_second': 134.206,
 'test_steps_per_second': 8.596}

In [None]:
# Persist the fine-tuned model, and store the tokenizer files next to it so the
# directory can be reloaded on its own. The Trainer above was created without a
# tokenizer, so save_model() by itself does not write any tokenizer files.
trainer.save_model('Finbert-fine-tuned')
tokenizer.save_pretrained('Finbert-fine-tuned');  # ';' hides the returned file list

In [None]:
# Reload the fine-tuned classifier from disk and wrap it in a text
# classification pipeline. The pipeline runs on the first GPU when one is
# available and falls back to CPU (-1) otherwise.
tuned_finbert = AutoModelForSequenceClassification.from_pretrained('Finbert-fine-tuned', num_labels=3)
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
nlp = pipeline(
    'sentiment-analysis',
    model=tuned_finbert,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Classify every text in one batched call instead of one pipeline call per row.
# truncation=True clips over-long inputs to the model's maximum length instead
# of raising an error.
# NOTE: `dataset` holds ALL rows (train + validation + test), so these
# predictions include rows the model was trained on.
texts = dataset['text'].tolist()
results = pd.Series(nlp(texts, truncation=True, batch_size=32), index=dataset.index)

# Each result is a dict with 'label' and 'score'; keep the lower-cased label.
dataset['pred_Sentiment'] = results.map(lambda result: result['label'].lower())

In [None]:
# Translate the integer class ids back into their string sentiment names.
id_to_label = {0: "positive", 1: "negative", 2: "neutral"}
dataset['label'] = dataset['label'].replace(id_to_label)
dataset

Unnamed: 0,label,text,pred_Sentiment
0,neutral,"According to Gran , the company has no plans t...",neutral
1,neutral,Technopolis plans to develop in stages an area...,neutral
2,negative,The international electronic industry company ...,negative
3,positive,With the new production plant the company woul...,positive
4,positive,According to the company 's updated strategy f...,positive
...,...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...,negative
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,neutral
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...,negative
4844,negative,Net sales of the Paper segment decreased to EU...,negative


In [None]:
# Score the model on the held-out test rows only. `dataset` also contains the
# rows used for training and for checkpoint selection, so metrics computed over
# all of it overstate how well the model generalises.
held_out = dataset.loc[df_test.index]

# Row/column order of the confusion matrix: the sorted set of all labels seen
# (the same order scikit-learn uses by default), made explicit for the reader.
label_order = sorted(set(held_out['label']) | set(held_out['pred_Sentiment']))

cm = confusion_matrix(held_out['label'], held_out['pred_Sentiment'], labels=label_order)
accuracy = accuracy_score(held_out['label'], held_out['pred_Sentiment'])

# Kept for reference only: this figure includes rows the model was fitted on.
accuracy_all_rows = accuracy_score(dataset['label'], dataset['pred_Sentiment'])

print(label_order)
print(cm)
print(accuracy)
print('accuracy over all rows (includes training data, optimistic):', accuracy_all_rows)

[[ 590    9    5]
 [  12 2828   32]
 [   3   35 1324]]
0.9801570897064903
