<a href="https://colab.research.google.com/github/RYU-MCFLY/Aplicaciones-Financieras/blob/main/CCP_10_2_Aps_Financieras5_Finbert_training14JUN23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MaxMitre/Aplicaciones-Financieras/blob/main/Semana9/2_FINBERT_Entrenamiento.ipynb)

# Instalaciones

In [None]:
!pip install transformers
!pip install datasets

# FinBERT

En esta clase, mejoraremos lo hecho en la clase anterior (una regresión logística multiclase para clasificación de sentimiento)

Esto se hará utilizando un modelo BERT con ligeros cambios, que ha sido refinado (*fine-tuned*) para tomar en cuenta términos financieros; posteriormente compararemos sus resultados con los de la regresión.

In [None]:
# Mount the user's Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List every installed package together with its version
!pip list

In [None]:
# Install `accelerate`, upgrading it to the newest available release (-U).
!pip install accelerate -U

In [None]:
# Show the metadata of one specific package
!pip show transformers

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

In [None]:
import numpy as np
import pandas as pd
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# tested in transformers==4.18.0, pytorch==1.7.1
# Display the torch and transformers versions actually installed in this runtime.
import torch
import transformers
torch.__version__, transformers.__version__

In [None]:
# Report whether PyTorch can use a CUDA device in this runtime.
torch.cuda.is_available()

# Predicción con finBERT precargado

In [None]:
# Load the tokenizer and the 3-class sequence classifier from the same Hub checkpoint.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert = BertForSequenceClassification.from_pretrained(
    FINBERT_CHECKPOINT,
    num_labels=3,
)

In [None]:
# Task to run: a text-classification pipeline built around the loaded model and tokenizer
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

Primera predicción:

In [None]:
# Classify one sample sentence with the pipeline and print the raw result.
# Alternative input to try: 'Growth is strong and we have plenty of liquidity.'
sample_sentence = 'Growth is strong but we do not have liquidity.'
results = nlp(sample_sentence)
print(results)

## ¿Qué significa Tokenizar?

In [None]:
# Split the sample sentence into the tokenizer's tokens.
tokenizer.tokenize('Growth is strong but we do not have liquidity.')

## ¿Para qué sirven los tokens?

In [None]:
# Convert each token of the sample sentence into its id.
tokenizer.convert_tokens_to_ids(['growth', 'is', 'strong', 'but', 'we', 'do', 'not', 'have', 'liquidity', '.'])

Primer paso al procesar los textos.

In [None]:
# Call the tokenizer directly on the raw sentence and show only the 'input_ids' entry of its output.
tokenizer('Growth is strong but we do not have liquidity.')['input_ids']

# Carga de datos

In [None]:
# Read the sentence/label file from the mounted Drive into two columns, 'text' and 'label'.
# NOTE(review): pandas treats a multi-character `sep` as a regular expression, so '.@' matches
# ANY character followed by '@', not only a literal ".@" — confirm this is intended before
# changing it, since sentences that do not end in a period currently still split at the '@'.
data = pd.read_csv('/content/drive/MyDrive/Cruso-ApsFinancieras/semana10/data/sentiment_data/Sentences_50Agree.txt', sep='.@', names=['text','label'], encoding='latin1', engine='python')
data

In [None]:
# Drop every row where either the text or the label is missing.
data = data.dropna(subset=['text', 'label'])

In [None]:
# Encode the string sentiment labels as integer class ids, touching only the 'label' column
# (a frame-wide replace would also rewrite any 'text' cell that is exactly one of these words).
#
# The ids follow the loaded model's own `finbert.config.label2id` whenever it names all three
# classes, so that the class indices used for fine-tuning agree with the label names stored in
# the model config (and therefore with what the saved model reports at inference time).
# Otherwise the original negative=0 / neutral=1 / positive=2 encoding is used.
# NOTE: the integer shown in later reports/confusion matrices is whatever `label2id` holds.
_default_label2id = {'negative': 0, 'neutral': 1, 'positive': 2}
_head_label2id = {
    str(name).lower(): int(idx)
    for name, idx in (finbert.config.label2id or {}).items()
}
if all(name in _head_label2id for name in _default_label2id):
    label2id = {name: _head_label2id[name] for name in _default_label2id}
else:
    label2id = dict(_default_label2id)

# Re-running this cell is a no-op: already-encoded integers match none of the string keys.
# `infer_objects()` turns an all-integer object column into a proper integer column.
data = data.assign(label=data['label'].replace(label2id).infer_objects())

In [None]:
# Two stratified splits with a fixed seed: first hold out 10% as the test set, then hold out
# 10% of the remainder as the validation set.
df_trainval, df_test = train_test_split(
    data,
    test_size=0.1,
    stratify=data['label'],
    random_state=42,
)
df_train, df_val = train_test_split(
    df_trainval,
    test_size=0.1,
    stratify=df_trainval['label'],
    random_state=42,
)
print(df_train.shape, df_test.shape, df_val.shape)

In [None]:
def _tokenize_batch(batch):
    """Tokenize the 'text' entries of a batch, truncating/padding each one to 128 tokens."""
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)


def _to_torch_dataset(frame):
    """Turn a pandas frame into a tokenized Dataset that yields torch tensors."""
    dataset = Dataset.from_pandas(frame)
    dataset = dataset.map(_tokenize_batch, batched=True)
    # Only these columns are exposed (as torch tensors) when the dataset is indexed.
    dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
    return dataset


dataset_train = _to_torch_dataset(df_train)
dataset_val = _to_torch_dataset(df_val)
dataset_test = _to_torch_dataset(df_test)

## Ejercicio

¿Cómo se ven los datasets preparados?

Traten de inspeccionar los datos y ver cómo son.

In [None]:
#@title Antes de preprocesar el objeto Dataset

# First record of a Dataset built straight from the training frame, before any tokenization.
Dataset.from_pandas(df_train)[0]

In [None]:
#@title Procesado
# First record of the processed training dataset.
dataset_train[0]

# Configuración para entrenamiento

In [None]:
def compute_metrics(eval_pred):
    """Compute the accuracy of one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. ``predictions`` is
            arg-maxed along axis 1 to obtain one predicted class id per row
            (assumes per-class scores of shape (n_samples, n_classes) — TODO confirm).

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is what ``accuracy_score`` returns
        for the true labels versus the predicted class ids.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    # scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth goes first.
    return {'accuracy': accuracy_score(labels, predicted_ids)}

import inspect

# Newer `transformers` releases renamed `evaluation_strategy` to `eval_strategy`. Because this
# notebook installs an unpinned version, pass whichever keyword the installed
# TrainingArguments actually accepts instead of hard-coding one of the two names.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

args = TrainingArguments(
        output_dir = 'temp/',                 # checkpoints are written here
        save_strategy = 'epoch',              # must match the evaluation strategy for load_best_model_at_end
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,          # reload the best checkpoint once training ends
        metric_for_best_model='accuracy',     # key produced by compute_metrics
        **{_eval_strategy_kwarg: 'epoch'},    # evaluate once per epoch
)

# Wire the model, the training arguments, the train/validation datasets and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=finbert,                    # the instantiated 🤗 Transformers model to fine-tune
    args=args,                        # TrainingArguments instance
    train_dataset=dataset_train,      # data used for the optimisation steps
    eval_dataset=dataset_val,         # data used for evaluation
    compute_metrics=compute_metrics,  # turns raw predictions into the reported metrics
)

In [None]:
# Run the training loop configured in `trainer`.
trainer.train()


Evaluación

In [None]:
# Switch the model to evaluation mode, then show the metrics of a prediction pass over the test set.
finbert.eval()
trainer.predict(dataset_test).metrics

In [None]:
# Show the complete object returned by predict() for the test set.
# NOTE(review): each trainer.predict call presumably runs a fresh inference pass — consider reusing one result.
trainer.predict(dataset_test)

In [None]:
# Keep only the `predictions` attribute of the prediction output
# (presumably one row of per-class scores per test example — TODO confirm).
y_pred = trainer.predict(dataset_test).predictions

In [None]:
# Inspect the raw predictions array.
y_pred

In [None]:
# Position of the largest value along axis 1 of every row, used below as the predicted class id.
y_pred.argmax(axis=1)

In [None]:
# Work on a copy so that df_test itself is left untouched.
comparativo = df_test.copy()

In [None]:
# Add the arg-max over axis 1 of y_pred as a new 'predicted' column.
comparativo['predicted'] = y_pred.argmax(axis=1)

In [None]:
# Display the test rows with the 'predicted' column next to the original columns.
comparativo

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, log_loss

In [None]:
# Print scikit-learn's classification report: true labels first, predicted labels second.
print(classification_report(comparativo['label'], comparativo['predicted']))

In [None]:
# Plot the confusion matrix of true labels versus predicted labels.
ConfusionMatrixDisplay.from_predictions(comparativo['label'], comparativo['predicted'])

In [None]:
# Number of test rows per true label.
comparativo['label'].value_counts()

In [None]:
# Number of test rows per predicted label.
comparativo['predicted'].value_counts()

Guardar el modelo refinado (fine tuned)

In [None]:
# Careful: this saves into a folder that will be deleted; it is up to you to save it somewhere else
trainer.save_model('finbert-sentiment/')

# Método "antiguo": TextBlob

In [None]:
# Sample news paragraph used as the input of the first TextBlob example.
# The trailing backslashes continue the string literal, so it is stored as a single line.
text = "Later that day Apple said it was revising down its earnings expectations in \
the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. \
The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours \
trading and the decline was extended to more than 10% when the market opened. The dollar fell \
by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering \
some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. \
Yields on government bonds fell as investors fled to the traditional haven in a market storm."

In [None]:
# Download NLTK's 'punkt' resource
# (presumably required for the sentence splitting used by TextBlob below — TODO confirm).
import nltk
nltk.download('punkt')

In [None]:
# Empty frame that will collect the TextBlob results for the first text.
result = pd.DataFrame()

In [None]:
from textblob import TextBlob

In [None]:
# Score every sentence of the first text with TextBlob and store the polarities in `result`.
blob = TextBlob(text)
sentence_polarities = []
for sentence in blob.sentences:
    sentence_polarities.append(sentence.sentiment.polarity)
result['textblob_prediction'] = sentence_polarities
result

In [None]:
# Show the sentences TextBlob split the text into.
blob.sentences

In [None]:
# Mean TextBlob polarity over all sentences of the first text, shown with two decimals.
print(f'Average sentiment is {result.textblob_prediction.mean():.2f}.')

Otro ejemplo

In [None]:
# Second sample news paragraph, used as the input of the second TextBlob example.
# The trailing backslashes continue the string literal, so it is stored as a single line.
text2 = "Shares in the spin-off of South African e-commerce group Naspers surged more than 25% \
in the first minutes of their market debut in Amsterdam on Wednesday. Bob van Dijk, CEO of \
Naspers and Prosus Group poses at Amsterdam's stock exchange, as Prosus begins trading on the \
Euronext stock exchange in Amsterdam, Netherlands, September 11, 2019. REUTERS/Piroschka van de Wouw \
Prosus comprises Naspers’ global empire of consumer internet assets, with the jewel in the crown a \
31% stake in Chinese tech titan Tencent. There is 'way more demand than is even available, so that’s \
good,' said the CEO of Euronext Amsterdam, Maurice van Tilburg. 'It’s going to be an interesting \
hour of trade after opening this morning.' Euronext had given an indicative price of 58.70 euros \
per share for Prosus, implying a market value of 95.3 billion euros ($105 billion). The shares \
jumped to 76 euros on opening and were trading at 75 euros at 0719 GMT."

In [None]:
# Score every sentence of the second text with TextBlob and store the polarities in `result2`.
result2 = pd.DataFrame()
blob = TextBlob(text2)
sentence_polarities = []
for sentence in blob.sentences:
    sentence_polarities.append(sentence.sentiment.polarity)
result2['textblob_prediction'] = sentence_polarities

In [None]:
# Display the per-sentence TextBlob polarities of the second text.
result2

In [None]:
# Show the sentences TextBlob split the text into.
blob.sentences

In [None]:
# Mean TextBlob polarity over all sentences of the second text, shown with two decimals.
print(f'Average sentiment is {result2.textblob_prediction.mean():.2f}.')

# Referencias

- Paper: https://arxiv.org/pdf/1908.10063.pdf
- https://github.com/yya518/FinBERT/blob/master/finetune.ipynb
- https://huggingface.co/yiyanghkust/finbert-tone
- https://www.tensorflow.org/text/tutorials/classify_text_with_bert

Paper con código:
- http://nlp.seas.harvard.edu/annotated-transformer/