# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets transformers huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading 

In [3]:
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.18.0


In [4]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle
  Downloading setproctitle-1.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.21.0-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.5/199.5 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m

In [5]:
import os
from os import chdir
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pandas as pd
import numpy as np
from datasets import load_metric, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForPreTraining
from transformers import BertForSequenceClassification
from transformers import TFBertModel, BertTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import matplotlib.pyplot as plt
from typing import Dict, Any, Optional
from tqdm.std import tqdm
from google.colab import runtime

In [6]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [7]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
chdir('/content/drive/MyDrive/pantanal.dev/artificial-intelligence')

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [10]:
seed = 42
np.random.seed(seed)

In [11]:
def disconnect():
    runtime.unassign()

In [12]:
def tokenize_function(examples, padding='max_length', truncation=True, max_length=512):
    return tokenizer(examples['text'], padding=padding, truncation=truncation, max_length=max_length)

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [14]:
def predict_sentiment(input_text=None):
    input_tokens = tokenizer.encode_plus(input_text, return_tensors="pt", max_length=512)

    input_tokens.to(device)

    with torch.no_grad():
        output = model(**input_tokens)
    
    logits = output.logits
    probabilities = F.softmax(logits, dim=-1)
    probabilities_np = probabilities.cpu().numpy()

    formatted_probabilities = np.array2string(probabilities_np, precision=6, suppress_small=True)

    predicted_class_idx = torch.argmax(probabilities, dim=1).item()

    '''
    return {
        'probabilities': formatted_probabilities,
        'predicted_class_idx': predicted_class_idx
    }
    '''

    return predicted_class_idx

# Carregar dataset

In [22]:
train_df = pd.read_csv('datasets/train_df.csv', sep='|')
val_df = pd.read_csv('datasets/val_df.csv', sep='|')
test_df = pd.read_csv('datasets/test_df.csv', sep='|')

## Treinamento: bert-base-multilingual-cased-02

In [16]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [18]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7041 [00:00<?, ? examples/s]

Map:   0%|          | 0/1510 [00:00<?, ? examples/s]

Map:   0%|          | 0/1509 [00:00<?, ? examples/s]

In [23]:
repo_name = 'bert-base-multilingual-cased-04'

training_args = TrainingArguments(
    output_dir=f'./trainings/{repo_name}/',
    seed=seed,
    auto_find_batch_size=True,
    num_train_epochs=12,
    learning_rate=5e-6, # default 5e-5
    weight_decay=0.01,
    eval_steps=100,
    logging_steps=100,
    save_steps=1000,
    save_strategy="steps",
    evaluation_strategy="steps",
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [24]:
trainer.train()
trainer.save_model(f'./trainings/{repo_name}/')

[34m[1mwandb[0m: Currently logged in as: [33mtiagosanti[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.9834,0.882714,0.584768,0.466601,0.46363,0.584768
200,0.8631,0.763643,0.638411,0.585933,0.627027,0.638411
300,0.7566,0.719072,0.655629,0.626425,0.65959,0.655629
400,0.6989,0.687523,0.689404,0.668568,0.670296,0.689404
500,0.6934,0.66914,0.702649,0.703055,0.727991,0.702649
600,0.6407,0.607777,0.743046,0.744318,0.751467,0.743046
700,0.6115,0.589744,0.748344,0.753611,0.773753,0.748344
800,0.5862,0.575393,0.753642,0.756188,0.765587,0.753642
900,0.551,0.576711,0.759603,0.761741,0.771841,0.759603
1000,0.4703,0.5991,0.782781,0.778216,0.782719,0.782781


In [25]:
wandb.finish()

0,1
eval/accuracy,▁▃▆▆▇▇▇▇▇███▇█▇▇████████████████████████
eval/f1,▁▄▇▇▇▇▇█▇███████████████████████████████
eval/loss,▄▂▂▁▁▁▁▂▁▂▂▃▄▂▄▄▄▄▅▅▅▇▆▆▆▆▆▇▇▇▇▇████████
eval/precision,▁▅▇▇▇███████████████████████████████████
eval/recall,▁▃▆▆▇▇▇▇▇███▇█▇▇████████████████████████
eval/runtime,▇▅▄▁▂▁▂▃▂▃▁▄▂▁▂▂▃▃▃▅▄▄▄▄▄▃▄▆▇▇▇██▆█▇▅▆▅▇
eval/samples_per_second,▂▄▅█▇█▇▆▇▆█▅▇█▇▇▆▆▆▄▅▅▅▅▅▆▅▂▂▂▂▁▁▃▁▂▃▃▄▂
eval/steps_per_second,▂▄▅█▇█▇▆▇▆█▅▇█▇▇▆▆▆▄▅▅▅▄▅▆▅▂▁▂▂▁▁▃▁▂▃▃▃▂
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.8053
eval/f1,0.80533
eval/loss,1.35973
eval/precision,0.80543
eval/recall,0.8053
eval/runtime,49.0663
eval/samples_per_second,30.775
eval/steps_per_second,3.852
train/epoch,12.0
train/global_step,10572.0


In [None]:
disconnect()

## Teste do modelo

In [18]:
repo_name = 'bert-base-multilingual-cased-04'

In [19]:
model = BertForSequenceClassification.from_pretrained(f'./trainings/{repo_name}/')
tokenizer = BertTokenizer.from_pretrained(f'./trainings/{repo_name}/')

In [20]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [24]:
%timeit predict_sentiment(test_df.loc[0, 'text'])

100 ms ± 21.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
len(test_df)

1509

In [23]:
model.to(device)
test_df['pred'] = test_df['text'].apply(predict_sentiment)
test_df

Unnamed: 0,text,label,lang,font,pred
0,O projeto está previsto para ser concluído em ...,1,ptbr,financial-phrase-bank,1
1,Senado aprova intervenção no DF; mercado opera...,1,ptbr,b3,1
2,"In August-October 2010 , the company 's result...",2,eng,financial-phrase-bank,2
3,Earnings per share for the quarter were also h...,2,eng,financial-phrase-bank,2
4,"A caixa do produto Tekla Structures, se necess...",1,ptbr,financial-phrase-bank,1
...,...,...,...,...,...
1504,"Com a aquisição, a Panostaja expande ainda mai...",2,ptbr,financial-phrase-bank,2
1505,"O lucro operacional foi de EUR -0,1 milhões, a...",0,ptbr,financial-phrase-bank,0
1506,Technopolis é o principal operador de technopa...,2,ptbr,financial-phrase-bank,1
1507,"Como resultado dessas negociações, a empresa d...",0,ptbr,financial-phrase-bank,0


In [25]:
precision, recall, f1, _ = precision_recall_fscore_support(test_df['label'], test_df['pred'], average='weighted')
acc = accuracy_score(test_df['label'], test_df['pred'])

print({
    'accuracy': acc,
    'f1': f1,
    'precision': precision,
    'recall': recall
})

{'accuracy': 0.8257123923127899, 'f1': 0.826380462075082, 'precision': 0.8278741828811275, 'recall': 0.8257123923127899}


In [26]:
test_df['diff'] = np.abs(test_df['label']-test_df['pred'])
test_df

Unnamed: 0,text,label,lang,font,pred,diff
0,O projeto está previsto para ser concluído em ...,1,ptbr,financial-phrase-bank,1,0
1,Senado aprova intervenção no DF; mercado opera...,1,ptbr,b3,1,0
2,"In August-October 2010 , the company 's result...",2,eng,financial-phrase-bank,2,0
3,Earnings per share for the quarter were also h...,2,eng,financial-phrase-bank,2,0
4,"A caixa do produto Tekla Structures, se necess...",1,ptbr,financial-phrase-bank,1,0
...,...,...,...,...,...,...
1504,"Com a aquisição, a Panostaja expande ainda mai...",2,ptbr,financial-phrase-bank,2,0
1505,"O lucro operacional foi de EUR -0,1 milhões, a...",0,ptbr,financial-phrase-bank,0,0
1506,Technopolis é o principal operador de technopa...,2,ptbr,financial-phrase-bank,1,1
1507,"Como resultado dessas negociações, a empresa d...",0,ptbr,financial-phrase-bank,0,0


In [27]:
test_length = test_df.shape[0]
agree_count = test_df[test_df["diff"]==0].shape[0]
partial_disagree_count = test_df[test_df["diff"]==1].shape[0]
disagree_count = test_df[test_df["diff"]==2].shape[0]

print(f'Concordância: {agree_count} ({agree_count/test_length*100:.2f}%)')
print(f'Discordância parcial: {partial_disagree_count} ({partial_disagree_count/test_length*100:.2f}%)')
print(f'Discordância: {disagree_count} ({disagree_count/test_length*100:.2f}%)')

Concordância: 1246 (82.57%)
Discordância parcial: 226 (14.98%)
Discordância: 37 (2.45%)


In [43]:
disagree = test_df[test_df['diff']==2]
disagree.value_counts(subset=['font', 'lang'])/len(disagree)

font                   lang
financial-phrase-bank  ptbr    0.378378
b3                     ptbr    0.189189
infomoney              ptbr    0.189189
financial-phrase-bank  eng     0.162162
google-news            eng     0.054054
                       ptbr    0.027027
dtype: float64

In [44]:
partial_disagree = test_df[test_df['diff']==1]
partial_disagree.value_counts(subset=['font', 'lang'])/len(partial_disagree)

font                   lang
financial-phrase-bank  ptbr    0.491150
infomoney              ptbr    0.283186
b3                     ptbr    0.097345
google-news            eng     0.084071
                       ptbr    0.030973
financial-phrase-bank  eng     0.013274
dtype: float64

In [45]:
agree = test_df[test_df['diff']==0]
agree.value_counts(subset=['font', 'lang'])/len(agree)

font                   lang
financial-phrase-bank  ptbr    0.479936
                       eng     0.264045
infomoney              ptbr    0.102729
b3                     ptbr    0.074639
google-news            ptbr    0.040128
                       eng     0.038523
dtype: float64

In [48]:
test_df[test_df['diff']==2]

Unnamed: 0,text,label,lang,font,pred,diff
56,"Além disso, a empresa vai reduzir no máximo de...",0,ptbr,financial-phrase-bank,2,2
61,"O lucro operacional foi de EUR 1,6 milhões em ...",0,ptbr,financial-phrase-bank,2,2
109,A Comissão da UE disse anteriormente que multo...,0,ptbr,financial-phrase-bank,2,2
181,3M Earnings: 6000 More Job Cuts Won't Fix Comp...,0,eng,google-news,2,2
225,"Based on the first quarter result , existing o...",0,eng,financial-phrase-bank,2,2
261,"O retorno sobre o investimento ROI foi de 4,1%...",0,ptbr,financial-phrase-bank,2,2
285,"GM Raises 2023 Profit Outlook, Kills Off Chevy...",2,eng,google-news,0,2
363,Produtos e serviços consumidos durante o verão...,0,ptbr,b3,2,2
547,"Os resultados básicos da TeliaSonera, no entan...",0,ptbr,financial-phrase-bank,2,2
559,Os acionistas da Rakvere Lihakombinaat decidir...,2,ptbr,financial-phrase-bank,0,2
