# Setup

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install datasets transformers huggingface_hub wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.15.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 

In [4]:
import os
from os import chdir
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
from datasets import load_metric, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForPreTraining
from transformers import BertForSequenceClassification
from transformers import TFBertModel, BertTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from transformers import TrainerState
import matplotlib.pyplot as plt
from typing import Dict, Any, Optional
from tqdm.std import tqdm
from google.colab import runtime

In [5]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [None]:
import wandb
wandb.login()

In [None]:
wandb.init(project='Tuiaia')

In [6]:
REPO_DIR = '/content/drive/Othercomputers/Desktop/pantanal.dev/artificial-intelligence'
os.chdir(REPO_DIR)

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [8]:
seed = 42
np.random.seed(seed)

In [9]:
def disconnect():
    runtime.unassign()

In [10]:
def tokenize_function(examples, padding='max_length', truncation=True, max_length=512):
    return tokenizer(examples['text'], padding=padding, truncation=truncation, max_length=max_length)

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [12]:
def predict_sentiment(input_text, model):
    input_tokens = tokenizer.encode_plus(input_text, return_tensors="pt", max_length=512)

    input_tokens.to(device)

    with torch.no_grad():
        output = model(**input_tokens)
    
    logits = output.logits
    probabilities = F.softmax(logits, dim=-1)
    predicted_class_idx = torch.argmax(probabilities, dim=1).item()
    
    return predicted_class_idx

# Carregar dataset

In [13]:
train_df = pd.read_csv('datasets/train_df.csv', sep='|')
val_df = pd.read_csv('datasets/val_df.csv', sep='|')
test_df = pd.read_csv('datasets/test_df.csv', sep='|')

# Treinamento do modelo

In [14]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
model = BertForSequenceClassification.from_pretrained('./trainings/bert-base-multilingual-cased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7858 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
repo_name = 'bert-base-multilingual-cased-06'

training_args = TrainingArguments(
    output_dir=f'./trainings/{repo_name}/',
    seed=seed,
    per_device_train_batch_size=14,
    per_device_eval_batch_size=14,
    num_train_epochs=15,
    learning_rate=5e-6,
    weight_decay=0.01,
    logging_steps=100,
    eval_steps=100,
    evaluation_strategy="steps",
    save_steps=1000,
    save_strategy="steps",
    save_total_limit=1,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train(resume_from_checkpoint='./trainings/bert-base-multilingual-cased-06/checkpoint-7000')
trainer.save_model(f'./trainings/{repo_name}/')



  0%|          | 0/256 [00:00<?, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mtiagosanti[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
7100,0.0872,1.132905,0.813539,0.813718,0.813973,0.813539
7200,0.059,1.14672,0.814727,0.813474,0.813462,0.814727
7300,0.0754,1.164244,0.815914,0.815999,0.816469,0.815914
7400,0.0784,1.155364,0.814727,0.815442,0.816715,0.814727
7500,0.0741,1.164384,0.814727,0.814933,0.815365,0.814727
7600,0.076,1.173463,0.809976,0.809783,0.810223,0.809976
7700,0.0643,1.167802,0.817102,0.817256,0.817445,0.817102
7800,0.0752,1.164049,0.817102,0.816001,0.815892,0.817102
7900,0.0814,1.163184,0.82304,0.822068,0.821985,0.82304
8000,0.0554,1.162783,0.822447,0.822216,0.822055,0.822447


In [None]:
wandb.finish()

0,1
eval/accuracy,▃▄▄▄▄▁▅▅██▆▇▅█
eval/f1,▃▃▄▄▄▁▅▄██▆▇▆█
eval/loss,▁▃▆▅▆█▇▆▆▆█▇▆▆
eval/precision,▃▃▅▅▄▁▅▄██▆▇▅█
eval/recall,▃▄▄▄▄▁▅▅██▆▇▅█
eval/runtime,█▁▆▂▂▃▆▆▃▄▄▂▃▂
eval/samples_per_second,▁█▃▇▇▆▃▃▆▅▅▇▆▇
eval/steps_per_second,▁█▃▇▇▆▃▃▆▅▅▇▆▇
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███

0,1
eval/accuracy,0.82245
eval/f1,0.82227
eval/loss,1.16563
eval/precision,0.82213
eval/recall,0.82245
eval/runtime,17.3834
eval/samples_per_second,96.874
eval/steps_per_second,6.961
train/epoch,15.0
train/global_step,8430.0


In [None]:
disconnect()

# Teste do modelo

In [15]:
def test_model(model, test_df):
    model.eval()
    model.to(device)

    test_df['pred'] = test_df['text'].apply(predict_sentiment, model=model)

    precision, recall, f1, _ = precision_recall_fscore_support(test_df['label'], test_df['pred'], average='weighted')
    acc = accuracy_score(test_df['label'], test_df['pred'])

    print({
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    })

    test_df['diff'] = np.abs(test_df['label']-test_df['pred'])
    agree = test_df[test_df['diff']==0]
    partial_disagree = test_df[test_df['diff']==1]
    disagree = test_df[test_df['diff']==2]

    test_length = test_df.shape[0]
    agree_count = agree.shape[0]
    partial_disagree_count = partial_disagree.shape[0]
    disagree_count = disagree.shape[0]

    print(f'Concordância: {agree_count} ({agree_count/test_length*100:.2f}%)')
    print(f'Discordância parcial: {partial_disagree_count} ({partial_disagree_count/test_length*100:.2f}%)')
    print(f'Discordância: {disagree_count} ({disagree_count/test_length*100:.2f}%)')

    return test_df, agree, partial_disagree, disagree

In [16]:
repo_name = 'bert-base-multilingual-cased-06'
model = BertForSequenceClassification.from_pretrained(f'./trainings/{repo_name}')
tokenizer = BertTokenizer.from_pretrained(f'./trainings/{repo_name}')

In [17]:
%timeit predict_sentiment(test_df.loc[0, 'text'], model)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


75.9 ms ± 3.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
test_df, agree, partial_disagree, disagree = test_model(model, test_df)

{'accuracy': 0.7903800475059383, 'f1': 0.7904945426416597, 'precision': 0.7906228046845624, 'recall': 0.7903800475059383}
Concordância: 1331 (79.04%)
Discordância parcial: 289 (17.16%)
Discordância: 64 (3.80%)


In [19]:
disagree.value_counts(subset=['font', 'lang'])/len(disagree)

font                     lang
financial-phrase-bank    eng     0.250000
                         ptbr    0.234375
InfoMoney                ptbr    0.171875
b3                       ptbr    0.093750
Marília Notícia          ptbr    0.015625
Valor Econômico          ptbr    0.015625
Perfil Multi             ptbr    0.015625
Notícias Concursos       ptbr    0.015625
NBC Chicago              eng     0.015625
Moneycontrol             eng     0.015625
Band Jornalismo          ptbr    0.015625
Benzinga                 eng     0.015625
HYPEBEAST                eng     0.015625
Edital Concursos Brasil  ptbr    0.015625
Common Dreams            eng     0.015625
Cointelegraph            eng     0.015625
CoinDesk                 eng     0.015625
Cageside Seats           eng     0.015625
BleepingComputer         eng     0.015625
Markets Insider          eng     0.015625
dtype: float64

In [20]:
partial_disagree.value_counts(subset=['font', 'lang'])/len(partial_disagree)

font                   lang
financial-phrase-bank  ptbr    0.352941
InfoMoney              ptbr    0.211073
b3                     ptbr    0.100346
financial-phrase-bank  eng     0.051903
Investing.com Brasil   ptbr    0.013841
                                 ...   
Firstpost              eng     0.003460
Fox Business           eng     0.003460
Globo                  ptbr    0.003460
Ache Concursos         ptbr    0.003460
Heritage.org           eng     0.003460
Length: 65, dtype: float64

In [21]:
agree.value_counts(subset=['font', 'lang'])/len(agree)

font                      lang
financial-phrase-bank     ptbr    0.444027
                          eng     0.231405
InfoMoney                 ptbr    0.099925
b3                        ptbr    0.067618
Bloomberg                 eng     0.006011
                                    ...   
Inteligência Financeira   ptbr    0.000751
InsideEVs                 eng     0.000751
Honolulu Star-Advertiser  eng     0.000751
HT Tech                   eng     0.000751
Época NEGÓCIOS            ptbr    0.000751
Length: 130, dtype: float64

# Model pruning

In [22]:
import torch.nn.utils.prune as prune

In [23]:
pruning_rate = 0.2

for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        prune.l1_unstructured(module, name="weight", amount=pruning_rate)
        prune.remove(module, "weight")

In [24]:
model.eval()
model_quantized = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

In [25]:
model.save_pretrained(f'./trainings/{repo_name}/pruned/')
tokenizer.save_pretrained(f'./trainings/{repo_name}/pruned/')

('./trainings/bert-base-multilingual-cased-06/pruned/tokenizer_config.json',
 './trainings/bert-base-multilingual-cased-06/pruned/special_tokens_map.json',
 './trainings/bert-base-multilingual-cased-06/pruned/vocab.txt',
 './trainings/bert-base-multilingual-cased-06/pruned/added_tokens.json')

In [26]:
def model_size(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

original_size = model_size(model)
quantized_size = model_size(model_quantized)

print(f"Modelo original: {original_size} parâmetros")
print(f"Modelo quantizado: {quantized_size} parâmetros")

Modelo original: 177855747 parâmetros
Modelo quantizado: 92245248 parâmetros


In [27]:
%timeit predict_sentiment(test_df.loc[0, 'text'], model_quantized)

47.2 ms ± 6.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
test_df, agree, partial_disagree, disagree = test_model(model_quantized, test_df)

{'accuracy': 0.744061757719715, 'f1': 0.7450983191393848, 'precision': 0.7514787729617975, 'recall': 0.744061757719715}
Concordância: 1253 (74.41%)
Discordância parcial: 347 (20.61%)
Discordância: 84 (4.99%)


In [29]:
disagree.value_counts(subset=['font', 'lang'])/len(disagree)

font                   lang
financial-phrase-bank  ptbr    0.297619
b3                     ptbr    0.202381
InfoMoney              ptbr    0.130952
financial-phrase-bank  eng     0.119048
Markets Insider        eng     0.011905
Variety                eng     0.011905
Valor Econômico        ptbr    0.011905
The New York Times     eng     0.011905
Portal do Bitcoin      ptbr    0.011905
Notícias Concursos     ptbr    0.011905
NBC Chicago            eng     0.011905
Moneycontrol           eng     0.011905
Marília Notícia        ptbr    0.011905
Bahia Notícias         ptbr    0.011905
Barchart               eng     0.011905
InsideEVs              eng     0.011905
HYPEBEAST              eng     0.011905
Common Dreams          eng     0.011905
Cointelegraph          eng     0.011905
CoinDesk               eng     0.011905
Cageside Seats         eng     0.011905
CBS Boston             eng     0.011905
BleepingComputer       eng     0.011905
Benzinga               eng     0.011905
MarketWatch 

In [30]:
partial_disagree.value_counts(subset=['font', 'lang'])/len(partial_disagree)

font                      lang
financial-phrase-bank     ptbr    0.377522
InfoMoney                 ptbr    0.187320
b3                        ptbr    0.097983
financial-phrase-bank     eng     0.063401
Reuters                   eng     0.014409
                                    ...   
KPRC Click2Houston        eng     0.002882
KTVU FOX 2 San Francisco  eng     0.002882
AOL                       eng     0.002882
Meu Timão                 ptbr    0.002882
Longview News-Journal     eng     0.002882
Length: 75, dtype: float64

In [31]:
agree.value_counts(subset=['font', 'lang'])/len(agree)

font                   lang
financial-phrase-bank  ptbr    0.440543
                       eng     0.245012
InfoMoney              ptbr    0.102953
b3                     ptbr    0.059058
Investing.com Brasil   ptbr    0.007183
                                 ...   
GamesIndustry.biz      eng     0.000798
G1                     ptbr    0.000798
Fortune                eng     0.000798
Forbes Brasil          ptbr    0.000798
Época NEGÓCIOS         ptbr    0.000798
Length: 119, dtype: float64

In [32]:
disconnect()