# Setup inicial

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install datasets transformers huggingface_hub

In [None]:
!pip install accelerate

In [None]:
!pip install wandb

In [5]:
import os
from os import chdir
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pandas as pd
import numpy as np
from datasets import load_metric, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForPreTraining
from transformers import BertForSequenceClassification
from transformers import TFBertModel, BertTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import matplotlib.pyplot as plt
from typing import Dict, Any, Optional
from tqdm.std import tqdm
from google.colab import runtime

In [6]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [7]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
chdir('/content/drive/MyDrive/pantanal.dev/artificial-intelligence')

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [10]:
seed = 42
np.random.seed(seed)

In [11]:
def disconnect():
    runtime.unassign()

In [12]:
def tokenize_function(examples, padding='max_length', truncation=True, max_length=512):
    return tokenizer(examples['text'], padding=padding, truncation=truncation, max_length=max_length)

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [14]:
def predict_sentiment(input_text=None):
    input_tokens = tokenizer.encode_plus(input_text, return_tensors="pt", max_length=512)

    input_tokens.to(device)

    with torch.no_grad():
        output = model(**input_tokens)
    
    logits = output.logits
    probabilities = F.softmax(logits, dim=-1)
    probabilities_np = probabilities.cpu().numpy()

    formatted_probabilities = np.array2string(probabilities_np, precision=6, suppress_small=True)

    predicted_class_idx = torch.argmax(probabilities, dim=1).item()

    '''
    return {
        'probabilities': formatted_probabilities,
        'predicted_class_idx': predicted_class_idx
    }
    '''

    return predicted_class_idx

# Datasets infomoney + financial-phrase-bank-pt-br

## Carregando datasets e tratando para alimentar o modelo

In [None]:
infomoney_df = pd.read_csv('datasets/infomoney/infomoney_news_labelled_preprocessed.csv')
infomoney_df

In [None]:
smn_df = pd.read_csv('datasets/financial-phrase-bank-pt-br/stock_market_news_labelled.csv')
smn_df

In [None]:
smn_df.columns = infomoney_df.columns
smn_df

In [None]:
df = pd.concat([infomoney_df, smn_df], axis=0)
df = df.reset_index(drop=True)
df

In [None]:
# Dividindo o dataset em conjuntos de treino e teste (80% para treino, 20% para teste)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Dividindo o conjunto de treino em treino e validação (75% para treino, 25% para validação)
train_df, val_df = train_test_split(train_df, test_size=0.25, stratify=train_df['label'], random_state=42)

In [None]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('trainings/bert-base-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('trainings/bert-base-portuguese-cased')

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

## Treinamento: bert-base-portuguese-cased-02

In [None]:
repo_name = 'bert-base-portuguese-cased-02'

training_args = TrainingArguments(
    output_dir = f'./trainings/{repo_name}/',
    seed = 42,
    #per_device_train_batch_size = 80,
    #per_device_eval_batch_size = 80,
    auto_find_batch_size = True,
    num_train_epochs = 10,
    weight_decay = 0.01,
    eval_steps = 100,

    logging_steps = 100,
    logging_dir = f'./trainings/{repo_name}/logs',

    save_steps = 500,
    push_to_hub = True,
    push_to_hub_token = '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',

    overwrite_output_dir = True,
    do_eval = True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

/content/drive/.shortcut-targets-by-id/1gDf7FiJm4bOjkVxLViN24B5fuPB9IIHb/IA/./trainings/bert-base-portuguese-cased_02/ is already a clone of https://huggingface.co/TiagoSanti/bert-base-portuguese-cased_02. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
trainer.train()



Step,Training Loss
100,0.6632
200,0.497
300,0.4603
400,0.4954
500,0.341
600,0.3203
700,0.2791
800,0.3128
900,0.2083
1000,0.1756


TrainOutput(global_step=4620, training_loss=0.11008442176613771, metrics={'train_runtime': 3453.5355, 'train_samples_per_second': 10.693, 'train_steps_per_second': 1.338, 'total_flos': 9716778516695040.0, 'train_loss': 0.11008442176613771, 'epoch': 10.0})

In [None]:
trainer.evaluate()

{'eval_loss': 1.1755179166793823,
 'eval_accuracy': 0.8555194805194806,
 'eval_f1': 0.855848701037577,
 'eval_precision': 0.8564020463162512,
 'eval_recall': 0.8555194805194806,
 'eval_runtime': 39.4514,
 'eval_samples_per_second': 31.228,
 'eval_steps_per_second': 3.904,
 'epoch': 10.0}

## Teste do modelo

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('trainings/bert-base-portuguese-cased-02')
tokenizer = AutoTokenizer.from_pretrained('trainings/bert-base-portuguese-cased-02')

In [None]:
model = model.to('cpu')

In [None]:
test_dataset = {
    'text': test_dataset['text'],
    'label': test_dataset['label']
}

test_df = pd.DataFrame(test_dataset)
test_df

In [None]:
test_df['pred'] = test_df['text'].apply(predict_sentiment)
test_df

In [None]:
precision, recall, f1, _ = precision_recall_fscore_support(test_df['label'], test_df['pred'], average='weighted')
acc = accuracy_score(test_df['label'], test_df['pred'])

print({
    'accuracy': acc,
    'f1': f1,
    'precision': precision,
    'recall': recall
})

In [None]:
test_df['diff'] = np.abs(test_df['label']-test_df['pred'])
test_df

In [None]:
test_df[test_df['diff']==2]

In [None]:
test_df[test_df['diff']==1]

In [None]:
test_df[test_df['diff']==0]

# Datasets infomoney + financial-phrase-bank-pt-br + financial-phrase-bank-eng-v1

## Carregando datasets e tratando para alimentar o modelo

In [27]:
infomoney_path = 'datasets/infomoney/infomoney_news_labelled_preprocessed.csv'
fpb_pt_path = 'datasets/financial-phrase-bank-pt-br/stock_market_news_labelled.csv'
fpb_eng_path = 'datasets/financial-phrase-bank-eng-v1/financial-phrase-bank-eng.csv'

In [28]:
infomoney_df = pd.read_csv(infomoney_path)
fpb_pt_df = pd.read_csv(fpb_pt_path)
fpb_eng_df = pd.read_csv(fpb_eng_path)

In [29]:
infomoney_df['lang'] = 'ptbr'
infomoney_df

Unnamed: 0,text,label,lang
0,Secretário Fazenda diz medidas governo abrirão...,2,ptbr
1,"Trade hoje : acima 100 mil , Ibovespa segue te...",0,ptbr
2,"alta preços crédito escasso , aumenta busca ca...",1,ptbr
3,"“ menos agora aceno disciplina fiscal ” , diz ...",1,ptbr
4,IR 2023 : Posso ficar declarar bem vendido fal...,1,ptbr
...,...,...,...
1326,Lojas Renner ( LREN3 ) reporta números fracos ...,1,ptbr
1327,Lula volta sinalizar pode discutir autonomia B...,1,ptbr
1328,Lula sobre Campos Neto : posso influir reduzir...,0,ptbr
1329,Embraer ( EMBR3 ) entrega 80 jatos 4º trimestr...,2,ptbr


In [30]:
fpb_pt_df['lang'] = 'ptbr'
fpb_pt_df.columns = ['text', 'label', 'lang']
fpb_pt_df

Unnamed: 0,text,label,lang
0,A Technopolis planeja desenvolver em etapas um...,1,ptbr
1,"A Elcoteq, empresa internacional da indústria ...",0,ptbr
2,Com a nova planta de produção a empresa aument...,2,ptbr
3,De acordo com a estratégia atualizada da empre...,2,ptbr
4,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...,2,ptbr
...,...,...,...
4821,LONDRES MarketWatch - Os preços das ações term...,0,ptbr
4822,"As vendas de cerveja da Rinkuskiai caíram 6,5 ...",1,ptbr
4823,"O lucro operacional caiu para EUR 35,4 milhões...",0,ptbr
4824,As vendas líquidas do segmento de Papel diminu...,0,ptbr


In [31]:
fpb_eng_df['lang'] = 'eng'
fpb_eng_df

Unnamed: 0,text,label,lang
0,"According to Gran , the company has no plans t...",1,eng
1,"For the last quarter of 2010 , Componenta 's n...",2,eng
2,"In the third quarter of 2010 , net sales incre...",2,eng
3,Operating profit rose to EUR 13.1 mn from EUR ...,2,eng
4,"Operating profit totalled EUR 21.1 mn , up fro...",2,eng
...,...,...,...
2259,Operating result for the 12-month period decre...,0,eng
2260,HELSINKI Thomson Financial - Shares in Cargote...,0,eng
2261,LONDON MarketWatch -- Share prices ended lower...,0,eng
2262,Operating profit fell to EUR 35.4 mn from EUR ...,0,eng


In [32]:
df = pd.concat([infomoney_df, fpb_pt_df, fpb_eng_df], axis=0)
df = df.reset_index(drop=True)
df

Unnamed: 0,text,label,lang
0,Secretário Fazenda diz medidas governo abrirão...,2,ptbr
1,"Trade hoje : acima 100 mil , Ibovespa segue te...",0,ptbr
2,"alta preços crédito escasso , aumenta busca ca...",1,ptbr
3,"“ menos agora aceno disciplina fiscal ” , diz ...",1,ptbr
4,IR 2023 : Posso ficar declarar bem vendido fal...,1,ptbr
...,...,...,...
8416,Operating result for the 12-month period decre...,0,eng
8417,HELSINKI Thomson Financial - Shares in Cargote...,0,eng
8418,LONDON MarketWatch -- Share prices ended lower...,0,eng
8419,Operating profit fell to EUR 35.4 mn from EUR ...,0,eng


In [33]:
len(df[df['label']==0]), len(df[df['label']==1]), len(df[df['label']==2]) # negative, neutral, positive

(1306, 4855, 2260)

In [34]:
train_dfs = []
val_dfs = []
test_dfs = []

train_temp_ratio = 0.7
val_test_ratio = 0.5

In [35]:
grouped = df.groupby(['lang', 'label'])

for _, group in grouped:
    # Divide the group into training and temporary sets (validation + test) using stratification
    train_group, temp_group = train_test_split(group, stratify=group["label"], train_size=train_temp_ratio, random_state=seed)
    
    # Divide the temporary set into validation and test sets using stratification
    val_group, test_group = train_test_split(temp_group, stratify=temp_group["label"], test_size=val_test_ratio, random_state=seed)
    
    train_dfs.append(train_group)
    val_dfs.append(val_group)
    test_dfs.append(test_group)

In [36]:
# Combine the train, validation, and test DataFrames
train_df = pd.concat(train_dfs)
val_df = pd.concat(val_dfs)
test_df = pd.concat(test_dfs)

In [37]:
# Train_df info
print(f'Train size: {len(train_df)}\n')
print(f"Train labels ratio:\n{train_df['label'].value_counts()/len(train_df)}\n")
print(f"Train languages ratio:\n{train_df['lang'].value_counts()/len(train_df)}")

Train size: 5893

Train labels ratio:
1    0.576447
2    0.268454
0    0.155099
Name: label, dtype: float64

Train languages ratio:
ptbr    0.731207
eng     0.268793
Name: lang, dtype: float64


In [38]:
# Val_df info
print(f'Validation size: {len(val_df)}\n')
print(f"Validation labels ratio:\n{val_df['label'].value_counts()/len(val_df)}\n")
print(f"Validation languages ratio:\n{val_df['lang'].value_counts()/len(val_df)}")

Validation size: 1262

Validation labels ratio:
1    0.577655
2    0.267829
0    0.154517
Name: label, dtype: float64

Validation languages ratio:
ptbr    0.731379
eng     0.268621
Name: lang, dtype: float64


In [39]:
# Test_df info
print(f'Test size: {len(test_df)}\n')
print(f"Test labels ratio:\n{test_df['label'].value_counts()/len(test_df)}\n")
print(f"Test languages ratio:\n{test_df['lang'].value_counts()/len(test_df)}")

Test size: 1266

Test labels ratio:
1    0.575829
2    0.268562
0    0.155608
Name: label, dtype: float64

Test languages ratio:
ptbr    0.730648
eng     0.269352
Name: lang, dtype: float64


In [40]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

## Treinamento: bert-base-multilingual-cased-02

In [41]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [42]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [43]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5893 [00:00<?, ? examples/s]

Map:   0%|          | 0/1262 [00:00<?, ? examples/s]

Map:   0%|          | 0/1266 [00:00<?, ? examples/s]

In [44]:
repo_name = 'bert-base-multilingual-cased-03'

training_args = TrainingArguments(
    output_dir=f'./trainings/{repo_name}/',
    seed=seed,
    auto_find_batch_size=True,
    num_train_epochs=12,
    learning_rate=5e-6, # default 5e-5
    weight_decay=0.01,
    eval_steps=100,
    logging_steps=100,
    save_steps=1000,
    save_strategy="steps",
    evaluation_strategy="steps",
    report_to="wandb",
    do_eval=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [45]:
trainer.train()
trainer.save_model(f'./trainings/{repo_name}/')

[34m[1mwandb[0m: Currently logged in as: [33mtiagosanti[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.938,0.833209,0.64897,0.564455,0.52993,0.64897
200,0.7507,0.730104,0.669572,0.618023,0.604795,0.669572
300,0.714,0.678589,0.683043,0.639187,0.652659,0.683043
400,0.71,0.649928,0.711569,0.700005,0.704332,0.711569
500,0.6157,0.610115,0.733756,0.738053,0.753871,0.733756
600,0.5917,0.5523,0.765452,0.752742,0.759556,0.765452
700,0.5522,0.528879,0.763867,0.771239,0.800593,0.763867
800,0.5025,0.523925,0.788431,0.787921,0.801206,0.788431
900,0.4432,0.505155,0.803487,0.80494,0.808148,0.803487
1000,0.4481,0.486752,0.805071,0.80492,0.80666,0.805071


In [46]:
wandb.finish()

0,1
eval/accuracy,▁▂▄▅▇▇▇▇▇▇▇▇███▇██▇████▇██▇▇███████▇████
eval/f1,▁▃▅▆▇▇▇▇▇▇████████████████▇█████████████
eval/loss,▅▃▂▂▁▁▁▁▂▁▂▃▃▃▃▄▃▄▅▄▅▅▅▇▆▆▇▇▆▆▇▇▇▇▇█████
eval/precision,▁▄▆▇▇▇▇██▇██████████████████████████████
eval/recall,▁▂▄▅▇▇▇▇▇▇▇▇███▇██▇████▇██▇▇███████▇████
eval/runtime,▃▂▁▁█▁▂▂▁▁▁▂▂▂▂▁▁▂▁▂▂▂▂▂▂▂▂▁▂▂▁▂▂▂▂▂▂▂▂▂
eval/samples_per_second,▆▇██▁▇▇▇███▇▇▇▇██▇█▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▇▇▇▇▇▇
eval/steps_per_second,▆▇██▁▇▇▇██▇▇▇▇▇██▇█▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▇▇▇▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.83756
eval/f1,0.83918
eval/loss,1.17221
eval/precision,0.8451
eval/recall,0.83756
eval/runtime,39.7399
eval/samples_per_second,31.756
eval/steps_per_second,3.976
train/epoch,12.0
train/global_step,8844.0


## Teste do modelo

In [47]:
model = BertForSequenceClassification.from_pretrained('trainings/bert-base-multilingual-cased-03')
tokenizer = BertTokenizer.from_pretrained('trainings/bert-base-multilingual-cased-03')

In [48]:
test_df = pd.DataFrame({
    'text': test_dataset['text'],
    'lang': test_dataset['lang'],
    'label': test_dataset['label']
})

model.to(device)
test_df['pred'] = test_df['text'].apply(predict_sentiment)
test_df

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Unnamed: 0,text,lang,label,pred
0,Revenue in the quarter fell 8 percent to ( EUR...,eng,0,0
1,Raute reported a loss per share of EUR0 .86 fo...,eng,0,0
2,Net sales decreased to EUR 220.5 mn from EUR 4...,eng,0,0
3,Finnish electronics contract manufacturer Scan...,eng,0,0
4,Insurer Axa ( PAR : CS ) slid by 5.35 % to EUR...,eng,0,0
...,...,...,...,...
1261,A finlandesa Ahlstrom Corporation dobrará seus...,ptbr,2,2
1262,Sponda Plc Stock Exchange Release 5 de dezembr...,ptbr,2,2
1263,"""Depois de um longo período não lucrativo, a D...",ptbr,2,2
1264,"A Svyturys-Utenos Alus, que é controlada pelo ...",ptbr,2,2


In [49]:
precision, recall, f1, _ = precision_recall_fscore_support(test_df['label'], test_df['pred'], average='weighted')
acc = accuracy_score(test_df['label'], test_df['pred'])

print({
    'accuracy': acc,
    'f1': f1,
    'precision': precision,
    'recall': recall
})

{'accuracy': 0.8412322274881516, 'f1': 0.8420010236754154, 'precision': 0.8439597596435783, 'recall': 0.8412322274881516}


In [50]:
test_df['diff'] = np.abs(test_df['label']-test_df['pred'])
test_df

Unnamed: 0,text,lang,label,pred,diff
0,Revenue in the quarter fell 8 percent to ( EUR...,eng,0,0,0
1,Raute reported a loss per share of EUR0 .86 fo...,eng,0,0,0
2,Net sales decreased to EUR 220.5 mn from EUR 4...,eng,0,0,0
3,Finnish electronics contract manufacturer Scan...,eng,0,0,0
4,Insurer Axa ( PAR : CS ) slid by 5.35 % to EUR...,eng,0,0,0
...,...,...,...,...,...
1261,A finlandesa Ahlstrom Corporation dobrará seus...,ptbr,2,2,0
1262,Sponda Plc Stock Exchange Release 5 de dezembr...,ptbr,2,2,0
1263,"""Depois de um longo período não lucrativo, a D...",ptbr,2,2,0
1264,"A Svyturys-Utenos Alus, que é controlada pelo ...",ptbr,2,2,0


In [51]:
test_length = test_df.shape[0]
agree_count = test_df[test_df["diff"]==0].shape[0]
partial_disagree_count = test_df[test_df["diff"]==1].shape[0]
disagree_count = test_df[test_df["diff"]==2].shape[0]

print(f'Concordância: {agree_count} ({agree_count/test_length*100:.2f}%)')
print(f'Discordância parcial: {partial_disagree_count} ({partial_disagree_count/test_length*100:.2f}%)')
print(f'Discordância: {disagree_count} ({disagree_count/test_length*100:.2f}%)')

Concordância: 1065 (84.12%)
Discordância parcial: 181 (14.30%)
Discordância: 20 (1.58%)


In [52]:
test_df[test_df['diff']==2]

Unnamed: 0,text,lang,label,pred,diff
26,The company said that sales in the three month...,eng,0,2,2
259,Finnish flexible packaging manufacturer Suomin...,eng,2,0,2
316,Return on equity stood at 18.3 % compared to 1...,eng,2,0,2
336,"Operating loss was EUR 179mn , compared to a l...",eng,2,0,2
385,"O lucro do período totalizou 0,8 milhões de eu...",ptbr,0,2,2
404,O valor justo das propriedades de investimento...,ptbr,0,2,2
424,Novo golpe usa ChatGPT isca promessa lucro ráp...,ptbr,0,2,2
427,"O retorno sobre o investimento ROI foi de 4,1%...",ptbr,0,2,2
443,"`` O ajuste para a queda no nível de preços, e...",ptbr,0,2,2
445,O justo valor das propriedades de investimento...,ptbr,0,2,2


In [53]:
test_df[test_df['diff']==1]

Unnamed: 0,text,lang,label,pred,diff
61,U.S.-based T Corp. is in talks with Scandinavi...,eng,1,2,1
93,`` The industry is coming to an interesting fo...,eng,1,2,1
133,3 January 2011 - Scandinavian lenders Sampo Ba...,eng,1,2,1
187,Den Bosch-based TomTom is Europe 's largest ma...,eng,1,2,1
234,The repo rate will gradually reach 2 % at the ...,eng,1,2,1
...,...,...,...,...,...
1236,"A instalação planejada, estimada em cerca de U...",ptbr,2,1,1
1237,Governo define reposição R $ 30 bi reforçar ca...,ptbr,2,1,1
1242,O corretor iniciou a Palfinger AG e a Konecran...,ptbr,2,1,1
1252,A femto-célula UMTS Home Base Station da Airva...,ptbr,2,1,1


In [54]:
test_df[test_df['diff']==0]

Unnamed: 0,text,lang,label,pred,diff
0,Revenue in the quarter fell 8 percent to ( EUR...,eng,0,0,0
1,Raute reported a loss per share of EUR0 .86 fo...,eng,0,0,0
2,Net sales decreased to EUR 220.5 mn from EUR 4...,eng,0,0,0
3,Finnish electronics contract manufacturer Scan...,eng,0,0,0
4,Insurer Axa ( PAR : CS ) slid by 5.35 % to EUR...,eng,0,0,0
...,...,...,...,...,...
1261,A finlandesa Ahlstrom Corporation dobrará seus...,ptbr,2,2,0
1262,Sponda Plc Stock Exchange Release 5 de dezembr...,ptbr,2,2,0
1263,"""Depois de um longo período não lucrativo, a D...",ptbr,2,2,0
1264,"A Svyturys-Utenos Alus, que é controlada pelo ...",ptbr,2,2,0


In [55]:
test_df[test_df['diff']==2].to_csv('disagree.csv')
disconnect()