In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from os import chdir

chdir('/content/drive/MyDrive/pantanal.dev/artificial-intelligence/')

# Financial Phrase Bank Portuguese Translation
link: https://www.kaggle.com/datasets/mateuspicanco/financial-phrase-bank-portuguese-translation

In [51]:
raw_fpb_ptbr_df = pd.read_csv('datasets/financial-phrase-bank/financial-phrase-bank-ptbr.csv')

In [52]:
# Visualizando as dimensões do df
raw_fpb_ptbr_df.shape

(4845, 3)

In [53]:
# Visualizando a quantidade de exemplos de cada classe
raw_fpb_ptbr_df.loc[:, 'y'].value_counts()

neutral     2878
positive    1363
negative     604
Name: y, dtype: int64

In [54]:
raw_fpb_ptbr_df

Unnamed: 0,y,text,text_pt
0,neutral,Technopolis plans to develop in stages an area...,A Technopolis planeja desenvolver em etapas um...
1,negative,The international electronic industry company ...,"A Elcoteq, empresa internacional da indústria ..."
2,positive,With the new production plant the company woul...,Com a nova planta de produção a empresa aument...
3,positive,According to the company 's updated strategy f...,De acordo com a estratégia atualizada da empre...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...
...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,LONDRES MarketWatch - Os preços das ações term...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,"As vendas de cerveja da Rinkuskiai caíram 6,5 ..."
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,"O lucro operacional caiu para EUR 35,4 milhões..."
4843,negative,Net sales of the Paper segment decreased to EU...,As vendas líquidas do segmento de Papel diminu...


In [55]:
# Copiando df para evitar novos carregamentos após modificações
fpb_ptbr_df = raw_fpb_ptbr_df.copy()

In [56]:
# Invertendo a ordem das colunas para visualização 'exemplo: classe'
fpb_ptbr_df = fpb_ptbr_df[['text_pt', 'y']]
fpb_ptbr_df.columns = ['text', 'label']
fpb_ptbr_df

Unnamed: 0,text,label
0,A Technopolis planeja desenvolver em etapas um...,neutral
1,"A Elcoteq, empresa internacional da indústria ...",negative
2,Com a nova planta de produção a empresa aument...,positive
3,De acordo com a estratégia atualizada da empre...,positive
4,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...,positive
...,...,...
4840,LONDRES MarketWatch - Os preços das ações term...,negative
4841,"As vendas de cerveja da Rinkuskiai caíram 6,5 ...",neutral
4842,"O lucro operacional caiu para EUR 35,4 milhões...",negative
4843,As vendas líquidas do segmento de Papel diminu...,negative


In [57]:
def encode_classes(class_name):
    if class_name == 'positive':
        return 2
    if class_name == 'neutral':
        return 1
    if class_name == 'negative':
        return 0

In [58]:
fpb_ptbr_df['label'] = fpb_ptbr_df['label'].apply(encode_classes)
fpb_ptbr_df

Unnamed: 0,text,label
0,A Technopolis planeja desenvolver em etapas um...,1
1,"A Elcoteq, empresa internacional da indústria ...",0
2,Com a nova planta de produção a empresa aument...,2
3,De acordo com a estratégia atualizada da empre...,2
4,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...,2
...,...,...
4840,LONDRES MarketWatch - Os preços das ações term...,0
4841,"As vendas de cerveja da Rinkuskiai caíram 6,5 ...",1
4842,"O lucro operacional caiu para EUR 35,4 milhões...",0
4843,As vendas líquidas do segmento de Papel diminu...,0


In [59]:
fpb_ptbr_df = fpb_ptbr_df.drop_duplicates(subset='text')
fpb_ptbr_df

Unnamed: 0,text,label
0,A Technopolis planeja desenvolver em etapas um...,1
1,"A Elcoteq, empresa internacional da indústria ...",0
2,Com a nova planta de produção a empresa aument...,2
3,De acordo com a estratégia atualizada da empre...,2
4,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...,2
...,...,...
4840,LONDRES MarketWatch - Os preços das ações term...,0
4841,"As vendas de cerveja da Rinkuskiai caíram 6,5 ...",1
4842,"O lucro operacional caiu para EUR 35,4 milhões...",0
4843,As vendas líquidas do segmento de Papel diminu...,0


In [60]:
fpb_ptbr_df.isna().sum()

text     0
label    0
dtype: int64

In [61]:
fpb_ptbr_df.drop_duplicates(subset='text', inplace=True)
fpb_ptbr_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fpb_ptbr_df.drop_duplicates(subset='text', inplace=True)


Unnamed: 0,text,label
0,A Technopolis planeja desenvolver em etapas um...,1
1,"A Elcoteq, empresa internacional da indústria ...",0
2,Com a nova planta de produção a empresa aument...,2
3,De acordo com a estratégia atualizada da empre...,2
4,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...,2
...,...,...
4840,LONDRES MarketWatch - Os preços das ações term...,0
4841,"As vendas de cerveja da Rinkuskiai caíram 6,5 ...",1
4842,"O lucro operacional caiu para EUR 35,4 milhões...",0
4843,As vendas líquidas do segmento de Papel diminu...,0


In [62]:
fpb_ptbr_df['label'].value_counts()

1    2860
2    1362
0     604
Name: label, dtype: int64

In [63]:
fpb_ptbr_df.to_csv('datasets/financial-phrase-bank/financial-phrase-bank-ptbr-labelled.csv', sep='|')

# Financial Phrase Bank English

Malo, Pekka & Sinha, Ankur & Takala, Pyry & Korhonen, Pekka & Wallenius, Jyrki. (2013). FinancialPhraseBank-v1.0.

https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10

In [16]:
with open('datasets/financial-phrase-bank/financial-phrase-bank-eng.txt', 'r', encoding='latin-1') as file:
    lines = file.readlines()

In [17]:
lines

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .@neutral\n',
 "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .@positive\n",
 'In the third quarter of 2010 , net sales increased by 5.2 % to EUR 205.5 mn , and operating profit by 34.9 % to EUR 23.5 mn .@positive\n',
 'Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales .@positive\n',
 'Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales .@positive\n',
 'Finnish Talentum reports its operating profit increased to EUR 20.5 mn in 2005 from EUR 9.3 mn in 2004 , and net sales totaled EUR 103.3 mn , up from EUR 96.4 mn .@positive\n',
 "Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2

In [23]:
data = []
for line in lines:
    text, label = line.split('@')
    label = label.replace('\n', '')
    data.append({'text': text, 'label': label})
fpb_eng_df = pd.DataFrame(data)

In [24]:
fpb_eng_df

Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",neutral
1,"For the last quarter of 2010 , Componenta 's n...",positive
2,"In the third quarter of 2010 , net sales incre...",positive
3,Operating profit rose to EUR 13.1 mn from EUR ...,positive
4,"Operating profit totalled EUR 21.1 mn , up fro...",positive
...,...,...
2259,Operating result for the 12-month period decre...,negative
2260,HELSINKI Thomson Financial - Shares in Cargote...,negative
2261,LONDON MarketWatch -- Share prices ended lower...,negative
2262,Operating profit fell to EUR 35.4 mn from EUR ...,negative


In [25]:
label_encode = {
    'positive': 2,
    'neutral': 1,
    'negative': 0
}

fpb_eng_df['label'] = fpb_eng_df['label'].apply(lambda label: label_encode[label])
fpb_eng_df

Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",1
1,"For the last quarter of 2010 , Componenta 's n...",2
2,"In the third quarter of 2010 , net sales incre...",2
3,Operating profit rose to EUR 13.1 mn from EUR ...,2
4,"Operating profit totalled EUR 21.1 mn , up fro...",2
...,...,...
2259,Operating result for the 12-month period decre...,0
2260,HELSINKI Thomson Financial - Shares in Cargote...,0
2261,LONDON MarketWatch -- Share prices ended lower...,0
2262,Operating profit fell to EUR 35.4 mn from EUR ...,0


In [27]:
fpb_eng_df.drop_duplicates(subset='text', ignore_index=True, inplace=True)
fpb_eng_df

Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",1
1,"For the last quarter of 2010 , Componenta 's n...",2
2,"In the third quarter of 2010 , net sales incre...",2
3,Operating profit rose to EUR 13.1 mn from EUR ...,2
4,"Operating profit totalled EUR 21.1 mn , up fro...",2
...,...,...
2254,Operating result for the 12-month period decre...,0
2255,HELSINKI Thomson Financial - Shares in Cargote...,0
2256,LONDON MarketWatch -- Share prices ended lower...,0
2257,Operating profit fell to EUR 35.4 mn from EUR ...,0


In [29]:
fpb_eng_df.isna().sum()

text     0
label    0
dtype: int64

In [30]:
fpb_eng_df['label'].value_counts()

1    1386
2     570
0     303
Name: label, dtype: int64

In [44]:
fpb_eng_df.to_csv('datasets/financial-phrase-bank/financial-phrase-bank-eng-labelled.csv', sep='|')

# Concatenar datasets

In [70]:
fpb_ptbr_df = pd.read_csv('datasets/financial-phrase-bank/financial-phrase-bank-ptbr-labelled.csv', sep='|', index_col=0)
fpb_ptbr_df['lang'] = 'ptbr'

fpb_eng_df = pd.read_csv('datasets/financial-phrase-bank/financial-phrase-bank-eng-labelled.csv', sep='|', index_col=0)
fpb_eng_df['lang'] = 'eng'

In [71]:
fpb_ptbr_df

Unnamed: 0,text,label,lang
0,A Technopolis planeja desenvolver em etapas um...,1,ptbr
1,"A Elcoteq, empresa internacional da indústria ...",0,ptbr
2,Com a nova planta de produção a empresa aument...,2,ptbr
3,De acordo com a estratégia atualizada da empre...,2,ptbr
4,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...,2,ptbr
...,...,...,...
4840,LONDRES MarketWatch - Os preços das ações term...,0,ptbr
4841,"As vendas de cerveja da Rinkuskiai caíram 6,5 ...",1,ptbr
4842,"O lucro operacional caiu para EUR 35,4 milhões...",0,ptbr
4843,As vendas líquidas do segmento de Papel diminu...,0,ptbr


In [72]:
fpb_eng_df

Unnamed: 0,text,label,lang
0,"According to Gran , the company has no plans t...",1,eng
1,"For the last quarter of 2010 , Componenta 's n...",2,eng
2,"In the third quarter of 2010 , net sales incre...",2,eng
3,Operating profit rose to EUR 13.1 mn from EUR ...,2,eng
4,"Operating profit totalled EUR 21.1 mn , up fro...",2,eng
...,...,...,...
2254,Operating result for the 12-month period decre...,0,eng
2255,HELSINKI Thomson Financial - Shares in Cargote...,0,eng
2256,LONDON MarketWatch -- Share prices ended lower...,0,eng
2257,Operating profit fell to EUR 35.4 mn from EUR ...,0,eng


In [73]:
fpb_df = pd.concat([fpb_ptbr_df, fpb_eng_df], ignore_index=True)
fpb_df

Unnamed: 0,text,label,lang
0,A Technopolis planeja desenvolver em etapas um...,1,ptbr
1,"A Elcoteq, empresa internacional da indústria ...",0,ptbr
2,Com a nova planta de produção a empresa aument...,2,ptbr
3,De acordo com a estratégia atualizada da empre...,2,ptbr
4,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...,2,ptbr
...,...,...,...
7080,Operating result for the 12-month period decre...,0,eng
7081,HELSINKI Thomson Financial - Shares in Cargote...,0,eng
7082,LONDON MarketWatch -- Share prices ended lower...,0,eng
7083,Operating profit fell to EUR 35.4 mn from EUR ...,0,eng


In [74]:
fpb_df['font'] = 'financial-phrase-bank'
fpb_df

Unnamed: 0,text,label,lang,font
0,A Technopolis planeja desenvolver em etapas um...,1,ptbr,financial-phrase-bank
1,"A Elcoteq, empresa internacional da indústria ...",0,ptbr,financial-phrase-bank
2,Com a nova planta de produção a empresa aument...,2,ptbr,financial-phrase-bank
3,De acordo com a estratégia atualizada da empre...,2,ptbr,financial-phrase-bank
4,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...,2,ptbr,financial-phrase-bank
...,...,...,...,...
7080,Operating result for the 12-month period decre...,0,eng,financial-phrase-bank
7081,HELSINKI Thomson Financial - Shares in Cargote...,0,eng,financial-phrase-bank
7082,LONDON MarketWatch -- Share prices ended lower...,0,eng,financial-phrase-bank
7083,Operating profit fell to EUR 35.4 mn from EUR ...,0,eng,financial-phrase-bank


In [75]:
fpb_df['label'].value_counts()

1    4246
2    1932
0     907
Name: label, dtype: int64

In [78]:
fpb_df.to_csv('datasets/financial-phrase-bank/financial-phrase-bank.csv', sep='|')