In [1]:
!pip install yfinance


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# POC - Descobrir os componentes financeiros com forte correlação com "Dividend"

In [39]:
import pandas as pd
import yfinance as yf

# Escolha do ativo, por exemplo, 'PETR4.SA' para Petrobras
ticker = 'PETR4.SA'

# Download dos dados do ativo
stock = yf.Ticker(ticker)

# Coletando dados financeiros e históricos de dividendos
dividends = stock.dividends
financials = stock.financials.transpose()
balance_sheet = stock.balance_sheet.transpose()
cash_flow = stock.cashflow.transpose()

# Agrupar dividendos por ano
dividends = dividends.to_frame().reset_index()
grouped_dividends = dividends.groupby(pd.Grouper(key='Date', freq='Y')).sum()
grouped_dividends.index = grouped_dividends.index.tz_localize(None)

# Consolidando todos os dados em um único DataFrame
data_frames = [financials, balance_sheet, cash_flow]
df = pd.concat(data_frames, axis=1)

# Tratar dados ausentes
df.fillna(inplace=True, value=0)

# Juntar "grouped_dividends" com "df" apenas onde "df" possui a data (index)
df = df.merge(grouped_dividends, left_index=True, right_index=True)

df

Unnamed: 0,Tax Effect Of Unusual Items,Tax Rate For Calcs,Normalized EBITDA,Total Unusual Items,Total Unusual Items Excluding Goodwill,Net Income From Continuing Operation Net Minority Interest,Reconciled Depreciation,Reconciled Cost Of Revenue,EBITDA,EBIT,...,Depreciation And Amortization,Amortization Cash Flow,Amortization Of Intangibles,Depreciation,Operating Gains Losses,Pension And Employee Benefit Expense,Earnings Losses From Equity Investments,Net Foreign Currency Exchange Gain Loss,Net Income From Continuing Operations,Dividends
2023-12-31,-273420000.0,0.294,53226000000.0,-930000000.0,-930000000.0,24884000000.0,13280000000.0,35887000000.0,52296000000.0,39016000000.0,...,13280000000.0,104000000.0,104000000.0,13176000000.0,3049000000.0,1542000000.0,304000000.0,2498000000.0,24995000000.0,7.344291
2022-12-31,630382000.0,0.313,68040000000.0,2014000000.0,2014000000.0,36623000000.0,13218000000.0,47162000000.0,70054000000.0,56836000000.0,...,13218000000.0,77000000.0,77000000.0,13141000000.0,4390000000.0,1228000000.0,-251000000.0,4557000000.0,36755000000.0,15.102379
2021-12-31,-397281800.0,0.291904,46258000000.0,-1361000000.0,-1361000000.0,19875000000.0,11695000000.0,32165000000.0,44897000000.0,33202000000.0,...,11695000000.0,60000000.0,60000000.0,11635000000.0,9386000000.0,2098000000.0,-1607000000.0,10795000000.0,19986000000.0,5.653232
2020-12-31,-4959920000.0,0.34,31578000000.0,-14588000000.0,-14588000000.0,1141000000.0,11445000000.0,18403000000.0,16990000000.0,5545000000.0,...,11445000000.0,66000000.0,66000000.0,11379000000.0,10296000000.0,-1001000000.0,659000000.0,11094000000.0,948000000.0,0.000461
2019-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.942361


In [40]:
correlation_matrix = df.corr()
correlation_matrix

Unnamed: 0,Tax Effect Of Unusual Items,Tax Rate For Calcs,Normalized EBITDA,Total Unusual Items,Total Unusual Items Excluding Goodwill,Net Income From Continuing Operation Net Minority Interest,Reconciled Depreciation,Reconciled Cost Of Revenue,EBITDA,EBIT,...,Depreciation And Amortization,Amortization Cash Flow,Amortization Of Intangibles,Depreciation,Operating Gains Losses,Pension And Employee Benefit Expense,Earnings Losses From Equity Investments,Net Foreign Currency Exchange Gain Loss,Net Income From Continuing Operations,Dividends
Tax Effect Of Unusual Items,1.000000,-0.354871,0.225622,0.999791,0.999791,0.600973,-0.140021,0.303434,0.443105,0.556589,...,-0.140021,-0.069970,-0.069970,-0.140445,-0.658347,0.739806,-0.482524,-0.629457,0.604701,0.630905
Tax Rate For Calcs,-0.354871,1.000000,0.819777,-0.355527,-0.355527,0.512450,0.970389,0.774109,0.670132,0.562721,...,0.970389,0.867582,0.867582,0.970699,0.736551,0.215986,-0.025653,0.686810,0.508995,0.396367
Normalized EBITDA,0.225622,0.819777,1.000000,0.227692,0.227692,0.911382,0.916825,0.996487,0.973316,0.933583,...,0.916825,0.862390,0.862390,0.916825,0.312328,0.594271,-0.209429,0.272426,0.909650,0.837803
Total Unusual Items,0.999791,-0.355527,0.227692,1.000000,1.000000,0.603936,-0.141453,0.305323,0.445057,0.559234,...,-0.141453,-0.072455,-0.072455,-0.141869,-0.663087,0.730595,-0.473175,-0.634427,0.607612,0.638472
Total Unusual Items Excluding Goodwill,0.999791,-0.355527,0.227692,1.000000,1.000000,0.603936,-0.141453,0.305323,0.445057,0.559234,...,-0.141453,-0.072455,-0.072455,-0.141869,-0.663087,0.730595,-0.473175,-0.634427,0.607612,0.638472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pension And Employee Benefit Expense,0.739806,0.215986,0.594271,0.730595,0.730595,0.758965,0.404556,0.648078,0.718697,0.753663,...,0.404556,0.402678,0.402678,0.404404,-0.068636,1.000000,-0.729734,-0.055829,0.761672,0.635046
Earnings Losses From Equity Investments,-0.482524,-0.025653,-0.209429,-0.473175,-0.473175,-0.329633,-0.084528,-0.263357,-0.304111,-0.339361,...,-0.084528,0.107839,0.107839,-0.085813,-0.239860,-0.729734,1.000000,-0.303357,-0.332359,-0.259447
Net Foreign Currency Exchange Gain Loss,-0.629457,0.686810,0.272426,-0.634427,-0.634427,-0.068326,0.530510,0.228507,0.101031,-0.005854,...,0.530510,0.280325,0.280325,0.532010,0.995953,-0.055829,-0.303357,1.000000,-0.071004,-0.165565
Net Income From Continuing Operations,0.604701,0.508995,0.909650,0.607612,0.607612,0.999987,0.677263,0.938834,0.979756,0.997739,...,0.677263,0.664582,0.664582,0.677075,-0.049193,0.761672,-0.332359,-0.071004,1.000000,0.967158


In [48]:
# Mostrar os atributos com correlação significativa com "Dividend"
dividend_corr = correlation_matrix['Dividends'].dropna()
dividend_features = dividend_corr[(abs(dividend_corr) >= 0.7) & (abs(dividend_corr) < 1.0)]
dividend_features

Normalized EBITDA                                             0.837803
Net Income From Continuing Operation Net Minority Interest    0.967344
Reconciled Cost Of Revenue                                    0.870373
EBITDA                                                        0.920954
EBIT                                                          0.957789
Normalized Income                                             0.916749
Net Income From Continuing And Discontinued Operation         0.967344
Total Expenses                                                0.808004
Diluted EPS                                                   0.967533
Basic EPS                                                     0.967533
Diluted NI Availto Com Stockholders                           0.967344
Net Income Common Stockholders                                0.967344
Net Income                                                    0.967344
Minority Interests                                           -0.759764
Net In

# Algoritmo para descobrir os componentes financeiros com maior correlação com "Dividend"

In [53]:
import yfinance as yf
import pandas as pd

# Lista de ativos
ativos = {
    'VAMO3': 'VAMO3.SA',
    'VBBR3': 'VBBR3.SA',
    'VIVA3': 'VIVA3.SA',
    'VIVT3': 'VIVT3.SA',
    'VLID3': 'VLID3.SA',
    'VULC3': 'VULC3.SA',
    'WEGE3': 'WEGE3.SA',
    'WIZC3': 'WIZC3.SA',
    'YDUQ3': 'YDUQ3.SA',
    'ZAMP3': 'ZAMP3.SA'
}

# Dicionário para armazenar correlações
correlations = []

# Função para coletar dados e calcular correlação
def get_correlations(ticker):
    stock = yf.Ticker(ticker)

    # Coletando dados financeiros e históricos de dividendos
    dividends = stock.dividends
    financials = stock.financials.transpose()
    balance_sheet = stock.balance_sheet.transpose()
    cash_flow = stock.cashflow.transpose()

    # Agrupar dividendos por ano
    dividends = dividends.to_frame().reset_index()
    grouped_dividends = dividends.groupby(pd.Grouper(key='Date', freq='Y')).sum()
    grouped_dividends.index = grouped_dividends.index.tz_localize(None)

    # Consolidando todos os dados em um único DataFrame
    data_frames = [financials, balance_sheet, cash_flow]
    df = pd.concat(data_frames, axis=1)

    # Tratar dados ausentes
    df.fillna(inplace=True, value=0)

    # Juntar "grouped_dividends" com "df" apenas onde "df" possui a data (index)
    df = df.merge(grouped_dividends, left_index=True, right_index=True)

    # Calculando a correlação
    correlation_matrix = df.corr()
    
    # Retornar apenas a coluna 'Dividends' da correlação
    if 'Dividends' in correlation_matrix:
        dividend_corr = correlation_matrix['Dividends'].dropna()
        return dividend_corr
    else:
        return None

# Coletar correlações para todos os ativos
for nome, ticker in ativos.items():
    corr = get_correlations(ticker)
    if corr is not None:
        correlations.append(corr)

# Convertendo a lista de correlações em um DataFrame
correlation_df = pd.DataFrame(correlations)

# Calculando a média das correlações
mean_correlation = correlation_df.median().dropna()

# Filtrando os componentes financeiros com forte correlação
strong_correlations = mean_correlation[(abs(mean_correlation) >= 0.7) & (abs(mean_correlation) < 1.0)]

In [54]:
strong_correlations

Finished Goods                                       0.839278
Depreciation Income Statement                       -0.703512
Pension And Employee Benefit Expense                 0.785728
Line Of Credit                                      -0.762415
Amortization Cash Flow                              -0.736729
Investments In Other Ventures Under Equity Method    0.742601
Sale Of Intangibles                                  0.894391
dtype: float64

In [64]:
import pickle

with open("data/dividend_features.pkl", "wb") as file:
    pickle.dump(strong_correlations.index.to_list(), file)