In [1]:
import pickle
from pathlib import Path
from pprint import pprint
from typing import List

import pandas as pd
import yfinance as yf

In [5]:
import yfinance as yf
import pandas as pd

def get_correlations(ticker: str) -> pd.DataFrame:
    stock = yf.Ticker(ticker)

    # Coletando dados financeiros e históricos de dividendos
    dividends = stock.dividends
    financials = stock.financials.transpose()
    balance_sheet = stock.balance_sheet.transpose()
    cash_flow = stock.cashflow.transpose()

    # Agrupar dividendos por ano
    dividends = dividends.to_frame().reset_index()
    grouped_dividends = dividends.groupby(pd.Grouper(key='Date', freq='YE')).sum()
    grouped_dividends.index = grouped_dividends.index.tz_localize(None)

    # Consolidando todos os dados em um único DataFrame
    data_frames = [financials, balance_sheet, cash_flow]
    df = pd.concat(data_frames, axis=1)

    # Converter colunas para tipos numéricos, ignorando erros
    df = df.apply(pd.to_numeric, errors='coerce')

    # Tratar dados ausentes com interpolação
    df.interpolate(method='linear', limit_direction='both', inplace=True)
    df = df.infer_objects(copy=False)

    # Juntar "grouped_dividends" com "df" apenas onde "df" possui a data (index)
    df = df.merge(grouped_dividends, left_index=True, right_index=True)

    # Calculando a correlação
    correlation_matrix = df.corr()
    
    if 'Dividends' not in correlation_matrix:
        return pd.DataFrame()
    
    dividend_corr = correlation_matrix['Dividends'].dropna()
    dividend_corr.name = ticker
    return dividend_corr.to_frame().transpose()


In [15]:
def get_strong_features_for_dividends(
    tickers: List[str], min_correlation: float = 0.7
) -> pd.Series:
    """Find statement items whose correlation with dividends is strong across tickers.

    Each ticker's correlations are obtained from ``get_correlations`` and
    stacked one row per ticker. Items a ticker does not report are counted as a
    correlation of 0, and the per-item median across tickers is then filtered.

    Args:
        tickers: Symbols to analyse.
        min_correlation: Inclusive lower bound on the median correlation for an
            item to be kept. Defaults to 0.7.

    Returns:
        A Series indexed by item name holding the median correlation of every
        item with ``min_correlation <= median < 1.0``. The strict upper bound
        leaves out ``Dividends`` itself, whose self-correlation is 1.0. The
        Series is empty when no ticker yields any correlation row.
    """
    # Collect the per-ticker rows first and concatenate once, instead of
    # growing a DataFrame inside the loop. Frames without rows add nothing to
    # the result, so they are skipped.
    per_ticker = []
    for ticker in tickers:
        corr = get_correlations(ticker)
        if corr is not None and len(corr) > 0:
            per_ticker.append(corr)

    if not per_ticker:
        return pd.Series(dtype='float64')

    # Items missing for a ticker are treated as zero correlation.
    all_correlations = pd.concat(per_ticker, axis=0).fillna(0)

    # Median (not mean) across tickers, per item.
    median_correlation = all_correlations.median().dropna()

    # Keep only the strongly, but not perfectly, correlated items.
    is_strong = (median_correlation >= min_correlation) & (median_correlation < 1.0)
    return median_correlation[is_strong]

# Testing

In [7]:
# Smoke test on a single ticker: shows the one-row frame of correlations
# between its yearly dividends and its financial statement items.
get_correlations("VALE3.SA")

Unnamed: 0,Tax Effect Of Unusual Items,Tax Rate For Calcs,Normalized EBITDA,Total Unusual Items,Total Unusual Items Excluding Goodwill,Net Income From Continuing Operation Net Minority Interest,Reconciled Depreciation,Reconciled Cost Of Revenue,EBITDA,EBIT,...,Other Non Cash Items,Provisionand Write Offof Assets,Asset Impairment Charge,Depreciation Amortization Depletion,Operating Gains Losses,Earnings Losses From Equity Investments,Gain Loss On Investment Securities,Net Foreign Currency Exchange Gain Loss,Net Income From Continuing Operations,Dividends
VALE3.SA,0.87849,0.175742,0.878895,0.833997,0.833997,0.980163,-0.768362,0.507079,0.99093,0.991605,...,-0.946373,-0.524193,-0.484707,-0.768362,-0.574595,0.003931,-0.421675,-0.499534,0.989207,1.0


In [16]:
# Small sample of tickers used to try out the cross-ticker aggregation.
tickers = ['VAMO3.SA', 'VIVT3.SA', 'VULC3.SA', 'YDUQ3.SA', 'ZAMP3.SA']

# Last expression of the cell, so the returned correlations are displayed.
get_strong_features_for_dividends(tickers)

EBIT                                                0.765349
Pretax Income                                       0.756286
Working Capital                                     0.716234
Common Stock Equity                                 0.775856
Total Equity Gross Minority Interest                0.767623
Stockholders Equity                                 0.775856
Tradeand Other Payables Non Current                 0.787373
Cash Cash Equivalents And Short Term Investments    0.756647
Cash And Cash Equivalents                           0.901718
Cash Equivalents                                    0.866884
Issuance Of Capital Stock                           0.712075
End Cash Position                                   0.901718
Changes In Cash                                     0.852452
Common Stock Issuance                               0.712075
Investing Cash Flow                                 0.729320
Net Income From Continuing Operations               0.756286
Rent Expense Supplementa

In [18]:
# Load the pickled ticker clusters from disk.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with Path("data/dividend_clusters.pkl").open(mode="rb") as file:
    dividend_clusters = pickle.load(file)

# Printing does not need the file handle, so it happens after it is closed.
pprint(dividend_clusters)

{0: ['VAMO3.SA', 'VIVT3.SA', 'VULC3.SA', 'YDUQ3.SA', 'ZAMP3.SA'],
 1: ['VBBR3.SA', 'VIVA3.SA', 'VLID3.SA', 'WEGE3.SA', 'WIZC3.SA']}


In [19]:
# Build one record per cluster: its tickers and the statement items that are
# strongly correlated with dividends across those tickers.
# A comprehension keeps the loop variable local, so the notebook-level
# `tickers` list defined in an earlier cell is not overwritten by this cell.
clusters = [
    {
        "tickers": cluster_tickers,
        "dividend_features_corr": get_strong_features_for_dividends(cluster_tickers),
    }
    for cluster_tickers in dividend_clusters.values()
]

clusters

[{'tickers': ['VAMO3.SA', 'VIVT3.SA', 'VULC3.SA', 'YDUQ3.SA', 'ZAMP3.SA'],
  'dividend_features_corr': EBIT                                                0.765349
  Pretax Income                                       0.756286
  Working Capital                                     0.716234
  Common Stock Equity                                 0.775856
  Total Equity Gross Minority Interest                0.767623
  Stockholders Equity                                 0.775856
  Tradeand Other Payables Non Current                 0.787373
  Cash Cash Equivalents And Short Term Investments    0.756647
  Cash And Cash Equivalents                           0.901718
  Cash Equivalents                                    0.866884
  Issuance Of Capital Stock                           0.712075
  End Cash Position                                   0.901718
  Changes In Cash                                     0.852452
  Common Stock Issuance                               0.712075
  Investing Cash 

In [20]:
# Persist the per-cluster results, creating the target folder when missing.
path = Path("data/dividend_features.pkl")
path.parent.mkdir(exist_ok=True)

with path.open(mode="wb") as file:
    pickle.dump(clusters, file)