In [2]:
!pip install yfinance pandas numpy --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [14]:
import pickle
from dataclasses import dataclass
from pathlib import Path
from pprint import pprint
from typing import Dict, List, Optional

import pandas as pd
import yfinance as yf

In [15]:
def get_correlations(ticker: str) -> Optional[pd.Series]:
    """Correlate a ticker's yearly dividends with its financial-statement items.

    Dividends are summed into year-end bins and inner-joined, on the date
    index, with the income statement, balance sheet and cash-flow statement
    reported by yfinance. Only statement dates that coincide with a year-end
    bin therefore take part in the correlation.

    Args:
        ticker: Symbol passed to ``yf.Ticker`` (e.g. ``"WEGE3.SA"``).

    Returns:
        The ``'Dividends'`` column of the correlation matrix with NaN entries
        dropped (it still contains the ``'Dividends'`` self-correlation), or
        ``None`` when the ticker has no dividend history, no statement data,
        or no ``'Dividends'`` column ends up in the correlation matrix.
    """
    stock = yf.Ticker(ticker)

    dividends = stock.dividends
    # Without any payout there is nothing to correlate, and the yearly grouping
    # below relies on a 'Date' column that an empty series does not provide.
    if dividends is None or dividends.empty:
        return None

    # Transpose each statement so its dates end up on the index used by the
    # merge further down, then put all line items side by side.
    statements = [
        stock.financials.transpose(),
        stock.balance_sheet.transpose(),
        stock.cashflow.transpose(),
    ]
    df = pd.concat(statements, axis=1)
    if df.empty:
        return None

    # Keep the first occurrence of a label that appears in more than one
    # statement so the returned series has a unique index.
    df = df.loc[:, ~df.columns.duplicated()]

    # Coerce everything to numbers (unparseable values become NaN) and fill the
    # gaps by linear interpolation in both directions.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.interpolate(method='linear', limit_direction='both')

    # Sum dividends per year-end bin. An offset object is used instead of the
    # 'YE' string alias so the grouping does not depend on the alias spelling.
    yearly_dividends = (
        dividends.to_frame()
        .reset_index()
        .groupby(pd.Grouper(key='Date', freq=pd.offsets.YearEnd()))
        .sum()
    )
    # Drop the timezone so the bins can be matched against the statement index.
    yearly_dividends.index = yearly_dividends.index.tz_localize(None)

    # Inner join: keep only the dates present in both the statements and the
    # yearly dividend totals.
    merged = df.merge(yearly_dividends, left_index=True, right_index=True)

    correlation_matrix = merged.corr()
    if 'Dividends' not in correlation_matrix:
        return None

    # Only the correlations against dividends are of interest.
    return correlation_matrix['Dividends'].dropna()

In [21]:
def get_strong_features_for_dividends(tickers: List[str], threshold: float = 0.7) -> pd.Series:
    """Average the dividend correlations of several tickers and keep the strong ones.

    Args:
        tickers: Symbols handed one by one to ``get_correlations``; tickers for
            which it returns ``None`` are skipped.
        threshold: Minimum absolute mean correlation a feature needs in order
            to be kept. Defaults to ``0.7``.

    Returns:
        Mean correlation per feature, restricted to features whose absolute
        mean is at least ``threshold`` and below ``1.0``. The ``'Dividends'``
        self-correlation is never included. An empty series is returned when
        no ticker produced any correlation data.
    """
    # Collect the per-ticker correlations, ignoring tickers without data.
    per_ticker = (get_correlations(ticker) for ticker in tickers)
    correlations = [corr for corr in per_ticker if corr is not None]
    if not correlations:
        return pd.Series(dtype='float64')

    # One row per ticker, one column per feature; average column-wise.
    mean_correlation = pd.DataFrame(correlations).mean().dropna()

    # Remove the self-correlation by label: relying on "< 1.0" alone lets it
    # through whenever floating-point rounding leaves it slightly below 1.0.
    mean_correlation = mean_correlation.drop('Dividends', errors='ignore')

    strength = mean_correlation.abs()
    return mean_correlation[(strength >= threshold) & (strength < 1.0)]

In [22]:
@dataclass
class Cluster:
    """A group of tickers paired with a series of dividend correlations."""

    # Ticker symbols that make up the cluster.
    tickers: List[str]
    # Correlation values against dividends, one entry per feature
    # (NOTE(review): presumably the output of
    # get_strong_features_for_dividends for `tickers` — confirm with callers).
    dividend_features_corr: pd.Series

# Testing

In [23]:
# Load the previously saved mapping of cluster id -> list of tickers.
# NOTE(review): pickle.load can execute arbitrary code, so this file should
# only ever be one produced by a trusted source — confirm its origin.
with Path("data/dividend_clusters.pkl").open(mode="rb") as file:
    dividend_clusters = pickle.load(file)

# Show what was loaded once the file handle has been released.
pprint(dividend_clusters)

{0: ['VAMO3.SA', 'VIVT3.SA', 'VULC3.SA', 'YDUQ3.SA', 'ZAMP3.SA'],
 1: ['VBBR3.SA', 'VIVA3.SA', 'VLID3.SA', 'WEGE3.SA', 'WIZC3.SA']}


In [24]:
# Build one Cluster per group of tickers, attaching the features whose mean
# correlation with dividends is strong for that group.
clusters = [
    Cluster(tickers, get_strong_features_for_dividends(tickers))
    for tickers in dividend_clusters.values()
]

clusters

[Cluster(tickers=['VAMO3.SA', 'VIVT3.SA', 'VULC3.SA', 'YDUQ3.SA', 'ZAMP3.SA'], dividend_features_corr=Goodwill                                         -0.779052
 Cash And Cash Equivalents                         0.844873
 Cash Equivalents                                  0.842838
 End Cash Position                                 0.844873
 Gains Losses Not Affecting Retained Earnings      0.780313
 Other Equity Adjustments                          0.780313
 Non Current Deferred Assets                      -0.780946
 Restricted Cash                                  -0.813400
 Cash Flow From Continuing Investing Activities    0.729320
 Net Other Investing Changes                       0.771628
 Operating Gains Losses                           -0.709049
 Pension And Employee Benefit Expense              0.841420
 Net Foreign Currency Exchange Gain Loss          -0.767022
 Restructuring And Mergern Acquisition             0.773324
 Retained Earnings                                 0.845011

In [25]:
# Persist the computed clusters next to the input data.
path = Path("data/dividend_features.pkl")

# Make sure the target directory is there; an existing one is left untouched.
path.parent.mkdir(exist_ok=True)

with path.open(mode="wb") as file:
    pickle.dump(clusters, file)