In [21]:
import os
import pickle

import pandas as pd
import yfinance as yf
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [15]:
def get_data_for_ticker(ticker):
    stock = yf.Ticker(ticker)

    # Coletando dados financeiros e históricos de dividendos
    dividends = stock.dividends
    financials = stock.financials.transpose()
    balance_sheet = stock.balance_sheet.transpose()
    cash_flow = stock.cashflow.transpose()

    # Agrupar dividendos por ano
    dividends = dividends.to_frame().reset_index()
    grouped_dividends = dividends.groupby(pd.Grouper(key='Date', freq='YE')).sum()
    grouped_dividends.index = grouped_dividends.index.tz_localize(None)

    # Consolidando todos os dados em um único DataFrame
    data_frames = [financials, balance_sheet, cash_flow]
    df = pd.concat(data_frames, axis=1)

    # Converter colunas para tipos numéricos, ignorando erros
    df = df.apply(pd.to_numeric, errors='coerce')

    # Tratar dados ausentes com interpolação
    df.interpolate(method='linear', limit_direction='both', inplace=True)
    df = df.infer_objects(copy=False)

    # Juntar "grouped_dividends" com "df" apenas onde "df" possui a data (index)
    df = df.merge(grouped_dividends, left_index=True, right_index=True)

    return df


In [40]:
def scale_dataframe_columns(df, scaler, fit=True):
    """
    Escalonar cada coluna de um DataFrame usando StandardScaler.

    Args:
    df (pd.DataFrame): DataFrame com dados a serem escalonados.

    Returns:
    pd.DataFrame: DataFrame com as colunas escalonadas.
    """
    df_scaled = df.copy()  # Cria uma cópia do DataFrame original para evitar alterações no original

    # Aplica o StandardScaler a cada coluna do DataFrame
    if fit:
        df_scaled[df_scaled.columns] = scaler.fit_transform(df_scaled)
    else:
        df_scaled[df_scaled.columns] = scaler.transform(df_scaled)

    return df_scaled

In [41]:
def create_and_save_models(param_list):
    for idx, param in enumerate(param_list):
        model_file = f'data/model_{idx + 1}.pkl'
        
        if os.path.exists(model_file):
            with open(model_file, 'rb') as f:
                model = pickle.load(f)
        else:
            model = SGDRegressor()

        features = set()
        for ticker in param['tickers']:
            columns = set(get_data_for_ticker(ticker).columns)
            
            if features:
                features = features.intersection(columns)
            else: 
                features = columns
        features = set(param['dividend_features_corr'].keys()).intersection(features)

        scaler = StandardScaler()
        for i, ticker in enumerate(param['tickers']):
            df = get_data_for_ticker(ticker)

            X = df[list(features)]
            y = df['Dividends'].values

            X_scaled = scale_dataframe_columns(X, scaler, i == 0)
            if i == 0:
                y_scaled = scaler.fit_transform(y.reshape(-1, 1)).ravel()
            else:
                y_scaled = scaler.transform(y.reshape(-1, 1)).ravel()

            # Treinamento incremental do modelo
            model.partial_fit(X_scaled, y_scaled)

        # Avaliação do modelo
        y_pred = model.predict(X_scaled)
        score = r2_score(y_scaled, y_pred)
        print(f"Modelo {idx + 1} - R²: {score:.2f}")

        # Salvamento do modelo no formato .pkl
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
        print(f"Modelo {idx + 1} salvo como '{model_file}'")

# Testing

In [38]:
get_data_for_ticker("VALE3.SA")

Unnamed: 0,Tax Effect Of Unusual Items,Tax Rate For Calcs,Normalized EBITDA,Total Unusual Items,Total Unusual Items Excluding Goodwill,Net Income From Continuing Operation Net Minority Interest,Reconciled Depreciation,Reconciled Cost Of Revenue,EBITDA,EBIT,...,Other Non Cash Items,Provisionand Write Offof Assets,Asset Impairment Charge,Depreciation Amortization Depletion,Operating Gains Losses,Earnings Losses From Equity Investments,Gain Loss On Investment Securities,Net Foreign Currency Exchange Gain Loss,Net Income From Continuing Operations,Dividends
2023-12-31,-568659000.0,0.273,17643000000.0,-2083000000.0,-2083000000.0,7983000000.0,3070000000.0,21066000000.0,15560000000.0,12490000000.0,...,294000000.0,614000000.0,266000000.0,3070000000.0,1539000000.0,1108000000.0,-1212000000.0,1643000000.0,11151000000.0,4.511889
2022-12-31,247821100.0,0.150195,22247000000.0,1650000000.0,1650000000.0,16728000000.0,3171000000.0,20898000000.0,23897000000.0,20726000000.0,...,-3174000000.0,472000000.0,-773000000.0,3171000000.0,-924000000.0,-305000000.0,-1594000000.0,975000000.0,19781000000.0,7.583326
2021-12-31,240407000.0,0.158999,32616000000.0,1512000000.0,1512000000.0,24821000000.0,3034000000.0,18737000000.0,34128000000.0,31094000000.0,...,-4707000000.0,1926000000.0,426000000.0,3034000000.0,936000000.0,1271000000.0,-203000000.0,-132000000.0,29541000000.0,14.648555
2020-12-31,-882841200.0,0.10515,20958000000.0,-8396000000.0,-8396000000.0,6605000000.0,3215000000.0,14398000000.0,12562000000.0,9347000000.0,...,2340000000.0,4747000000.0,1308000000.0,3215000000.0,2650000000.0,1020000000.0,1081000000.0,549000000.0,6990000000.0,2.40751
2019-12-31,-882841200.0,0.10515,20958000000.0,-8396000000.0,-8396000000.0,6605000000.0,3215000000.0,14398000000.0,12562000000.0,9347000000.0,...,2340000000.0,4747000000.0,1308000000.0,3215000000.0,2650000000.0,1020000000.0,1081000000.0,549000000.0,6990000000.0,1.414364


In [10]:
with open("data/dividend_features.pkl", "rb") as file:
    dividend_features = pickle.load(file)

dividend_features

[{'tickers': ['VAMO3.SA', 'VIVT3.SA', 'VULC3.SA', 'YDUQ3.SA', 'ZAMP3.SA'],
  'dividend_features_corr': {'EBIT': 0.7653490442809873,
   'Pretax Income': 0.756286020463378,
   'Working Capital': 0.7162343282702254,
   'Common Stock Equity': 0.7758555111917776,
   'Total Equity Gross Minority Interest': 0.7676230902300004,
   'Stockholders Equity': 0.7758555111917776,
   'Tradeand Other Payables Non Current': 0.7873731464048349,
   'Cash Cash Equivalents And Short Term Investments': 0.7566470819623431,
   'Cash And Cash Equivalents': 0.9017178597331811,
   'Cash Equivalents': 0.8668841889523521,
   'Issuance Of Capital Stock': 0.7120750931796491,
   'End Cash Position': 0.9017178597331811,
   'Changes In Cash': 0.8524517168275031,
   'Common Stock Issuance': 0.7120750931796491,
   'Investing Cash Flow': 0.7293202641395593,
   'Net Income From Continuing Operations': 0.756286020463378,
   'Rent Expense Supplemental': 0.9618908946708362}},
 {'tickers': ['VBBR3.SA', 'VIVA3.SA', 'VLID3.SA', '

In [42]:
create_and_save_models(dividend_features)



ValueError: X has 14 features, but StandardScaler is expecting 1 features as input.