In [2]:
%pip install pandas -U -q
%pip install numpy -U -q
%pip install plotly -U -q
%pip install nbformat -U -q
%pip install scikit-learn -U -q
%pip install seaborn -q
%pip install matplotlib -q
%pip install pickle -U -q

# Análise exploratória dos dados de fluxo de caixa das empresas brasileiras

In [9]:
import pandas as pd

# Load the cash flow components. The first two CSV rows form a two-level
# column header and the first CSV column becomes the row index.
predicted_cash_flow_components = pd.read_csv(
    "./assets/predicted_cash_flow_components.csv",
    header=[0, 1],
    index_col=0,
)
predicted_cash_flow_components

In [11]:
# Select the VALE3.SA block, drop rows containing missing values and swap
# rows with columns.
df = (
    predicted_cash_flow_components["VALE3.SA"]
    .dropna()
    .T
)
df

In [12]:
# Column 0 is taken as the target; columns 1 to 3 are taken as the features.
y_cashflow = df.iloc[:, 0]
X_cashflow = df.iloc[:, 1:4]

In [13]:
# Display the (rows, columns) shape of the selected feature columns.
X_cashflow.shape

## Previsão do fluxo de caixa livre

In [21]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error
import pandas as pd
import numpy as np

def train_model_and_predict(model, X, y):
  """Fit ``model`` on one min-max-scaled feature and predict on that same data.

  Args:
    model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
      in place on the scaled data.
    X: One-dimensional feature values (NumPy array, pandas Series or list).
    y: One-dimensional target values with the same length as ``X``.

  Returns:
    Array of shape ``(n_samples, 1)`` holding the in-sample predictions
    converted back to the original scale of ``y``.
  """
  # np.asarray lets callers pass a pandas Series or a list, neither of which
  # offers a usable ``.reshape``; NumPy arrays pass through unchanged.
  X_column = np.asarray(X).reshape(-1, 1)
  y_column = np.asarray(y).reshape(-1, 1)

  X_scaler = MinMaxScaler()
  X_scaled = X_scaler.fit_transform(X_column)

  # The target scaler is kept so predictions can be mapped back afterwards.
  y_scaler = MinMaxScaler()
  y_scaled = y_scaler.fit_transform(y_column)

  model.fit(X_scaled, y_scaled.ravel())
  # Predictions are made on the training data itself (no hold-out split).
  predictions = model.predict(X_scaled)

  predictions_real_scale = y_scaler.inverse_transform(predictions.reshape(-1, 1))
  return predictions_real_scale

def score_model(y_real, y):
  """Print and return the R^2 and the mean absolute error of predictions.

  Args:
    y_real: Observed target values.
    y: Predicted values, matched position by position with ``y_real``.

  Returns:
    Tuple ``(r2, mae)``. Both are ``0.0`` when the metrics cannot be
    computed (for example when the two inputs differ in length).
  """
  try:
    r2 = r2_score(y_real, y)
    mae = mean_absolute_error(y_real, y)
  except Exception as error:
    # Best-effort fallback so callers that loop over many inputs keep going.
    # Catching Exception (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate, and the reason is reported instead of hidden.
    print(f"Não foi possível calcular as métricas: {error}")
    r2 = 0.0
    mae = 0.0

  print(f"Coeficiente de determinação (R^2): {r2}") # The closer to 1, the better the model
  print(f"Erro médio absoluto (MAE): {mae:,.2f}") # The lower the value, the better the model

  return r2, mae

def combine_attributes(df, y_column_name, model):
  """Predict the target from each strongly correlated column and average the results.

  Every column whose correlation with ``y_column_name`` is at least 0.7 is
  used, one at a time, as the single feature of ``model``.

  Args:
    df: DataFrame holding the target column and the candidate columns.
    y_column_name: Name of the target column in ``df``.
    model: Estimator handed to ``train_model_and_predict``. It is refitted
      once per selected column, so it ends up fitted on the last one.

  Returns:
    DataFrame with one column of predictions per selected attribute, named
    ``"<attribute> X <target>"``, plus a ``"Mean"`` column with their
    row-wise average. Rows are positional (default integer index).
  """
  correlation_with_target = df.corr()[y_column_name]

  # Keep the attributes with correlation >= 70%. The target is removed by
  # label rather than by position, so the result does not depend on the
  # target being the first column (or on its self-correlation being defined).
  high_corr = correlation_with_target[correlation_with_target >= 0.7].drop(
    labels=y_column_name, errors="ignore"
  )

  # Attributes strongly correlated with the target, weakest first.
  X_columns_name = high_corr.sort_values().index.to_list()

  y_attribute = df[y_column_name].values
  predictions = {}

  for name in X_columns_name:
    X_attribute = df[name].values

    prediction = train_model_and_predict(model, X_attribute, y_attribute)
    predictions[f"{name} X {y_column_name}"] = prediction.ravel()

  # Separate name so the input frame ``df`` is not shadowed.
  df_predictions = pd.DataFrame(predictions)
  df_predictions["Mean"] = df_predictions.mean(axis=1)

  return df_predictions
  

### Regressão linear

In [22]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted separately on each attribute strongly correlated
# with "Free Cash Flow"; the per-attribute predictions are averaged in "Mean".
cash_flow_linear_regressor = LinearRegression()
df_predictions = combine_attributes(
  df=df,
  y_column_name="Free Cash Flow",
  model=cash_flow_linear_regressor,
)
df_predictions

In [23]:
# Score the averaged predictions in df_predictions against the first column of df.
score_model(df.iloc[:, 0], df_predictions["Mean"])

### Redes neurais

In [24]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with two hidden layers (100 and 50 units);
# random_state is fixed so repeated runs give the same result.
cash_flow_mlp_regressor = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=0)
# Pass the MLP (not the linear regressor from the previous section) so this
# section actually evaluates the neural network.
df_predictions = combine_attributes(df, "Free Cash Flow", cash_flow_mlp_regressor)
df_predictions

In [25]:
# Score the averaged predictions currently held in df_predictions against the first column of df.
score_model(df.iloc[:, 0], df_predictions["Mean"])

## Testar com outras ações

In [26]:
# Tickers are the first level of the two-level column index of the loaded
# frame. Taking them from the same frame that the loop below indexes
# guarantees every ticker exists in it, and sorting makes the iteration order
# (and therefore which ticker the regressor is fitted on last) reproducible.
tickers = sorted(predicted_cash_flow_components.columns.get_level_values(0).unique())
unique_tickers = set(tickers)
tickers

In [28]:
# Repeat the single-ticker linear regression experiment for every ticker and
# average the resulting R^2 scores.
cash_flow_linear_regressor = LinearRegression()
r2_scores = []

for ticker in tickers:
  # Loop-local names keep the `df` / `df_predictions` globals from the earlier
  # single-ticker cells intact, so those cells can be re-run safely afterwards.
  ticker_df = predicted_cash_flow_components[ticker].dropna().transpose()

  ticker_predictions = combine_attributes(ticker_df, "Free Cash Flow", cash_flow_linear_regressor)

  print(f"{'-' * 20} {ticker} {'-' * 20}")
  r2, _ = score_model(ticker_df.iloc[:, 0], ticker_predictions["Mean"])

  r2_scores.append(r2)

print(f"\n\nCoeficiente de determinação (R^2) médio: {np.mean(r2_scores)}")

## Outra abordagem de previsão de fluxo de caixa

In [50]:
from typing import List, Tuple
from pandas import DataFrame
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error


def standardization(df: DataFrame, X_columns: List[str], y_column: str) -> Tuple[DataFrame, np.ndarray, pd.Series, np.ndarray]:
    """Min-max scale the selected feature columns and the target column of ``df``.

    Despite the name, the scaling applied is min-max normalisation
    (``MinMaxScaler``), not z-score standardisation.

    Args:
        df: Source DataFrame.
        X_columns: Names of the feature columns.
        y_column: Name of the target column.

    Returns:
        Tuple ``(X_real, X_scaled, y_real, y_scaled)``: the unscaled feature
        columns, their scaled values, the unscaled target Series and its
        scaled values as a column vector of shape ``(n_rows, 1)``.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    X_index = [df.columns.get_loc(column) for column in X_columns]

    X_real = df.iloc[:, X_index]
    X_scaler = MinMaxScaler()
    X_scaled = X_scaler.fit_transform(X_real)

    y_real = df[y_column]
    y_scaler = MinMaxScaler()
    # A pandas Series has no ``reshape``; convert to NumPy first to build the
    # column vector the scaler expects.
    y_scaled = y_scaler.fit_transform(y_real.to_numpy().reshape(-1, 1))

    return X_real, X_scaled, y_real, y_scaled


def predict_free_cash_flow(components: DataFrame, degree: int = 4) -> DataFrame:
    """Fit a polynomial regression for "Free Cash Flow" and report hold-out metrics.

    The data is split into train and test subsets (``random_state=0``), the
    features are expanded to polynomial terms and a linear regression is
    fitted on the expanded training features. Train/test scores, R^2 and MAE
    are printed.

    Args:
        components: DataFrame with a "Free Cash Flow" column used as the
            target; every other column is used as a feature.
        degree: Degree of the polynomial feature expansion.

    Returns:
        DataFrame indexed like the test subset, with the observed values in
        "Free Cash Flow" and the model output in "Predicted Free Cash Flow".

    Raises:
        ValueError: If ``components`` has no "Free Cash Flow" column.
    """
    target_column = "Free Cash Flow"

    # Validate before touching the data, and select the target by name so the
    # result does not depend on the column order of the input.
    if target_column not in components.columns:
        raise ValueError("DataFrame has no 'Free Cash Flow' column.")

    X_data = components.drop(columns=target_column)
    y_data = components[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, random_state=0)

    # The expansion is fitted on the training features only and then applied
    # unchanged to the test features.
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    polynomial_regression = LinearRegression()
    polynomial_regression.fit(X_train_poly, y_train)
    predictions = polynomial_regression.predict(X_test_poly)

    print("-" * 50)
    score_train = polynomial_regression.score(X_train_poly, y_train)
    print(f"Train score: {score_train * 100:.2f}%")

    score_test = polynomial_regression.score(X_test_poly, y_test)
    print(f"Test score: {score_test * 100:.2f}%")

    r2 = r2_score(y_test, predictions)
    print(f"Coeficiente de determinação (R^2): {r2}") # The closer to 1, the better the model

    mae = mean_absolute_error(y_test, predictions)
    print(f"Erro médio absoluto (MAE): {mae:,.2f}") # The lower the value, the better the model
    print("-" * 50)

    return DataFrame(
        {
            target_column: y_test.to_numpy(),
            f"Predicted {target_column}": predictions,
        },
        index=y_test.index,
    )

In [51]:
# Transposed view of the loaded frame (rows and columns swapped).
components = predicted_cash_flow_components.T

# Column 0 is taken as y, everything after it as X.
y_data = components.iloc[:, 0]
X_data = components.iloc[:, 1:]

In [52]:
# Display the shapes of the feature frame and of the target column.
X_data.shape, y_data.shape

In [53]:
# Fit and evaluate the polynomial regression on the transposed frame; the metrics are printed by the function.
predict_free_cash_flow(components)

## Salvar modelo já treinado

In [206]:
import pickle
from pathlib import Path

# Persist the linear regressor to models/future_cash_flow_model.sav.
models_dir = Path("models")
# exist_ok avoids a separate existence check and makes the cell safe to re-run.
models_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): the regressor is fitted on min-max-scaled data inside
# train_model_and_predict and those scalers are not saved, so a consumer of
# this file would need to reproduce the scaling — confirm this is intended.
# The context manager guarantees the file handle is flushed and closed.
with open(models_dir / "future_cash_flow_model.sav", "wb") as model_file:
  pickle.dump(cash_flow_linear_regressor, model_file)
