<a href="https://colab.research.google.com/github/TiagoIesbick/dashboard-etl/blob/main/budget_forecast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import logging
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression


# Set Prophet loggers to WARNING level
logging.getLogger('prophet').setLevel(logging.WARNING)
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)


# cleaning budget unit 7001
def clean_7001(df: pd.DataFrame) -> pd.DataFrame:
  df.loc[df['Proj/Ativ'] == 2870, 'Proj/Ativ'] = 4396
  df.loc[df['Proj/Ativ'].isin([2872, 1507]), ['Proj/Ativ', 'Elemento']] = 4471, 339040
  df.loc[df['Proj/Ativ'].isin([2873, 2532]), 'Proj/Ativ'] = 4413
  df.loc[df['Proj/Ativ'].isin([1505, 1503, 1373, 1506]), 'Proj/Ativ'] = 2529
  df.loc[(df['Proj/Ativ'] == 2681) & (df['Elemento'] == 319192), 'Elemento'] = 319113
  df.loc[(df['Proj/Ativ'] == 2529) & (df['Elemento'] == 449092), 'Elemento'] = 449051
  df.loc[(df['Proj/Ativ'] == 9071) & (df['Elemento'] == 319091), 'Elemento'] = 339091
  df.loc[(df['Proj/Ativ'] == 9071) & (df['Elemento'].isin([339092, 339147])), 'Elemento'] = 339047
  df.loc[~((df['Proj/Ativ'] == 9071) & (df['Vinc. Orçam.'] == 1)), 'Vinc. Orçam.'] = 6069
  df.loc[(df['Proj/Ativ'] == 2529) & (df['Elemento'].isin([319011, 319016, 319092, 319094, 339036, 339046, 339049])), 'Proj/Ativ'] = 4396
  df.loc[(df['Proj/Ativ'] == 2529) & (df['Elemento'] == 319013), 'Proj/Ativ'] = 2680
  return df


# changing the elements 339001, 339003, 339091, 339092, 332001
def change_elements(df: pd.DataFrame) -> pd.DataFrame:
  df.loc[df['Elemento'] == 339001, 'Elemento'] = 319001
  df.loc[df['Elemento'] == 339003, 'Elemento'] = 319003
  df.loc[df['Elemento'] == 339091, 'Elemento'] = 319091
  df.loc[(df['Elemento'] == 339092) & (~df['Proj/Ativ'].isin([9075, 9077])), 'Elemento'] = 319092
  df.loc[df['Elemento'] == 332001, 'Elemento'] = 339086
  return df


# filling empty cells after the first filled cell in a column with 0
def fill_zero(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns[1:]:
        first_valid = df[col].first_valid_index()
        if first_valid is not None:
            df.loc[first_valid:, col] = df.loc[first_valid:, col].fillna(0)
    return df


# creating moving average dataframes
def moving_averages(df: pd.DataFrame, window: int) -> pd.DataFrame:
    df_ma = df.copy()
    df_ma.loc[:, df_ma.columns.difference(['T', 'Comp.pagto.'])] = df_ma.loc[:, df_ma.columns.difference(['T', 'Comp.pagto.'])].rolling(window).mean()
    df_ma.dropna(axis=1, how='all', inplace=True)
    return df_ma


# building prophet model
def build_prophet_model() -> Prophet:
  model = Prophet(
  yearly_seasonality=False,
  weekly_seasonality=False,
  daily_seasonality=False
  )
  model.add_seasonality(name='yearly', period=365.25, fourier_order=10)
  model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
  return model


# columns impacted by mass segregation
mass_segregation_cols = [
    '7002-2736-319003-6049', '7002-2738-319003-6049', '7002-2740-319003-6049', '7002-2742-319003-6049',
    '7002-2744-319003-6049', '7002-2747-319003-6049', '7002-2752-319003-6049', '7002-2754-319003-6049',
    '7002-2756-319003-6049', '7003-2760-319003-6050', '7003-2762-319003-6050', '7003-2764-319003-6050',
    '7003-2766-319003-6050', '7003-2768-319003-6050', '7003-2771-319003-6050', '7003-2776-319003-6050',
    '7003-2778-319003-6050', '7003-2780-319003-6050'
]


# running models
def run_models(df: pd.DataFrame, years: list[int], X_prev: pd.DataFrame, start: pd.Period, months: list[pd.Timestamp], month_diff: int, is_ma: bool = False) -> pd.DataFrame:
  df_models = pd.DataFrame(columns=(['Modelo', 'R²', 'RMSE', 'MAE'] + years))

  # running linear models
  def run_linear_models(X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series, df_aux: pd.DataFrame, model: str, col: str) -> pd.DataFrame:
    lr_model = LinearRegression()
    lr_model.fit(X, y)
    forecast = lr_model.predict(X)
    r2 = r2_score(y, forecast)
    RMSE = np.sqrt(mean_squared_error(y, forecast))
    MAE = mean_absolute_error(y, forecast)

    if model == 'lin-lin':
      y_pred = lr_model.predict(X_prev.values)
    elif model == 'log-log':
      y_pred = np.exp(lr_model.predict(np.log(X_prev).values))
    elif model == 'lin-log':
      y_pred = np.exp(lr_model.predict(X_prev.values))
    elif model == 'log-lin':
      y_pred = lr_model.predict(np.log(X_prev).values)
    else:
      raise ValueError(f"Unknown model type: {model}")

    forecast_df = pd.DataFrame({'date': months, 'y_pred': y_pred})
    forecast_df['year'] = forecast_df['date'].dt.year
    forecast_df['month'] = forecast_df['date'].dt.month

    if is_ma:
      december_forecasts = forecast_df[forecast_df['month'] == 12]
      forecast_years = {year: val * 12 for year, val in zip(december_forecasts['year'], december_forecasts['y_pred'])}
    else:
      forecast_years = forecast_df.groupby('year')['y_pred'].sum().to_dict()
      if start.year in forecast_years:
        forecast_years[start.year] += df.loc[df['Comp.pagto.'].dt.year == start.year, col].sum()

    if any(v < 0 for v in forecast_years.values()):
      return df_aux

    model_dict = {'Modelo': model, 'R²': r2, 'RMSE': RMSE, 'MAE': MAE}
    model_dict.update(forecast_years)
    df_aux = pd.concat([df_aux, pd.DataFrame(model_dict, index=[0])], ignore_index=True)
    return df_aux

  # running prophet model
  def run_prophet_model(df_aux: pd.DataFrame, col:str) -> pd.DataFrame:
    df_prophet = df[['Comp.pagto.', col]].copy().rename(columns={
        'Comp.pagto.': 'ds',
        col: 'y'
    }).dropna()

    # adjustment for mass segregation
    if col in mass_segregation_cols:
        df_prophet.loc[df_prophet['ds'] < '2022-05', 'y'] = np.nan
        df_prophet.dropna(inplace=True)

    if len(df_prophet) < 3:
            return df_aux

    df_prophet['ds'] = df_prophet['ds'].dt.to_timestamp()

    prophet_model = build_prophet_model()
    prophet_model.fit(df_prophet)
    future = prophet_model.make_future_dataframe(periods=month_diff, freq='MS')
    forecast = prophet_model.predict(future)

    forecast_train = forecast[forecast['ds'].isin(df_prophet['ds'])]
    r2 = r2_score(df_prophet['y'], forecast_train['yhat'])
    RMSE = np.sqrt(mean_squared_error(df_prophet['y'], forecast_train['yhat']))
    MAE = mean_absolute_error(df_prophet['y'], forecast_train['yhat'])

    forecast_future = forecast[forecast['ds'] > df_prophet['ds'].max()]

    if is_ma:
      december_forecasts = forecast_future[forecast_future['ds'].dt.month == 12]
      forecast_years = {year: val * 12 for year, val in zip(december_forecasts['ds'].dt.year, december_forecasts['yhat'])}
    else:
      forecast_years = forecast_future.groupby(forecast_future['ds'].dt.year)['yhat'].sum().to_dict()
      if start.year in forecast_years:
        partial_actual_sum = df_prophet[df_prophet['ds'].dt.year == start.year]['y'].sum()
        forecast_years[start.year] += partial_actual_sum

    if any(v < 0 for v in forecast_years.values()):
      return df_aux

    model_dict = {'Modelo': 'prophet', 'R²': r2, 'RMSE': RMSE, 'MAE': MAE}
    model_dict.update(forecast_years)
    df_aux = pd.concat([df_aux, pd.DataFrame(model_dict, index=[0])], ignore_index=True)
    return df_aux


  for col in df.columns.difference(['T', 'Comp.pagto.']):
    first_valid = df[col].first_valid_index()

    # adjustment for mass segregation
    if col in mass_segregation_cols:
      first_valid = df.loc[df['Comp.pagto.'] > '2022-04', col].first_valid_index()

    if first_valid is not None:
      X = df['T'][first_valid:].values.reshape(-1,1)
      y = df[col][first_valid:]

      if y.eq(0).sum() / len(y) > 0.8:
        continue

      df_aux = pd.DataFrame()

      # lin-lin
      df_aux = run_linear_models(X, y, df_aux, 'lin-lin', col)

      # log-log
      if (y > 0).all():
        df_aux = run_linear_models(np.log(X), np.log(y), df_aux, 'log-log', col)

      # lin-log
      if (y > 0).all():
        df_aux = run_linear_models(X, np.log(y), df_aux, 'lin-log', col)

      # log-lin
      df_aux = run_linear_models(np.log(X), y, df_aux, 'log-lin', col)

      # prophet
      df_aux = run_prophet_model(df_aux, col)

      if df_aux.empty:
        continue

      # select the best model
      df_aux = df_aux.sort_values(by=['R²', 'RMSE', 'MAE'], ascending=[False, True, True])
      df_models.loc[col] = df_aux.iloc[0, :]

  return df_models


# getting expense data
df_exp = pd.read_csv(r'/content/drive/MyDrive/Dashboard_data/final_data/df_exp.csv', sep=';', parse_dates=['Comp.pagto.'])
df_exp.loc[
    (df_exp['Proj/Ativ'] == 2529) & (df_exp['Rubrica'] == 339036040000),
    ['Unid. Orçam.', 'Proj/Ativ', 'Elemento', 'Rubrica', 'Vinc. Orçam.']
] = 7003, 9075, 339039, 339039030000, 6050
df_exp.loc[
    (df_exp['Proj/Ativ'] == 9042) & (df_exp['Comp.pagto.'].dt.year > 2010),
    ['Unid. Orçam.', 'Proj/Ativ', 'Elemento', 'Rubrica']
] = 7002, 9076, 339086, 339086010000
df_exp.drop('Rubrica', axis=1, inplace=True)


# Selecting and clearing data from 7001
df_7001 = clean_7001(df_exp.loc[
    (df_exp['Unid. Orçam.'] == 7001) &
    (df_exp['Vinc. Orçam.'].isin([400, 1, 6050, 6069])) &
    (df_exp['Comp.pagto.'].dt.year > 2011) # period prior to GPREV removed
].copy())


# Selecting and clearing data from 7002
df_7002 = change_elements(df_exp.loc[
    (df_exp['Unid. Orçam.'] == 7002) &
    (~df_exp['Proj/Ativ'].isin([2737, 2739, 2741, 2743, 2745, 2746, 2748, 2750, 2753, 2755, 2757, 2759]))
].copy())
df_7002['Vinc. Orçam.'] = 6049


# Selecting and clearing data from 7003
df_7003 = change_elements(df_exp.loc[
    (df_exp['Unid. Orçam.'] == 7003) &
    (~df_exp['Proj/Ativ'].isin([2761, 2763, 2765, 2767, 2769, 2770, 2772, 2774, 2777, 2779, 2781, 2783]))
].copy())
df_7003['Vinc. Orçam.'] = 6050


# preparing to predict
df_pred = pd.concat([df_7001, df_7002, df_7003], ignore_index=True)
df_pred['Comp.pagto.'] = df_pred['Comp.pagto.'].dt.to_period('M')
df_pred['col'] = df_pred['Unid. Orçam.'].astype(str) + '-' + df_pred['Proj/Ativ'].astype(str) + '-' + df_pred['Elemento'].astype(str) + '-' + df_pred['Vinc. Orçam.'].astype(str)
df_pred = df_pred[['Comp.pagto.', 'col', 'Result. pago']].groupby(['Comp.pagto.', 'col'], as_index=False).sum()
df_pred = df_pred.pivot(index='Comp.pagto.', columns='col', values='Result. pago')
df_pred = df_pred.iloc[:-1]
df_pred.reset_index(inplace=True)
df_pred['total_7001'] = df_pred.loc[:, df_pred.columns.str.startswith('7001')].sum(axis=1)
df_pred['total_7002'] = df_pred.loc[:, df_pred.columns.str.startswith('7002')].sum(axis=1)
df_pred['total_7003'] = df_pred.loc[:, df_pred.columns.str.startswith('7003')].sum(axis=1)
df_pred['total_7002_7003'] = df_pred.loc[:, df_pred.columns[df_pred.columns.str.match(r'^(7002|7003)')]].sum(axis=1)
df_pred.loc[df_pred['Comp.pagto.'].dt.year < 2012, 'total_7001'] = np.nan
df_pred['T'] = np.arange(1, len(df_pred)+1)
df_pred = fill_zero(df_pred)


# prediction interval
start = df_pred['Comp.pagto.'].max()
current_year = start.year if start.month != 12 else start.year + 1
target = pd.Period(f'{current_year + 4}-12', freq='M')
month_diff = (target - start).n
last_T = df_pred['T'].max()
X_prev = pd.DataFrame({'T': [last_T + val for val in range(1, month_diff + 1)]})
years = list(range(current_year, current_year+5))
months = [start.to_timestamp() + pd.DateOffset(months=i) for i in range(1, month_diff + 1)]


# Run models separately
models_raw = run_models(df_pred, years, X_prev, start, months, month_diff)
models_ma12 = run_models(moving_averages(df_pred, 12), years, X_prev, start, months, month_diff, True)
models_ma36 = run_models(moving_averages(df_pred, 36), years, X_prev, start, months, month_diff, True)


# Combined results
df_combined = pd.concat([models_raw, models_ma12, models_ma36], keys=['raw', 'ma12', 'ma36'])

In [6]:
df_combined

Unnamed: 0,Unnamed: 1,Modelo,R²,RMSE,MAE,2025,2026,2027,2028,2029
raw,7001-2529-339014-6069,prophet,0.209714,1282.541102,928.210766,22432.352885,26085.512827,26555.863243,27714.486119,25576.965911
raw,7001-2529-339030-6069,log-lin,0.125252,3243.041992,2364.382816,15158.393548,14004.925173,12553.78154,11187.89125,9897.788193
raw,7001-2529-339033-6069,prophet,0.095773,2391.376351,1733.4087,28720.826089,14171.048809,12670.36871,12357.07843,14514.762785
raw,7001-2529-339037-6069,prophet,0.093844,16046.234683,11411.247108,318575.581399,413812.240061,481515.670115,517635.287648,499462.600325
raw,7001-2529-339039-6069,lin-log,0.709486,0.370839,0.290256,491848.069539,413813.613111,355633.379425,305633.010984,262662.457484
...,...,...,...,...,...,...,...,...,...,...
ma36,7003-9077-339086-6050,prophet,0.660971,369.32753,304.938174,12713.536733,7648.309902,6889.037496,7827.004877,12292.5787
ma36,total_7001,prophet,0.991163,40647.298291,26127.528846,43191377.25951,43515193.985837,42874234.324598,47828706.491151,50226152.868046
ma36,total_7002,log-log,0.973554,0.050033,0.032659,1771801824.048705,1854744151.209435,1936189857.368034,2016251774.082534,2095028464.629416
ma36,total_7002_7003,prophet,0.97335,5143282.120705,3457792.390231,2004617698.505512,1940262169.953136,1776126448.124848,1947017435.185963,1922259835.855738


In [4]:
df_combined

Unnamed: 0,Unnamed: 1,Modelo,R²,RMSE,MAE,2025,2026,2027,2028,2029
raw,7001-2529-332039-6069,prophet,0.296551,658.55168,353.671082,-301.297688,-5068.803858,-11761.702481,-15941.053031,-22232.707589
raw,7001-2529-339014-6069,prophet,0.209714,1282.541102,928.210766,22432.352885,26085.512827,26555.863243,27714.486119,25576.965911
raw,7001-2529-339030-6069,prophet,0.244395,3014.105507,2244.57469,14829.840292,9980.185255,6546.331626,2919.7963,-2379.977361
raw,7001-2529-339033-6069,prophet,0.095773,2391.376351,1733.4087,28720.826089,14171.048809,12670.36871,12357.07843,14514.762785
raw,7001-2529-339035-6069,prophet,0.242215,2852.011198,1540.728107,86679.050602,91017.28947,128934.19347,139002.057541,158598.551403
...,...,...,...,...,...,...,...,...,...,...
ma36,7001-9071-339047-1,prophet,1.0,17.791117,13.123173,-27622608.593065,-36735782.218639,-1295623.789308,17936211.179576,72793473.009564
ma36,7001-9071-339047-6069,prophet,0.929448,11826.212304,8428.813852,6573129.409518,6766641.401272,7144439.250178,7073933.721523,7670989.708818
ma36,7001-9071-339091-6069,prophet,0.513709,331.221419,240.046605,7626.765034,3803.428065,588.77716,399.849039,-744.051149
ma36,7001-9071-339093-6069,prophet,0.564406,711.415842,479.370555,-6560.356821,-11148.655626,-16065.198819,-20846.902431,-25982.08763
