<a href="https://colab.research.google.com/github/TiagoIesbick/dashboard-etl/blob/main/budget_forecast_dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import logging
from prophet import Prophet
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

In [2]:
# cleaning budget unit 7001
def clean_7001(df: pd.DataFrame) -> pd.DataFrame:
  df.loc[df['Proj/Ativ'] == 2870, 'Proj/Ativ'] = 4396
  df.loc[df['Proj/Ativ'].isin([2872, 1507]), ['Proj/Ativ', 'Elemento']] = 4471, 339040
  df.loc[df['Proj/Ativ'].isin([2873, 2532]), 'Proj/Ativ'] = 4413
  df.loc[df['Proj/Ativ'].isin([1505, 1503, 1373, 1506]), 'Proj/Ativ'] = 2529
  df.loc[(df['Proj/Ativ'] == 2681) & (df['Elemento'] == 319192), 'Elemento'] = 319113
  df.loc[(df['Proj/Ativ'] == 2529) & (df['Elemento'] == 449092), 'Elemento'] = 449051
  df.loc[(df['Proj/Ativ'] == 9071) & (df['Elemento'] == 319091), 'Elemento'] = 339091
  df.loc[(df['Proj/Ativ'] == 9071) & (df['Elemento'].isin([339092, 339147])), 'Elemento'] = 339047
  df.loc[~((df['Proj/Ativ'] == 9071) & (df['Vinc. Orçam.'] == 1)), 'Vinc. Orçam.'] = 6069
  df.loc[(df['Proj/Ativ'] == 2529) & (df['Elemento'].isin([319011, 319016, 319092, 319094, 339036, 339046, 339049])), 'Proj/Ativ'] = 4396
  df.loc[(df['Proj/Ativ'] == 2529) & (df['Elemento'] == 319013), 'Proj/Ativ'] = 2680
  return df


# changing the elements 339001, 339003, 339091, 339092, 332001
def change_elements(df: pd.DataFrame) -> pd.DataFrame:
  df.loc[df['Elemento'] == 339001, 'Elemento'] = 319001
  df.loc[df['Elemento'] == 339003, 'Elemento'] = 319003
  df.loc[df['Elemento'] == 339091, 'Elemento'] = 319091
  df.loc[(df['Elemento'] == 339092) & (~df['Proj/Ativ'].isin([9075, 9077])), 'Elemento'] = 319092
  df.loc[df['Elemento'] == 332001, 'Elemento'] = 339086
  return df


# filling empty cells after the first filled cell in a column with 0
def fill_zero(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns.difference(['T', 'Comp.pagto.']):
        first_valid = df[col].first_valid_index()
        if first_valid is not None:
            df.loc[first_valid:, col] = df.loc[first_valid:, col].fillna(0)
    return df

# creating moving average dataframes
def moving_averages(df: pd.DataFrame, window: int) -> pd.DataFrame:
    df_ma = df.copy()
    df_ma.loc[:, df_ma.columns.difference(['T', 'Comp.pagto.'])] = df_ma.loc[:, df_ma.columns.difference(['T', 'Comp.pagto.'])].rolling(window).mean()
    df_ma.dropna(axis=1, how='all', inplace=True)
    return df_ma

# building prophet model
def build_prophet_model() -> Prophet:
  model = Prophet(
      # growth='logistic',
      yearly_seasonality=False,
      weekly_seasonality=False,
      daily_seasonality=False
  )
  model.add_seasonality(name='yearly', period=365.25, fourier_order=10)
  model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
  return model

# normalizes and calculates the score
def normalizes_calculates_score(df: pd.DataFrame) -> pd.DataFrame:
  scaler = MinMaxScaler()
  df['r2_norm'] = scaler.fit_transform(df[['R²']])
  df['rmse_norm'] = 1 - scaler.fit_transform(df[['RMSE']])
  df['mae_norm'] = 1 - scaler.fit_transform(df[['MAE']])

  df['score'] = (
      0.5 * df['r2_norm'] +
      0.25 * df['rmse_norm'] +
      0.25 * df['mae_norm']
  )
  return df


# checks if there's any negative forecast
# checks they don't jump by more than 30% (increase or decrease), while skipping division when the previous value is 0
def check_negative_and_jump(forecast_years: dict[int, float]) -> bool:
  values = list(forecast_years.values())
  has_negative = any(v < 0 for v in values)
  has_large_jump = any(
      values[i-1] != 0 and abs((values[i] / values[i-1]) - 1) > 0.3
      for i in range(1, len(values))
  )
  return has_negative or has_large_jump


# expense columns that will be readjusted due to mass segregation
mass_segregation_cols_exp = {
    '7002-2736-319003-6049', '7002-2738-319003-6049', '7002-2740-319003-6049', '7002-2742-319003-6049',
    '7002-2744-319003-6049', '7002-2747-319003-6049', '7002-2752-319003-6049', '7002-2754-319003-6049',
    '7002-2756-319003-6049', '7003-2760-319003-6050', '7003-2762-319003-6050', '7003-2764-319003-6050',
    '7003-2766-319003-6050', '7003-2768-319003-6050', '7003-2771-319003-6050', '7003-2776-319003-6050',
    '7003-2778-319003-6050', '7003-2780-319003-6050'
}


# revenue columns that will be readjusted due to mass segregation
mass_segregation_cols_rev = {
    'Contr.do Servidor Civil - Pensionistas - Plano em Capitalização - CMPA-6050', 'Contr.do Servidor Civil - Pensionistas - Plano em Capitalização - DMAE-6050',
    'Contr.do Serv. Civil-Pensionistas-Plano em Capitalização-Centralizada-6050', 'Contr.do Servidor Civil - Pensionistas - Plano em Capitalização - FASC-6050',
    'Contr.do Serv. Civil - Pensionistas - Plano em Capitalização - DEMHAB-6050', 'Contr.do Servidor Civil - Pensionistas - Plano em Capitalização - DMLU-6050',
    'Contr.do Serv.Civil -Pensionistas - Plano em Repartição - Centralizada-6049', 'Contr.do Servidor Civil - Pensionistas - Plano em Repartição - CMPA-6049',
    'Contr.do Servidor Civil - Pensionistas - Plano em Repartição - DMAE-6049', 'Contr.do Servidor Civil - Pensionistas - Plano em Repartição - DMLU-6049',
    'Contr.do Servidor Civil - Pensionistas - Plano em Repartição - DEMHAB-6049', 'Contr.do Servidor Civil - Pensionistas - Plano em Repartição - FASC-6049'
}

# def time_based_train_test_split(df: pd.DataFrame, min_len: int = 2) -> tuple[pd.DataFrame, pd.DataFrame | None]:
#     """
#     Perform a time-based train/test split.

#     Args:
#         df: DataFrame with time-ordered data.
#         min_len: Minimum length required to return a split.

#     Returns:
#         (train_df, test_df): Tuple of train and test DataFrames. Test can be None.
#     """
#     if len(df) >= 60:
#         train_df = df[:-12]
#         test_df = df[-12:]
#     elif len(df) >= 12:
#         split_idx = int(len(df) * 0.8)
#         train_df = df.iloc[:split_idx]
#         test_df = df.iloc[split_idx:]
#     elif len(df) >= min_len:
#         train_df = df
#         test_df = None
#     else:
#         return None, None

#     return train_df.reset_index(drop=True), test_df.reset_index(drop=True) if test_df is not None else None

# def str_to_float(value: str) -> float:
#   value = str(value).replace('.', '').replace(',', '.')
#   value = re.sub(r'[A-Za-z]|\s', '', value)
#   return float(0) if value == '' else float(value)

# def agrupamento_format(value: str) -> str:
#   if re.search(r'^0{2}', value[:2]):
#     value = '   ' + value
#   elif re.search(r'^[1-9]\d', value[:2]):
#     value = '      ' + value
#   return value

# dict_month = {'Janeiro': '01', 'Fevereiro': '02', 'Março': '03', 'Abril': '04',
#               'Maio': '05', 'Junho': '06', 'Julho': '07', 'Agosto': '08',
#               'Setembro': '09', 'Outubro': '10', 'Novembro': '11', 'Dezembro': '12'}

In [3]:
# getting expense data
df_exp = pd.read_csv(r'/content/drive/MyDrive/Dashboard_data/final_data/df_exp.csv', sep=';', parse_dates=['Comp.pagto.'])

# getting budget settlement data from 2021
df_2021 = pd.read_excel(r'/content/drive/MyDrive/Previsoes_orcamento/despesas/2021_liquidacao/Liquidações Consolidado.xls', parse_dates=['Compet.Liq.'])
df_2021['Compet.Liq.'] = pd.to_datetime(df_2021['Compet.Liq.'], dayfirst=True)
df_2021 = df_2021.loc[
    (df_2021['Compet.Estorno'] == '21/12/2021') & (df_2021['Unid.Orçam.'] == 7002),
    ['Compet.Liq.', 'Unid.Orçam.', 'Proj/Ativ', 'Rubrica', 'Vinc.Orçam.', 'Val. Liquidado']
    ].copy().rename(columns={
        'Compet.Liq.': 'Comp.pagto.',
        'Unid.Orçam.': 'Unid. Orçam.',
        'Vinc.Orçam.': 'Vinc. Orçam.',
        'Val. Liquidado': 'Result. pago'
    })
df_2021['Elemento'] = df_2021['Rubrica'].astype(str).str[:6].astype(int)

# deleting specific 2021 data to replace it with budget settlement data
df_exp = df_exp[~((df_exp['Comp.pagto.'] == '2021-12-29') & (df_exp['Unid. Orçam.'] == 7002))]
df_exp = pd.concat([df_exp, df_2021], ignore_index=True)

# correcting data
df_exp.loc[
    (df_exp['Proj/Ativ'] == 2529) & (df_exp['Rubrica'] == 339036040000),
    ['Unid. Orçam.', 'Proj/Ativ', 'Elemento', 'Rubrica', 'Vinc. Orçam.']
] = 7003, 9075, 339039, 339039030000, 6050
df_exp.loc[
    (df_exp['Proj/Ativ'] == 9042) & (df_exp['Comp.pagto.'].dt.year > 2010),
    ['Unid. Orçam.', 'Proj/Ativ', 'Elemento', 'Rubrica']
] = 7002, 9076, 339086, 339086010000

# dropping 'Rubrica" column
df_exp.drop('Rubrica', axis=1, inplace=True)

In [4]:
# Selecting and clearing data from 7001
df_7001 = clean_7001(df_exp.loc[
    (df_exp['Unid. Orçam.'] == 7001) &
    (df_exp['Vinc. Orçam.'].isin([400, 1, 6050, 6069])) &
    (df_exp['Comp.pagto.'].dt.year > 2011) # period prior to GPREV removed
].copy())

# Selecting and clearing data from 7002
df_7002 = change_elements(df_exp.loc[
    (df_exp['Unid. Orçam.'] == 7002) &
    (~df_exp['Proj/Ativ'].isin([2737, 2739, 2741, 2743, 2745, 2746, 2748, 2750, 2753, 2755, 2757, 2759]))
].copy())
df_7002['Vinc. Orçam.'] = 6049

# Selecting and clearing data from 7003
df_7003 = change_elements(df_exp.loc[
    (df_exp['Unid. Orçam.'] == 7003) &
    (~df_exp['Proj/Ativ'].isin([2761, 2763, 2765, 2767, 2769, 2770, 2772, 2774, 2777, 2779, 2781, 2783]))
].copy())
df_7003['Vinc. Orçam.'] = 6050

In [5]:
# getting revenue data
df_rev = pd.read_csv(r'/content/drive/MyDrive/Dashboard_data/final_data/df_rev.csv', sep=';', parse_dates=['Data'])

# clearing revenue data
df_rev.drop(columns=['origem', 'tipo'], inplace=True)
df_rev = df_rev[(df_rev['vinculo'].isin([6050, 6069, 6049, 400])) & (df_rev['Data'].dt.year > 2017)]
df_rev.loc[df_rev['vinculo'] == 400, 'vinculo'] = 6049
df_rev.loc[
    (df_rev['vinculo'] == 6049) &
    (df_rev['nome_rubrica'] == 'Compensações Financ entre o Regime Geral e os RPPS'),
    'nome_rubrica'] = 'Compensações Financ entre o Regime Geral e os RPPS-Plano em Repartição'
df_rev.loc[
    (df_rev['vinculo'] == 6050) &
    (df_rev['nome_rubrica'] == 'Compensações Financ entre o Regime Geral e os RPPS'),
    'nome_rubrica'] = 'Comp. Financ. entre o Regime Geral e os RPPS - Plano em Capitalização'
df_rev.loc[
    (df_rev['vinculo'] == 6049) &
    (df_rev['nome_rubrica'] == 'Contr.do Servidor Civil Ativo - Cedido'),
    'nome_rubrica'] = 'Contr.do Servidor Civil Ativo - Cedido - Plano em Repartição'
df_rev.loc[
    (df_rev['vinculo'] == 6050) &
    (df_rev['nome_rubrica'] == 'Contr.do Servidor Civil Ativo - Cedido'),
    'nome_rubrica'] = 'Contr.do Servidor Civil Ativo - Cedido - Plano em Capitalização'
df_rev.loc[
    (df_rev['vinculo'] == 6049) &
    (df_rev['nome_rubrica'] == 'Contr.do Servidor Civil Ativo - Cedido - Multas e Juros'),
    'nome_rubrica'] = 'Contr.Serv.Civil Ativo - Cedido - Multas e Juros - Plano em Repartição'
df_rev.loc[
    (df_rev['vinculo'] == 6049) &
    (df_rev['nome_rubrica'] == 'Contr.do Servidor Civil Ativo - Cedido - Dív.At.- Multas e Juros'),
    'nome_rubrica'] = 'Contr.Serv.Ativo Cedido-Dív.Ativa-Multas e Juros - Plano em Repartição'
df_rev.loc[
    (df_rev['vinculo'] == 6049) &
    (df_rev['nome_rubrica'] == 'Contr.do Servidor Civil Ativo - Cedido - Dívida Ativa'),
    'nome_rubrica'] = 'Contr.Serv.Civil Ativo - Cedido - Dívida Ativa - Plano em Repartição'
df_rev.loc[
    (df_rev['vinculo'] == 6050) &
    (df_rev['nome_rubrica'] == 'Contr.do Servidor Civil Ativo - Cedido - Multas e Juros'),
    'nome_rubrica'] = 'Contr.Serv.Civil Ativo-Cedido-Multas e Juros - Plano em Capitalização'
df_rev.loc[
    (df_rev['vinculo'] == 6049) &
    (df_rev['nome_rubrica'] == 'Contr.Patronal - Serv. Afastados - Plano em Repartição'),
    'nome_rubrica'] = 'Contr.Patronal - Servidores Afastados - Plano em Repartição'
df_rev.loc[
    (df_rev['vinculo'] == 6049) &
    (df_rev['nome_rubrica'] == 'Contr.do Servidor Civil Ativo - Afastado'),
    'nome_rubrica'] = 'Contr.do Servidor Civil Ativo - Afastado - Plano em Repartição'
df_rev.loc[
    (df_rev['vinculo'] == 6050) &
    (df_rev['nome_rubrica'] == 'Contr.do Servidor Civil Ativo - Afastado'),
    'nome_rubrica'] = 'Contr.do Servidor Civil Ativo - Afastado - Plano em Capitalização'
df_rev.loc[df_rev['nome_rubrica'].str.contains('inativo|pensionista', case=False, regex=True)]


df_rev = df_rev[~((df_rev['nome_rubrica'].str.lower().str.contains('patr')) & (df_rev['nome_rubrica'].str.lower().str.contains('inativo')))]
df_rev = df_rev[~df_rev['nome_rubrica'].str.contains('637|750|805|supl', case=False, regex=True)]

In [6]:
# preparing to predict
df_pred_exp = pd.concat([df_7001, df_7002, df_7003], ignore_index=True)
df_pred_exp['col'] = df_pred_exp['Unid. Orçam.'].astype(str) + '-' + df_pred_exp['Proj/Ativ'].astype(str) + '-' + df_pred_exp['Elemento'].astype(str) + '-' + df_pred_exp['Vinc. Orçam.'].astype(str)
df_pred_exp = df_pred_exp[~((df_pred_exp['col'].isin(mass_segregation_cols_exp)) & (df_pred_exp['Comp.pagto.'] < '2022-05'))]

df_pred_rev = df_rev.copy().rename(columns={'Data': 'Comp.pagto.', 'valor_arrecadado': 'Result. pago'})
df_pred_rev['col'] = df_pred_rev['nome_rubrica'] + '-' + df_pred_rev['vinculo'].astype(str)
df_pred_rev = df_pred_rev[~((df_pred_rev['col'].isin(mass_segregation_cols_rev)) & (df_pred_rev['Comp.pagto.'] < '2022-05'))]
df_pred_rev = df_pred_rev[~((df_pred_rev['col'].str.contains('pensionista|inativo|926/2021', case=False, regex=True)) & (df_pred_rev['Comp.pagto.'] < '2022-01'))]
df_pred_rev = df_pred_rev[~((df_pred_rev['col'].str.contains('contr', case=False, regex=False)) & (df_pred_rev['vinculo'] == 6069) & (df_pred_rev['Comp.pagto.'].dt.year < 2022))]

df_pred = pd.concat([df_pred_exp, df_pred_rev], ignore_index=True)
df_pred['Comp.pagto.'] = df_pred['Comp.pagto.'].dt.to_period('M')
df_pred = df_pred[['Comp.pagto.', 'col', 'Result. pago']].groupby(['Comp.pagto.', 'col'], as_index=False).sum()
df_pred = df_pred.pivot(index='Comp.pagto.', columns='col', values='Result. pago')
df_pred = df_pred.iloc[:-1]
df_pred.reset_index(inplace=True)
df_pred['T'] = np.arange(1, len(df_pred)+1)
df_pred = fill_zero(df_pred)

In [7]:
# prediction interval
start = df_pred['Comp.pagto.'].max()
current_year = start.year if start.month != 12 else start.year + 1
target = pd.Period(f'{current_year + 4}-12', freq='M')
month_diff = (target - start).n
last_T = df_pred['T'].max()
X_prev = pd.DataFrame({'T': [last_T + val for val in range(1, month_diff + 1)]})
years = list(range(current_year, current_year+5))
months = [start.to_timestamp() + pd.DateOffset(months=i) for i in range(1, month_diff + 1)]

In [8]:
def run_models(df: pd.DataFrame, years: list[int], X_prev: pd.DataFrame, start: pd.Period, months: list[pd.Timestamp], month_diff: int, is_ma: bool = False) -> pd.DataFrame:
  df_models = pd.DataFrame({
      'Allocation': pd.Series(dtype='object'),
      'Model': pd.Series(dtype='object'),
      'R²': pd.Series(dtype='float'),
      'RMSE': pd.Series(dtype='float'),
      'MAE': pd.Series(dtype='float'),
      'Forecast': pd.Series(dtype='object'),
      **{year: pd.Series(dtype='float') for year in years}
  })

  def run_linear_models(X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series, df_aux: pd.DataFrame, model: str, col: str) -> pd.DataFrame:
    # df_xy = pd.DataFrame({'X': X.flatten(), 'y': y.values})

    # df_train, df_test = time_based_train_test_split(df_xy)
    # if df_train is None:
    #     return df_aux

    lr_model = LinearRegression()
    # lr_model.fit(df_train[['X']], df_train['y'])

    # # Evaluate
    # if df_test is not None:
    #     y_pred_test = lr_model.predict(df_test[['X']])
    #     r2 = r2_score(df_test['y'], y_pred_test)
    #     RMSE = np.sqrt(mean_squared_error(df_test['y'], y_pred_test))
    #     MAE = mean_absolute_error(df_test['y'], y_pred_test)
    # else:
    #     y_pred_train = lr_model.predict(df_train[['X']])
    #     r2 = r2_score(df_train['y'], y_pred_train)
    #     RMSE = np.sqrt(mean_squared_error(df_train['y'], y_pred_train))
    #     MAE = mean_absolute_error(df_train['y'], y_pred_train)

    lr_model.fit(X, y)
    forecast = lr_model.predict(X)
    r2 = r2_score(y, forecast)
    RMSE = np.sqrt(mean_squared_error(y, forecast))
    MAE = mean_absolute_error(y, forecast)

    if model == 'lin-lin':
      y_pred = lr_model.predict(X_prev.values)
    elif model == 'log-log':
      y_pred = np.exp(lr_model.predict(np.log(X_prev).values))
    elif model == 'lin-log':
      y_pred = np.exp(lr_model.predict(X_prev.values))
    elif model == 'log-lin':
      y_pred = lr_model.predict(np.log(X_prev).values)
    else:
      raise ValueError(f"Unknown model type: {model}")

    forecast_df = pd.DataFrame({'date': months, 'y_pred': y_pred})
    forecast_df['year'] = forecast_df['date'].dt.year
    forecast_df['month'] = forecast_df['date'].dt.month

    if is_ma:
      december_forecasts = forecast_df[forecast_df['month'] == 12]
      forecast_years = {year: val * 12 for year, val in zip(december_forecasts['year'], december_forecasts['y_pred'])}
    else:
      forecast_years = forecast_df.groupby('year')['y_pred'].sum().to_dict()
      if start.year in forecast_years:
        forecast_years[start.year] += df.loc[df['Comp.pagto.'].dt.year == start.year, col].sum()

    if check_negative_and_jump(forecast_years):
      return df_aux

    model_dict = {'Allocation': col, 'Model': model, 'R²': r2, 'RMSE': RMSE, 'MAE': MAE, 'Forecast': [forecast_df]}
    model_dict.update(forecast_years)
    df_aux = pd.concat([df_aux, pd.DataFrame(model_dict, index=[0])], ignore_index=True)
    return df_aux


  def run_prophet_model(df_aux: pd.DataFrame, col:str) -> pd.DataFrame:
    prophet_df = df[['Comp.pagto.', col]].copy().rename(columns={
        'Comp.pagto.': 'ds',
        col: 'y'
    }).dropna()

    # prophet_df = df[['Comp.pagto.', col]].rename(columns={
    #     'Comp.pagto.': 'ds',
    #     col: 'y'
    # }).dropna().reset_index(drop=True)

    if len(prophet_df) < 3:
      return df_aux

    prophet_df['ds'] = prophet_df['ds'].dt.to_timestamp()
    # prophet_df['floor'] = 0
    # prophet_df['cap'] = prophet_df['y'].max() * 1.1

    # df_train, df_test = time_based_train_test_split(prophet_df)
    # if df_train is None:
    #     return df_aux

    # len_test = len(df_test) if df_test is not None else 0

    model = build_prophet_model()

    # model = Prophet(yearly_seasonality=True)
    model.fit(prophet_df)
    # model.fit(df_train)

    # future = model.make_future_dataframe(periods=month_diff+len_test, freq='MS')
    future = model.make_future_dataframe(periods=month_diff, freq='MS')
    # future['floor'] = 0
    # future['cap'] = prophet_df['cap'].iloc[0]
    forecast = model.predict(future)

    # model.plot(forecast)

    # display(forecast)

    forecast_train = forecast[forecast['ds'].isin(prophet_df['ds'])]
    r2 = r2_score(prophet_df['y'], forecast_train['yhat'])
    RMSE = np.sqrt(mean_squared_error(prophet_df['y'], forecast_train['yhat']))
    MAE = mean_absolute_error(prophet_df['y'], forecast_train['yhat'])

    # if df_test is not None:
    #     forecast_test = forecast[['ds', 'yhat']].merge(df_test, on='ds', how='inner')
    #     r2 = r2_score(forecast_test['y'], forecast_test['yhat'])
    #     RMSE = np.sqrt(mean_squared_error(forecast_test['y'], forecast_test['yhat']))
    #     MAE = mean_absolute_error(forecast_test['y'], forecast_test['yhat'])
    # else:
    #     forecast_train = forecast[forecast['ds'].isin(prophet_df['ds'])]
    #     r2 = r2_score(prophet_df['y'], forecast_train['yhat'])
    #     RMSE = np.sqrt(mean_squared_error(prophet_df['y'], forecast_train['yhat']))
    #     MAE = mean_absolute_error(prophet_df['y'], forecast_train['yhat'])

    forecast_future = forecast[forecast['ds'] > prophet_df['ds'].max()]

    if is_ma:
      december_forecasts = forecast_future[forecast_future['ds'].dt.month == 12]
      forecast_years = {year: val * 12 for year, val in zip(december_forecasts['ds'].dt.year, december_forecasts['yhat'])}
    else:
      forecast_years = forecast_future.groupby(forecast_future['ds'].dt.year)['yhat'].sum().to_dict()
      if start.year in forecast_years:
        partial_actual_sum = prophet_df[prophet_df['ds'].dt.year == start.year]['y'].sum()
        forecast_years[start.year] += partial_actual_sum

    if check_negative_and_jump(forecast_years):
      return df_aux

    model_dict = {'Allocation': col, 'Model': 'prophet', 'R²': r2, 'RMSE': RMSE, 'MAE': MAE, 'Forecast': [forecast]}
    model_dict.update(forecast_years)
    df_aux = pd.concat([df_aux, pd.DataFrame(model_dict, index=[0])], ignore_index=True)
    return df_aux

  for col in df.columns.difference(['T', 'Comp.pagto.']):
    first_valid = df[col].first_valid_index()

    if first_valid is not None:
      X = df['T'][first_valid:].values.reshape(-1,1)
      y = df[col][first_valid:]

      if len(y) < 3 or y.eq(0).sum() / len(y) > 0.8:
        continue

      df_aux = pd.DataFrame()

      # lin-lin
      df_aux = run_linear_models(X, y, df_aux, 'lin-lin', col)

      # log-log
      if (y > 0).all():
        df_aux = run_linear_models(np.log(X), np.log(y), df_aux, 'log-log', col)

      # lin-log
      if (y > 0).all():
        df_aux = run_linear_models(X, np.log(y), df_aux, 'lin-log', col)

      # log-lin
      df_aux = run_linear_models(np.log(X), y, df_aux, 'log-lin', col)

      # prophet
      df_aux = run_prophet_model(df_aux, col)

      if df_aux.empty:
        continue

      # select the best model
      df_aux = df_aux.sort_values(by=['Allocation', 'R²', 'RMSE', 'MAE'], ascending=[True, False, True, True])
      df_models = pd.concat([df_models, df_aux], ignore_index=True)

  return df_models

In [10]:
# Set Prophet loggers to WARNING level
logging.getLogger('prophet').setLevel(logging.WARNING)
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)


# Run models separately
models_raw = run_models(df_pred, years, X_prev, start, months, month_diff)
models_ma12 = run_models(moving_averages(df_pred, 12), years, X_prev, start, months, month_diff, True)
models_ma36 = run_models(moving_averages(df_pred, 36), years, X_prev, start, months, month_diff, True)

# Combined results
combined = pd.concat([models_raw, models_ma12, models_ma36], keys=['raw', 'ma12', 'ma36'])
combined_normalized = combined.groupby('Allocation').apply(normalizes_calculates_score, include_groups=False)
combined_normalized.reset_index(inplace=True)
combined_normalized.drop(columns=['level_2'], inplace=True)

# chosen models
chosen_models = (
    combined_normalized
    .sort_values('score', ascending=False)
    .groupby('Allocation')
    .first()
    .rename(columns={'level_1': 'Type'})
)

In [11]:
# getting the history after cleaning
df_pred_year = (
    df_pred.drop(columns='T')
    .assign(year=df_pred['Comp.pagto.'].dt.year)
    .drop(columns='Comp.pagto.')
    .groupby('year')
    .sum()
    .T
)


# changing the name of the last column in the history, if necessary
# getting the last three columns of the history to calculate the average, if necessary
last_year_col = start.year
last_cols = df_pred_year.columns[-3:]
if start.month != 12:
  last_year_col = f"{str(start.year) + ' until ' + start.strftime('%B')}"
  df_pred_year.rename(columns={
      start.year: last_year_col
      }, inplace=True)
  last_cols = df_pred_year.columns.difference([last_year_col])[-3:]


# combining history with forecasting
df_past_future = pd.merge(df_pred_year, chosen_models[years], left_index=True, right_index=True, how='outer')


# calculating the average of the last three years for the columns not predicted by the models
df_past_future.loc[df_pred_year.index.difference(chosen_models.index), years] = df_pred_year.loc[df_pred_year.index.difference(chosen_models.index), last_cols].mean(axis=1)


if start.month != 12:
  # define conditions and multipliers
  rules = [
      {
          'pattern': '319001|319003|319011|319013|319113|319092|319094|339047',
          'starts_with': '7',
          'multiplier': 12,
          'pattern_negate': True
      },
      {
          'pattern': '319001|319003|319011|319013|319113',
          'starts_with': None,
          'multiplier': 13,
          'pattern_negate': False
      }
  ]

  for rule in rules:
      # pattern matching condition
      if rule['pattern_negate']:
          pattern_condition = ~df_past_future.index.str.contains(rule['pattern'], regex=True)
      else:
          pattern_condition = df_past_future.index.str.contains(rule['pattern'], regex=True)

      # optional: startswith condition
      startswith_condition = df_past_future.index.str.startswith(rule['starts_with']) if rule['starts_with'] else True

      # full condition
      base_proj = df_past_future[last_year_col] / start.month * rule['multiplier']
      condition = pattern_condition & startswith_condition & (base_proj > df_past_future[current_year] * 1.03)

      # ratio to scale other years
      ratio = base_proj / df_past_future[current_year]

      # assign values
      df_past_future.loc[condition, years[0]] = base_proj
      for year in years[1:]:
          df_past_future.loc[condition, year] = df_past_future[year] * ratio


# adjusting the forecast for the current year, if the forecast value is lower than the value calculated in the year
df_past_future.loc[df_past_future[current_year] < df_past_future[last_year_col], current_year] = df_past_future[last_year_col]

In [12]:
chosen_models.drop('Forecast', axis=1).to_excel('chosen_models.xlsx')
df_past_future.to_excel('df_past_future.xlsx')

In [88]:
df_past_future.loc['7001-2529-339035-6069']

Unnamed: 0,7001-2529-339035-6069
2011,0.0
2012,14640.0
2013,0.0
2014,0.0
2015,0.0
2016,0.0
2017,0.0
2018,0.0
2019,0.0
2020,6875.0


In [87]:
df_past_future.loc[(df_past_future.index.str.contains('319001|319003|319011|319013|319113', regex=True)), [2021, 2022, 2023, 2024, last_year_col] + years]

Unnamed: 0,2021,2022,2023,2024,2025 until March,2025,2026,2027,2028,2029
7001-2680-319013-6069,44221.25,54201.07,63302.02,58012.06,1461.83,61696.77,62483.09,67837.06,73130.52,87335.19
7001-2681-319113-6069,2535002.0,1678406.0,2417394.0,2199954.0,338399.99,1980329.0,1948212.0,2065821.0,1936926.0,2157318.0
7001-4396-319011-6069,10758930.0,10951800.0,13054200.0,14041960.0,3239758.94,14038960.0,15446930.0,17636220.0,18586580.0,21383370.0
7002-2736-319001-6049,43020080.0,46411480.0,52428610.0,65510670.0,15120763.51,65523310.0,68622780.0,71667700.0,74662170.0,77609760.0
7002-2736-319003-6049,0.0,4828310.0,6707062.0,6939249.0,1672863.24,7197618.0,7455460.0,7722539.0,7999185.0,8285741.0
7002-2738-319001-6049,5297739.0,6981315.0,8453408.0,9378194.0,2229016.84,11015860.0,13843350.0,17396590.0,21861850.0,27473240.0
7002-2738-319003-6049,0.0,277343.8,391835.5,395945.4,87854.31,392367.1,387845.7,383645.9,379727.9,376058.6
7002-2740-319001-6049,288298600.0,305834500.0,332452000.0,341857100.0,79188525.6,337148600.0,350024800.0,362567700.0,374804900.0,386760200.0
7002-2740-319003-6049,0.0,60441450.0,79081220.0,82044690.0,19157816.2,83485900.0,85192450.0,86933890.0,88710920.0,90524280.0
7002-2742-319001-6049,242306000.0,272728500.0,308366400.0,325843900.0,76286211.35,325527300.0,354242500.0,393951700.0,420795700.0,472627500.0


In [13]:
df_past_future.loc[(df_past_future.index.str.contains('319001|319003|319011|319013|319113', regex=True)) & (df_past_future[last_year_col]/start.month*13 > df_past_future[current_year]*1.03), [2021, 2022, 2023, 2024, last_year_col] + years]


Unnamed: 0,2021,2022,2023,2024,2025 until March,2025,2026,2027,2028,2029


In [14]:
df_past_future.loc[
    (~df_past_future.index.str.contains('319001|319003|319011|319013|319113|319092|319094|339047', regex=True)) &
    (df_past_future.index.str.startswith('7')) &
    (df_past_future[last_year_col] / start.month * 12 > df_past_future[current_year]*1.03), [last_year_col] + years]

Unnamed: 0,2025 until March,2025,2026,2027,2028,2029


In [78]:
df_adj[years]

Unnamed: 0,2025,2026,2027,2028,2029
7001-2529-339030-6069,14579.04,12314.219624,10401.233891,8785.426097,7420.630333
7001-2529-339033-6069,66156.56,70957.391531,76106.608518,81629.492505,87553.159658
7001-2529-339035-6069,171600.0,61492.040808,67920.849871,74349.658935,80778.467999
7001-2529-339093-6069,4794.48,4862.927687,4927.224406,4987.844964,5045.187094
7001-2882-339039-6069,68952.0,54303.247105,43390.517721,35118.379899,28750.276562
7001-4396-319016-6069,133507.0,143592.531216,137649.403382,132431.984417,109417.786159
7001-4396-339039-6069,133743.2,165942.995392,193285.708647,218047.753519,251206.860016


In [71]:
len(years)

5

In [75]:
if start.month != 12:
  # Define conditions and multipliers
  rules = [
      {
          'pattern': '(319001|319003|319011|319013|319113|319092|319094|339047)',
          'starts_with': '7',
          'multiplier': 12,
          'pattern_negate': True
      },
      {
          'pattern': '(319001|319003|319011|319013|319113)',
          'starts_with': None,
          'multiplier': 13,
          'pattern_negate': False
      }
  ]

  for rule in rules:
      # Pattern matching condition
      if rule['pattern_negate']:
          pattern_condition = ~df_past_future.index.str.contains(rule['pattern'], regex=True)
      else:
          pattern_condition = df_past_future.index.str.contains(rule['pattern'], regex=True)

      # Optional: startswith condition
      startswith_condition = df_past_future.index.str.startswith(rule['starts_with']) if rule['starts_with'] else True

      # Full condition
      base_proj = df_past_future[last_year_col] / start.month * rule['multiplier']
      condition = pattern_condition & startswith_condition & (base_proj > df_past_future[current_year] * 1.03)

      # Ratio to scale other years
      ratio = base_proj / df_past_future[current_year]

      # Assign values
      df_past_future.loc[condition, years[0]] = base_proj
      for year in years[1:]:
          df_past_future.loc[condition, year] = df_past_future[year] * ratio



In [None]:
df_past_future.loc[
    (~df_past_future.index.str.contains('319001|319003|319011|319013|319113|319092|319094|339047', regex=True)) &
    (df_past_future.index.str.startswith('7')) &
    (df_past_future[last_year_col] / start.month * 12 > df_past_future[current_year]*1.03),
    years] = df_past_future[last_year_col]/start.month*12, df_past_future[years[1]]*(df_past_future[last_year_col]/start.month*12)/(df_past_future[current_year])
df_past_future

In [69]:
df_adj1 = df_past_future.loc[
    (~df_past_future.index.str.contains('319001|319003|319011|319013|319113|319092|319094|339047', regex=True)) &
    (df_past_future.index.str.startswith('7')) &
    (df_past_future[last_year_col] / start.month * 12 > df_past_future[current_year]*1.03),
    [2021, 2022, 2023, 2024, last_year_col] + years].copy()
df_adj1['adj'] = df_past_future[last_year_col]/start.month*12
df_adj1

Unnamed: 0,2021,2022,2023,2024,2025 until March,2025,2026,2027,2028,2029,adj
7001-2529-339030-6069,10492.12,18388.62,17560.01,6158.83,3644.76,11266.813028,9516.539503,8038.166951,6789.456179,5734.729757,14579.04
7001-2529-339033-6069,4500.48,39107.8,23161.31,20401.12,16539.14,26225.81233,28128.960056,30170.214897,32359.598972,34707.861685,66156.56
7001-2529-339035-6069,21500.0,0.0,23625.0,14300.0,42900.0,42900.0,15373.010202,16980.212468,18587.414734,20194.617,171600.0
7001-2529-339093-6069,0.0,230.0,0.0,0.0,1198.62,1823.84813,1849.886029,1874.344876,1897.40529,1919.21857,4794.48
7001-2882-339039-6069,0.0,0.0,90576.0,20114.4,17238.0,22530.052137,17743.575074,14177.842942,11474.923571,9394.147086,68952.0
7001-4396-319016-6069,26041.53,132073.55,51750.08,26457.05,33376.75,75628.26918,81341.462268,77974.833764,75019.300602,61982.351372,133507.0
7001-4396-339039-6069,23698.84,31029.34,194028.51,123591.76,33435.8,107643.975907,133560.164539,155567.102961,175496.975745,202185.27138,133743.2


In [59]:
chosen_models.drop('Forecast', axis=1)

Unnamed: 0_level_0,Type,Model,R²,RMSE,MAE,2025,2026,2027,2028,2029,r2_norm,rmse_norm,mae_norm,score
Allocation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
7001-2529-332039-6069,ma36,lin-log,0.061851,1.198777e-01,0.088822,2.712569e+03,2.666066e+03,2.620359e+03,2.575437e+03,2.531284e+03,1.000000,1.000000,1.000000,1.000000
7001-2529-339014-6069,ma36,prophet,0.649003,1.677231e+02,138.344115,1.775544e+04,1.939866e+04,2.155583e+04,2.391089e+04,2.744502e+04,1.000000,0.883809,0.872283,0.939023
7001-2529-339030-6069,ma36,lin-log,0.821450,2.328937e-01,0.190652,1.126681e+04,9.516540e+03,8.038167e+03,6.789456e+03,5.734730e+03,1.000000,1.000000,1.000000,1.000000
7001-2529-339033-6069,ma36,lin-log,0.512938,2.036282e-01,0.171847,2.622581e+04,2.812896e+04,3.017021e+04,3.235960e+04,3.470786e+04,0.963144,1.000000,1.000000,0.981572
7001-2529-339035-6069,ma36,lin-lin,0.664243,2.840366e+02,225.779049,1.376581e+04,1.537301e+04,1.698021e+04,1.858741e+04,2.019462e+04,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Restituições de Servidores - RPPS - Plano em Repartição-6049,ma12,log-log,0.738948,2.103500e-02,0.017113,1.293877e+06,9.151925e+05,6.610764e+05,4.864827e+05,3.639871e+05,1.000000,1.000000,1.000000,1.000000
Restituições de Servidores - RPPS-6050,ma36,log-log,0.781535,1.256413e-03,0.001239,5.435100e+04,5.310188e+04,5.195467e+04,5.089576e+04,4.991398e+04,1.000000,1.000000,1.000000,1.000000
Restituições de Servidores - RPPS-6069,ma36,log-log,0.304223,6.190110e-02,0.046572,6.030157e+02,5.083199e+02,4.329572e+02,3.721666e+02,3.225399e+02,1.000000,1.000000,1.000000,1.000000
Serv. de Cópias Xerográf. e/ou Cópias Heliográf. - Taxa de Adm. RPPS-6069,ma36,log-log,0.864373,2.154368e-01,0.164339,3.983014e+02,2.841420e+02,2.068970e+02,1.534088e+02,1.156038e+02,0.000000,1.000000,1.000000,0.500000


In [25]:
chosen_models

Unnamed: 0_level_0,Type,Model,R²,RMSE,MAE,Forecast,2025,2026,2027,2028,2029,r2_norm,rmse_norm,mae_norm,score
Allocation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
7001-2529-332039-6069,ma36,lin-log,0.061851,1.198777e-01,0.088822,date y_pred year month 0 2025...,2.712569e+03,2.666066e+03,2.620359e+03,2.575437e+03,2.531284e+03,1.000000,1.000000,1.000000,1.000000
7001-2529-339014-6069,ma36,prophet,0.649003,1.677231e+02,138.344115,ds trend yhat_lower yha...,1.775544e+04,1.939866e+04,2.155583e+04,2.391089e+04,2.744502e+04,1.000000,0.883809,0.872283,0.939023
7001-2529-339030-6069,ma36,lin-log,0.821450,2.328937e-01,0.190652,date y_pred year month 0 202...,1.126681e+04,9.516540e+03,8.038167e+03,6.789456e+03,5.734730e+03,1.000000,1.000000,1.000000,1.000000
7001-2529-339033-6069,ma36,prophet,0.731058,2.241076e+02,187.679548,ds trend yhat_lower yha...,2.942119e+04,3.919616e+04,5.064797e+04,5.459388e+04,6.300469e+04,1.000000,0.910111,0.893587,0.950925
7001-2529-339035-6069,ma36,prophet,0.949425,1.102378e+02,85.132627,ds trend yhat_lower yha...,2.290514e+04,3.950959e+04,6.535146e+04,6.733380e+04,9.271123e+04,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Restituições de Sobra de Adiantamento de Numerário - RPPS-6069,ma36,log-log,0.929518,4.801741e-02,0.040346,date y_pred year month 0 2025...,8.555661e+02,4.132221e+02,2.085842e+02,1.094850e+02,5.950681e+01,1.000000,1.000000,1.000000,1.000000
Serv. de Cópias Xerográf. e/ou Cópias Heliográf. - Taxa de Adm. RPPS-6069,ma36,log-log,0.864373,2.154368e-01,0.164339,date y_pred year month 0 2025-...,3.983014e+02,2.841420e+02,2.068970e+02,1.534088e+02,1.156038e+02,0.131426,1.000000,1.000000,0.565713
Sobra de Adiantamento de Numerário - Taxa de Administração do RPPS-6069,raw,log-lin,0.001896,6.138878e+02,478.834809,date y_pred year month 0 2025...,4.486374e+03,6.445316e+03,7.328230e+03,8.159273e+03,8.944205e+03,1.000000,1.000000,0.000000,0.750000
Taxa de Administração - INTRA ORCAMENTÄRIA-6069,ma12,lin-log,0.994347,1.422116e-02,0.014182,date y_pred year month 0 202...,2.019899e+04,2.667726e+03,3.523324e+02,4.653332e+01,6.145758e+00,1.000000,1.000000,1.000000,1.000000


In [None]:
chosen_models.reset_index(inplace=True)
chosen_cols = ['Allocation', 'Type', 'Model', 'R²', 'RMSE', 'MAE', 2025, 2026, 2027, 2028, 2029, 'r2_norm', 'rmse_norm', 'mae_norm', 'score']
chosen_models.loc[chosen_models['Allocation'].str.contains('6069'), chosen_cols]

Unnamed: 0,Allocation,Type,Model,R²,RMSE,MAE,2025,2026,2027,2028,2029,r2_norm,rmse_norm,mae_norm,score
0,7001-2529-332039-6069,ma36,lin-log,0.061851,0.119878,0.088822,2712.569,2666.066,2620.359,2575.437,2531.284,1.0,1.0,1.0,1.0
1,7001-2529-339014-6069,ma36,prophet,0.649003,167.7231,138.344115,17755.44,19398.66,21555.83,23910.89,27445.02,1.0,0.883809,0.872283,0.939023
2,7001-2529-339030-6069,ma36,lin-log,0.82145,0.232894,0.190652,11266.81,9516.54,8038.167,6789.456,5734.73,1.0,1.0,1.0,1.0
3,7001-2529-339033-6069,ma36,prophet,0.731058,224.107557,187.679548,29421.19,39196.16,50647.97,54593.88,63004.69,1.0,0.910111,0.893587,0.950925
4,7001-2529-339035-6069,ma36,prophet,0.949425,110.237812,85.132627,22905.14,39509.59,65351.46,67333.8,92711.23,1.0,1.0,1.0,1.0
5,7001-2529-339037-6069,ma12,prophet,0.058706,5326.048775,4303.709737,339317.0,401866.6,386890.1,375894.1,270490.2,0.616341,0.683923,0.644597,0.6403
6,7001-2529-339039-6069,ma36,prophet,0.997785,2580.603328,2019.381529,398488.7,450682.0,601407.2,484871.5,482000.9,1.0,0.958211,0.954244,0.978114
7,7001-2529-339040-6069,ma36,prophet,0.999777,54.4286,43.402792,279531.8,380353.7,930886.3,970286.4,2485364.0,1.0,0.99388,0.992577,0.996614
8,7001-2529-339092-6069,ma36,lin-lin,0.140065,585.588253,448.70295,16695.03,15736.54,14778.05,13819.57,12861.08,1.0,0.898619,0.832308,0.932732
9,7001-2529-339093-6069,ma36,prophet,0.620555,98.347877,74.667809,329.9317,2203.012,3823.53,3922.363,3843.658,1.0,1.0,1.0,1.0


In [None]:
models_debug = run_models(moving_averages(df_pred[
    ['Comp.pagto.', 'T', '7001-4396-319011-6069', '7002-2740-319001-6049', '7003-2764-319001-6050',
     'Contr. Patronal - Servidor Ativo - Plano em Repartição - Centralizada-6049', 'Encargo Social - LC n.º 926/2021 - Plano em Repartição - Centralizada-6049',
     'Contr.do Serv. Civil-Pensionistas-Plano em Capitalização-Centralizada-6050', 'Contr.do Servidor Civil Inativo -Plano em Capitalização - Centralizada-6050']], 12), years, X_prev, start, months, month_diff)

******************** TCol ********************


col,Comp.pagto.,T,7001-4396-319011-6069
23,2012-12,24,6.795173e+05
24,2013-01,25,6.816804e+05
25,2013-02,26,6.861236e+05
26,2013-03,27,6.885683e+05
27,2013-04,28,6.937934e+05
...,...,...,...
166,2024-11,167,1.171851e+06
167,2024-12,168,1.170163e+06
168,2025-01,169,1.167841e+06
169,2025-02,170,1.257434e+06


******************** prophet 7001-4396-319011-6069 ********************


col,ds,y
23,2012-12,6.795173e+05
24,2013-01,6.816804e+05
25,2013-02,6.861236e+05
26,2013-03,6.885683e+05
27,2013-04,6.937934e+05
...,...,...
166,2024-11,1.171851e+06
167,2024-12,1.170163e+06
168,2025-01,1.167841e+06
169,2025-02,1.257434e+06


******************** TCol ********************


col,Comp.pagto.,T,7002-2740-319001-6049
11,2011-12,12,1.101991e+07
12,2012-01,13,1.149820e+07
13,2012-02,14,1.199013e+07
14,2012-03,15,1.207417e+07
15,2012-04,16,1.218370e+07
...,...,...,...
166,2024-11,167,2.850520e+07
167,2024-12,168,2.848810e+07
168,2025-01,169,2.849291e+07
169,2025-02,170,3.069196e+07


******************** prophet 7002-2740-319001-6049 ********************


col,ds,y
11,2011-12,1.101991e+07
12,2012-01,1.149820e+07
13,2012-02,1.199013e+07
14,2012-03,1.207417e+07
15,2012-04,1.218370e+07
...,...,...
166,2024-11,2.850520e+07
167,2024-12,2.848810e+07
168,2025-01,2.849291e+07
169,2025-02,3.069196e+07


******************** TCol ********************


col,Comp.pagto.,T,7003-2764-319001-6050
11,2011-12,12,546.313333
12,2012-01,13,1018.002500
13,2012-02,14,2792.070833
14,2012-03,15,3011.467500
15,2012-04,16,3236.028333
...,...,...,...
166,2024-11,167,607831.895000
167,2024-12,168,627734.675833
168,2025-01,169,679674.151667
169,2025-02,170,731428.725833


******************** prophet 7003-2764-319001-6050 ********************


col,ds,y
11,2011-12,546.313333
12,2012-01,1018.002500
13,2012-02,2792.070833
14,2012-03,3011.467500
15,2012-04,3236.028333
...,...,...
166,2024-11,607831.895000
167,2024-12,627734.675833
168,2025-01,679674.151667
169,2025-02,731428.725833


******************** TCol ********************


col,Comp.pagto.,T,Contr. Patronal - Servidor Ativo - Plano em Repartição - Centralizada-6049
95,2018-12,96,7.706179e+06
96,2019-01,97,7.519489e+06
97,2019-02,98,7.421794e+06
98,2019-03,99,7.304220e+06
99,2019-04,100,7.203907e+06
...,...,...,...
166,2024-11,167,5.056383e+06
167,2024-12,168,4.943410e+06
168,2025-01,169,4.887081e+06
169,2025-02,170,5.240495e+06


******************** prophet Contr. Patronal - Servidor Ativo - Plano em Repartição - Centralizada-6049 ********************


col,ds,y
95,2018-12,7.706179e+06
96,2019-01,7.519489e+06
97,2019-02,7.421794e+06
98,2019-03,7.304220e+06
99,2019-04,7.203907e+06
...,...,...
166,2024-11,5.056383e+06
167,2024-12,4.943410e+06
168,2025-01,4.887081e+06
169,2025-02,5.240495e+06


******************** TCol ********************


col,Comp.pagto.,T,Contr.do Serv. Civil-Pensionistas-Plano em Capitalização-Centralizada-6050
147,2023-04,148,428904.3475
148,2023-05,149,427493.481667
149,2023-06,150,429055.080833
150,2023-07,151,427861.529167
151,2023-08,152,428244.4375
152,2023-09,153,428784.148333
153,2023-10,154,427712.7875
154,2023-11,155,426834.028333
155,2023-12,156,425101.743333
156,2024-01,157,424255.0125


******************** prophet Contr.do Serv. Civil-Pensionistas-Plano em Capitalização-Centralizada-6050 ********************


col,ds,y
147,2023-04,428904.3475
148,2023-05,427493.481667
149,2023-06,429055.080833
150,2023-07,427861.529167
151,2023-08,428244.4375
152,2023-09,428784.148333
153,2023-10,427712.7875
154,2023-11,426834.028333
155,2023-12,425101.743333
156,2024-01,424255.0125


******************** TCol ********************


col,Comp.pagto.,T,Contr.do Servidor Civil Inativo -Plano em Capitalização - Centralizada-6050
143,2022-12,144,261349.5275
144,2023-01,145,268448.316667
145,2023-02,146,276007.383333
146,2023-03,147,282722.7825
147,2023-04,148,288219.17
148,2023-05,149,294115.095
149,2023-06,150,302215.1775
150,2023-07,151,308042.795833
151,2023-08,152,315863.0525
152,2023-09,153,323890.828333


******************** prophet Contr.do Servidor Civil Inativo -Plano em Capitalização - Centralizada-6050 ********************


col,ds,y
143,2022-12,261349.5275
144,2023-01,268448.316667
145,2023-02,276007.383333
146,2023-03,282722.7825
147,2023-04,288219.17
148,2023-05,294115.095
149,2023-06,302215.1775
150,2023-07,308042.795833
151,2023-08,315863.0525
152,2023-09,323890.828333


******************** TCol ********************


col,Comp.pagto.,T,Encargo Social - LC n.º 926/2021 - Plano em Repartição - Centralizada-6049
143,2022-12,144,83139010.0
144,2023-01,145,83974090.0
145,2023-02,146,84334950.0
146,2023-03,147,84901980.0
147,2023-04,148,84982350.0
148,2023-05,149,85412580.0
149,2023-06,150,85963210.0
150,2023-07,151,86712800.0
151,2023-08,152,87024810.0
152,2023-09,153,87452340.0


******************** prophet Encargo Social - LC n.º 926/2021 - Plano em Repartição - Centralizada-6049 ********************


col,ds,y
143,2022-12,83139010.0
144,2023-01,83974090.0
145,2023-02,84334950.0
146,2023-03,84901980.0
147,2023-04,84982350.0
148,2023-05,85412580.0
149,2023-06,85963210.0
150,2023-07,86712800.0
151,2023-08,87024810.0
152,2023-09,87452340.0


In [None]:
rubricas = df_pred_rev.columns.get_level_values('nome_rubrica').str.lower()
mask = rubricas.str.contains('682')# & ~rubricas.str.contains('outr') & ~rubricas.str.contains('afast')
df_pred_rev.loc[:, df_pred_rev.columns[mask]]

vinculo,6050,6050,6050,6050,6050,6050
nome_rubrica,Parcel. Déb. - Patr. - Termo 00682/18 - LEI 12.371/2018 - Centralizada,Parcel. Déb. - Patr. - Termo 00682/18 - LEI 12.371/2018 - DEMHAB,Parcel. Déb. - Patr. - Termo 00682/18 - LEI 12.371/2018 - DMLU,Parcel. Déb. - Patr. - Termo 00682/18 - LEI 12.371/2018 - FASC,Parcel. Déb. Patr. - Termo 682/18 - LEI 12.371/2018-DMLU-Multa e Juros,Parcel. Déb. Patr. - Termo 682/18 - LEI 12.371/2018-FASC-Multa e Juros
Data,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2018-01,,,,,,
2018-02,,,,,,
2018-03,,,,,,
2018-04,,,,,,
2018-05,,,,,,
...,...,...,...,...,...,...
2024-11,,449.98,294.68,,,
2024-12,1017166.12,,751.19,2491.54,,
2025-01,518527.81,460.79,301.76,,,
2025-02,519359.37,461.53,302.24,2542.30,,52.42
