# Predictions

In [55]:
import pandas as pd
import sketch

import lightgbm as lgb 
from sklearn.metrics import r2_score, mean_absolute_error
import pickle

In [9]:
# NOTE(review): leftover interactive helper call. Its execution count (9) is lower than
# that of the cell that creates df_moex (24), so on a fresh kernel this line presumably
# raises NameError under Restart & Run All - confirm and remove or move below the load cell.
df_moex.sketch.howto("Rename an unnamed column")

In [24]:
# Read clean_data/pred_moex.csv and give its two columns descriptive names:
# the unnamed first CSV column becomes "quarter" and "value" becomes "moex".
df_moex = (
    pd.read_csv("clean_data/pred_moex.csv")
    .rename(columns={'Unnamed: 0': "quarter", "value": "moex"})
)
df_moex.head(3)

Unnamed: 0,quarter,moex
0,2022-01-01,461.0
1,2022-04-01,461.0
2,2022-07-01,461.0


In [21]:
# Read clean_data/pred_bond.csv and give its two columns descriptive names:
# the unnamed first CSV column becomes "quarter" and "value" becomes "bond".
df_bond = (
    pd.read_csv("clean_data/pred_bond.csv")
    .rename(columns={'Unnamed: 0': "quarter", "value": "bond"})
)
df_bond.head(3)

Unnamed: 0,quarter,bond
0,2022-01-01,63606320000.0
1,2022-04-01,88043850000.0
2,2022-07-01,79018860000.0


In [22]:
# Read clean_data/pred_count_people.csv and give its two columns descriptive names:
# the unnamed first CSV column becomes "quarter" and "value" becomes "ppl".
df_ppl = (
    pd.read_csv("clean_data/pred_count_people.csv")
    .rename(columns={'Unnamed: 0': "quarter", "value": "ppl"})
)
df_ppl.head(3)

Unnamed: 0,quarter,ppl
0,2022-01-01,4496.70867
1,2022-04-01,4417.655017
2,2022-07-01,4523.592399


In [23]:
# Read clean_data/pred_unemploy.csv and give its two columns descriptive names:
# the unnamed first CSV column becomes "quarter" and "value" becomes "employ".
df_employ = (
    pd.read_csv("clean_data/pred_unemploy.csv")
    .rename(columns={'Unnamed: 0': "quarter", "value": "employ"})
)
df_employ.head(3)

Unnamed: 0,quarter,employ
0,2022-01-01,13778.757049
1,2022-04-01,13641.73742
2,2022-07-01,13687.231508


In [26]:
# Combine the four forecast tables into one frame keyed by "quarter".
# Each merge uses the default (inner) join, so only quarters present in every
# table are kept.
pred_df = (
    df_moex
    .merge(df_bond, on='quarter')
    .merge(df_ppl, on='quarter')
    .merge(df_employ, on='quarter')
)
pred_df.head()

Unnamed: 0,quarter,moex,bond,ppl,employ
0,2022-01-01,461.0,63606320000.0,4496.70867,13778.757049
1,2022-04-01,461.0,88043850000.0,4417.655017,13641.73742
2,2022-07-01,461.0,79018860000.0,4523.592399,13687.231508
3,2022-10-01,461.0,69268840000.0,4536.16598,13929.10598
4,2023-01-01,461.0,72252150000.0,4559.398183,14181.101295


In [32]:
# Replace the "quarter" date column with numeric year / month / day columns.
# The strings are parsed once and the three parts are read from the same result.
# NOTE(review): the cell drops "quarter" in place, so it cannot be run twice
# without first re-running the merge cell.
quarter_ts = pd.to_datetime(pred_df.quarter)
for date_part in ('year', 'month', 'day'):
    pred_df[date_part] = getattr(quarter_ts.dt, date_part)
pred_df.drop(columns=['quarter'], inplace=True)
pred_df.head(3)

Unnamed: 0,moex,bond,ppl,employ,year,month,day
0,461.0,63606320000.0,4496.70867,13778.757049,2022,1,1
1,461.0,88043850000.0,4417.655017,13641.73742,2022,4,1
2,461.0,79018860000.0,4523.592399,13687.231508,2022,7,1


In [53]:
# Load the pickled parameter mapping that is later unpacked into LGBMRegressor.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('model/placeholder.pkl', 'rb') as params_file:
    params = pickle.load(params_file)
# Presumably -1 silences LightGBM's log output - confirm against the LightGBM docs.
params["verbose"] = -1

In [39]:
quarterly_data = pd.read_csv('clean_data/quarterly_data.csv', index_col=0)

# Values of the `cluster` and `slctn_nmbr` columns to split the data by.
cluster_vals = range(4)
slctn_number_vals = range(4)

# Every column except the two grouping keys.
columns_to_keep = quarterly_data.columns.difference(['cluster', 'slctn_nmbr'])

# Nested list of sub-frames. NOTE the index order: the OUTER index is the cluster
# and the INNER index is the selection number, i.e.
#   dataframes_slctn_cluster[cluster][slctn_nmbr]
dataframes_slctn_cluster = []
for cluster_val in cluster_vals:
    in_cluster = quarterly_data['cluster'] == cluster_val
    frames_for_cluster = []
    for slctn_number_val in slctn_number_vals:
        in_selection = quarterly_data['slctn_nmbr'] == slctn_number_val
        subset = quarterly_data.loc[in_cluster & in_selection, columns_to_keep]
        #subset['prev_npo_sum'] = subset['npo_sum'].shift(1,fill_value=subset.iloc[0]['npo_sum'])
        frames_for_cluster.append(subset)
    dataframes_slctn_cluster.append(frames_for_cluster)

In [43]:
def train_test_split(df, target="npo_sum", train_frac=0.8):
    """Split a frame positionally into train/test features and target.

    The first ``int(len(df) * train_frac)`` rows form the training set and the
    remaining rows form the test set. Rows are not shuffled, so the original
    row order is preserved in both parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    target : str, default "npo_sum"
        Name of the column to predict; every other column is a feature.
    train_frac : float, default 0.8
        Fraction of rows (from the top) assigned to the training set.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"target column {target!r} not found in DataFrame")

    train_size = int(len(df) * train_frac)
    train_data, test_data = df.iloc[:train_size, :], df.iloc[train_size:, :]

    # Boolean mask over the columns: True for every feature, False for the target.
    is_feature = df.columns != target
    X_train = train_data.loc[:, is_feature]
    y_train = train_data.loc[:, target]
    X_test = test_data.loc[:, is_feature]
    y_test = test_data.loc[:, target]
    return X_train, y_train, X_test, y_test

# Split the sub-frame stored at dataframes_slctn_cluster[3][3] into train/test
# features (X_*) and the npo_sum target (y_*).
X_train, y_train, X_test, y_test = train_test_split(dataframes_slctn_cluster[3][3])

In [54]:
# Fit a LightGBM regressor with the loaded parameters and score it on the
# held-out rows.
# NOTE(review): the recorded output of this cell is a negative R2 (about -0.80),
# i.e. worse than always predicting the mean of y_test - the model should be
# revisited before its forecasts are relied on.
model = lgb.LGBMRegressor(**params).fit(X_train, y_train)
pred = model.predict(X_test)
r2_score(y_true=y_test, y_pred=pred)

-0.8011411548052882

In [58]:
# Forecast with the currently fitted model and store the result next to the inputs.
# Only non-prediction columns are passed as features: feeding the whole frame
# would include any prediction_* column already added to pred_df (for example by
# re-running this cell), changing the number of features the model receives.
# NOTE(review): assumes the remaining pred_df columns match the training features
# in number and order - confirm against X_train.columns.
feature_cols = [col for col in pred_df.columns if not str(col).startswith("prediction_")]
pred_df["prediction_S3_C3"] = model.predict(pred_df[feature_cols])

In [None]:
%%time
predictions_per_selection = [None]*4
for i in slctn_number_vals:
    best_trials_slctn_clusters[i] = [None]*4
    for j in cluster_vals:
        print("--------------")
        print(f"Current Selection Number is {i} and Cluster Number is {j}")
        X_train, y_train, X_test, y_test = train_test_split(dataframes_slctn_cluster[i][j])
        with open(f'model/model_params_{i}_{j}.pkl', 'rb') as f:
            params = pickle.load(f)
        params["verbose"] = -1
        model = lgb.LGBMRegressor(**params, )
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        r2 = r2_score(y_test, pred)
        mae = mean_absolute_error(y_test, pred)
        print(f"R2 = {r2}\tMAE={mae}")
        best_trials_slctn_clusters[i][j] = pred
    pred_df[f"prediction_S{i}"] = model.predict(sum(best_trials_slctn_clusters[i]))

In [50]:
def pension_calc(start_payment, payment, age_start, age_finish, retirement_years, percent=0.06,):
    """Estimate the per-month payout of a savings plan with compound growth.

    The accumulated capital is the future value of one ``payment`` per year
    (ordinary annuity) plus the compounded ``start_payment``. That capital is
    then spread evenly over ``12 * retirement_years`` payouts.

    Parameters
    ----------
    start_payment : float
        Initial lump sum, compounded for the whole accumulation period.
    payment : float
        Regular contribution; the formula treats it as one contribution per year.
    age_start, age_finish : float
        Ages bounding the accumulation period (``years = age_finish - age_start``).
    retirement_years : float
        Number of years over which the capital is paid out. Must be non-zero.
    percent : float, default 0.06
        Growth rate per year as a fraction (0.06 means 6%).

    Returns
    -------
    float
        Accumulated capital divided by ``12 * retirement_years``.

    Raises
    ------
    ZeroDivisionError
        If ``retirement_years`` is 0.
    """
    years = age_finish - age_start
    growth = (1 + percent) ** years
    if percent == 0:
        # Limit of ((1 + r)**n - 1) / r as r -> 0 is n; avoids dividing by zero.
        V = payment * years
    else:
        V = payment * (growth - 1) / percent
    V += start_payment * growth
    pension = V / 12 / retirement_years
    return pension