In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import plotly.graph_objects as go

## More stupid Preprocessing

In [2]:
INFLATION_FILEPATH = "Processed Datasets/inflation.csv"
INTEREST_FILEPATH = "Processed Datasets/interest.csv"
CPI_FILEPATH = 'Processed Datasets/reformatted/PHILIPPINES.csv'

# PROCESSED_OUTPUT_DIR = 'Processed Datasets/'
# PROCESSED_DATASET_SPLIT_STAGE_DIR = PROCESSED_OUTPUT_DIR + 'reformatted/'
# FORECAST_DATASET_DIR = "Forecasts/"


In [3]:
# files = [f for f in os.listdir(PROCESSED_DATASET_SPLIT_STAGE_DIR) if os.path.isfile(os.path.join(PROCESSED_DATASET_SPLIT_STAGE_DIR, f))]

In [4]:
inflation_csv = pd.read_csv(INFLATION_FILEPATH)
interest_csv = pd.read_csv(INTEREST_FILEPATH)
cpi_csv = pd.read_csv(CPI_FILEPATH)


In [5]:
cpi_csv['Date'] = pd.to_datetime(cpi_csv['Date'], format='%Y-%m')
cpi_csv['Year'] = cpi_csv['Date'].dt.year
cpi_csv['Month'] = cpi_csv['Date'].dt.month

cpi_csv.set_index('Date')
cpi_csv

Unnamed: 0.1,Unnamed: 0,Date,Food_and_drinks,Alcohol_and_tobacco,Household_consumable_goods,Medication,Stationery,Year,Month
0,0,2018-01-01,97.1,92.0,98.3,98.6,98.1,2018,1
1,1,2018-02-01,97.4,96.3,98.5,98.9,98.3,2018,2
2,2,2018-03-01,97.6,98.1,99.1,99.2,98.6,2018,3
3,3,2018-04-01,98.0,99.3,99.3,99.4,98.7,2018,4
4,4,2018-05-01,98.1,99.9,99.4,99.7,99.1,2018,5
...,...,...,...,...,...,...,...,...,...
76,76,2024-05-01,128.7,176.8,125.5,120.1,130.5,2024,5
77,77,2024-06-01,129.5,177.0,125.6,120.3,130.8,2024,6
78,78,2024-07-01,130.4,177.0,125.8,120.6,131.7,2024,7
79,79,2024-08-01,130.4,177.2,126.0,120.7,133.3,2024,8


In [6]:
interest_csv['Date'] = pd.to_datetime(interest_csv['Date'], format='%Y-%m')
interest_csv.set_index('Date')
interest_csv

Unnamed: 0.1,Unnamed: 0,Date,Year,Month,Interest_rate
0,0,2018-01-01,2018,1,4.5
1,1,2018-02-01,2018,2,4.5
2,2,2018-03-01,2018,3,4.5
3,3,2018-04-01,2018,4,4.5
4,4,2018-05-01,2018,5,4.5
...,...,...,...,...,...
78,78,2024-05-01,2024,5,6.0
79,79,2024-06-01,2024,6,6.0
80,80,2024-07-01,2024,7,6.0
81,81,2024-08-01,2024,8,6.0


In [7]:
merged_data = pd.merge(cpi_csv, inflation_csv, how='inner', left_on=['Year', 'Month'], right_on=['Year', 'Month'])

In [8]:
merged_data = pd.merge(merged_data, interest_csv, how='inner', left_on=['Date'], right_on=['Date'])
print(merged_data.isnull().sum())

Unnamed: 0_x                  0
Date                          0
Food_and_drinks               0
Alcohol_and_tobacco           0
Household_consumable_goods    0
Medication                    0
Stationery                    0
Year_x                        0
Month_x                       0
Unnamed: 0_y                  0
Rate                          0
Unnamed: 0                    0
Year_y                        0
Month_y                       0
Interest_rate                 0
dtype: int64


In [9]:
merged_data

Unnamed: 0.1,Unnamed: 0_x,Date,Food_and_drinks,Alcohol_and_tobacco,Household_consumable_goods,Medication,Stationery,Year_x,Month_x,Unnamed: 0_y,Rate,Unnamed: 0,Year_y,Month_y,Interest_rate
0,12,2019-01-01,102.0,105.2,102.0,101.5,102.0,2019,1,0,4.4,14,2019,1,4.625000
1,13,2019-02-01,101.4,106.9,102.2,101.6,102.1,2019,2,6,3.8,15,2019,2,4.666667
2,14,2019-03-01,100.7,108.1,102.5,101.9,102.3,2019,3,12,3.4,16,2019,3,4.708333
3,15,2019-04-01,100.7,108.5,102.7,102.1,102.6,2019,4,18,3.2,17,2019,4,4.750000
4,16,2019-05-01,100.9,108.8,102.9,102.4,103.0,2019,5,24,3.2,18,2019,5,4.625000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,76,2024-05-01,128.7,176.8,125.5,120.1,130.5,2024,5,29,3.9,78,2024,5,6.000000
65,77,2024-06-01,129.5,177.0,125.6,120.3,130.8,2024,6,35,3.7,79,2024,6,6.000000
66,78,2024-07-01,130.4,177.0,125.8,120.6,131.7,2024,7,41,4.4,80,2024,7,6.000000
67,79,2024-08-01,130.4,177.2,126.0,120.7,133.3,2024,8,47,3.3,81,2024,8,6.000000


In [10]:
merged_data = merged_data.loc[:, ~merged_data.columns.str.contains('^Unnamed')]
merged_data = merged_data.loc[:, ~merged_data.columns.str.contains('^Year')]
merged_data = merged_data.loc[:, ~merged_data.columns.str.contains('^Month')]
merged_data

Unnamed: 0,Date,Food_and_drinks,Alcohol_and_tobacco,Household_consumable_goods,Medication,Stationery,Rate,Interest_rate
0,2019-01-01,102.0,105.2,102.0,101.5,102.0,4.4,4.625000
1,2019-02-01,101.4,106.9,102.2,101.6,102.1,3.8,4.666667
2,2019-03-01,100.7,108.1,102.5,101.9,102.3,3.4,4.708333
3,2019-04-01,100.7,108.5,102.7,102.1,102.6,3.2,4.750000
4,2019-05-01,100.9,108.8,102.9,102.4,103.0,3.2,4.625000
...,...,...,...,...,...,...,...,...
64,2024-05-01,128.7,176.8,125.5,120.1,130.5,3.9,6.000000
65,2024-06-01,129.5,177.0,125.6,120.3,130.8,3.7,6.000000
66,2024-07-01,130.4,177.0,125.8,120.6,131.7,4.4,6.000000
67,2024-08-01,130.4,177.2,126.0,120.7,133.3,3.3,6.000000


In [11]:
merged_data.to_csv("final_merged_dataset.csv")

## Experimentation with feature selection

In [12]:
merged_data.set_index(merged_data['Date'])
split_date = datetime.datetime(2024,1,1)
train_data = merged_data[merged_data['Date'] <= split_date]
test_data = merged_data[merged_data['Date'] >= split_date]

### All feature Set

In [49]:
model = ExponentialSmoothing(train_data['Alcohol_and_tobacco'], trend='additive', seasonal='additive', seasonal_periods=12)
fitted_model = model.fit()
forecast_horizon = 20
forecast = fitted_model.forecast(forecast_horizon)
print(forecast)

forecast_df = train_data.copy()
forecast_df = pd.concat([forecast_df["Alcohol_and_tobacco"], pd.Series(forecast)])
forecast_df

61    174.982612
62    175.685980
63    176.070158
64    176.115100
65    176.200785
66    176.327227
67    176.414416
68    176.822390
69    177.111091
70    177.760573
71    178.030830
72    178.721875
73    179.004487
74    179.707854
75    180.092033
76    180.136974
77    180.222660
78    180.349102
79    180.436291
80    180.844265
dtype: float64


0     105.200000
1     106.900000
2     108.100000
3     108.500000
4     108.800000
         ...    
76    180.136974
77    180.222660
78    180.349102
79    180.436291
80    180.844265
Length: 81, dtype: float64

In [50]:
fig = go.Figure()

# Add traces for training data, testing data, and forecast
fig.add_trace(go.Scatter(x=train_data.index, y=train_data['Alcohol_and_tobacco'], name='Training Data'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['Alcohol_and_tobacco'], name='Testing Data'))
fig.add_trace(go.Scatter(x=forecast_df.index, y=forecast_df, name='Forecast'))

# Customize the plot
fig.update_layout(
    title='Extrapolation Smoothing Forecast',
    xaxis_title='Date',
    yaxis_title='Value',
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99)
)

# Show the plot
fig.show()