In [None]:
# decision trees

In [None]:
!pip install pmdarima

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pmdarima
  Downloading pmdarima-1.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 22.3 MB/s 
[?25hCollecting statsmodels!=0.12.0,>=0.11
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 47.1 MB/s 
Installing collected packages: statsmodels, pmdarima
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.10.2
    Uninstalling statsmodels-0.10.2:
      Successfully uninstalled statsmodels-0.10.2
Successfully installed pmdarima-1.8.5 statsmodels-0.13.2


In [None]:
!pip install -U statsmodels

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only). force_remount=True is
# passed so the call also works when a mount already exists -- presumably it
# re-establishes the mount; confirm against the Colab documentation.
# NOTE(review): the data cells below use relative paths starting with
# 'drive/MyDrive/...', which only resolve against this mount point when the
# working directory is /content.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

from scipy.stats import boxcox
from scipy.stats import yeojohnson

from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")


# Columns pulled from the cleaned CSV: the timestamp column and the target.
features = ['Date', 'Water']

# First and last date of the forecast window handed to model.predict()
# in evaluate_for_submission().
start_index, end_index = map(pd.to_datetime, ("2021-07-01", "2022-12-31"))


def prepare_data(file_path):
    """Load the cleaned CSV and build the monthly, Yeo-Johnson-transformed Water series.

    Parameters
    ----------
    file_path : str or path-like
        CSV file that contains at least the columns named in the module-level
        ``features`` list ('Date' and 'Water').

    Returns
    -------
    lam : float
        Lambda fitted by ``scipy.stats.yeojohnson``; required later to map
        forecasts back to the original scale.
    y : pandas.Series
        Yeo-Johnson-transformed monthly mean of 'Water', indexed by the
        resampled (month-end) dates.
    """
    data = pd.read_csv(file_path)

    # Build a fresh frame with a parsed Date column instead of assigning into
    # a column-subset of ``data`` (chained assignment / SettingWithCopyWarning).
    selected = data[features].assign(Date = lambda frame: pd.to_datetime(frame['Date']))
    indexed = selected.set_index('Date')

    # Month-end resampling. Newer pandas releases deprecate the 'M' alias in
    # favour of 'ME', while older releases reject 'ME' with a ValueError, so
    # try the new spelling first and fall back to the old one. The former
    # ``convention`` argument is dropped: it only applies to a PeriodIndex and
    # had no effect on this datetime index.
    try:
        monthly = indexed.resample('ME')
    except ValueError:
        monthly = indexed.resample('M')

    # Monthly mean; months without observations are back-filled from the next
    # month that has a value.
    data_resampled = monthly.mean().bfill(axis = 0)

    # Yeo-Johnson power transform (lambda chosen by scipy).
    transformed_water, lam = yeojohnson(data_resampled['Water'])

    # Series y used for modelling, aligned to the monthly index.
    y = pd.Series(transformed_water, index = data_resampled.index)
    return lam, y


def split_series(y, train_frac = 0.85):
    """Split a series chronologically into a leading train part and a trailing test part.

    Parameters
    ----------
    y : pandas.Series
        Series to split; order is preserved (no shuffling).
    train_frac : float, default 0.85
        Fraction of observations assigned to the training part. The cut-off
        position is ``int(len(y) * train_frac)``, i.e. rounded down.

    Returns
    -------
    (y_train, y_test) : tuple of pandas.Series
        The first ``cutoff`` observations and the remaining ones.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside the closed interval [0, 1].
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

    cutoff_test = int(len(y) * train_frac)
    return y.iloc[:cutoff_test], y.iloc[cutoff_test:]


def apply_model(y_train, y_test, lags = 59):
    """Fit an autoregressive model on ``y_train`` and report its MAE on ``y_test``.

    Parameters
    ----------
    y_train : pandas.Series
        Training series the AutoReg model is fitted on.
    y_test : pandas.Series
        Hold-out series; its first and last index values delimit the
        out-of-sample prediction window.
    lags : int, default 59
        Autoregressive order passed to ``AutoReg``.
        NOTE(review): the training series must be long enough for this many
        lags -- confirm against the data before lowering the sample size.

    Returns
    -------
    The fitted AutoReg results object.

    Notes
    -----
    When called from ``__main__`` both series are the power-transformed values
    produced by ``prepare_data``, so the printed MAE is on the transformed
    scale, not in original Water units.
    """
    # ``old_names`` is no longer passed: statsmodels deprecated the argument,
    # its default is already False, and it only affected coefficient labels.
    model = AutoReg(y_train, lags = lags).fit()

    # Out-of-sample predictions spanning exactly the test period.
    y_pred_test = model.predict(start = y_test.index.min(), end = y_test.index.max())

    # print test MAE
    print(f"AutoReg Model Test MAE: {mean_absolute_error(y_test, y_pred_test)}")

    return model


def _inverse_yeojohnson(values, lam):
    """Invert the Yeo-Johnson transform for every element of ``values``."""
    series = pd.Series(values)
    y = series.to_numpy(dtype = float)
    x = np.empty_like(y)
    eps = np.spacing(1.0)

    # Yeo-Johnson preserves sign, so the branch is chosen by the sign of the
    # transformed value. NaN compares False and falls through to the negative
    # branch, where it simply stays NaN.
    non_neg = y >= 0

    if abs(lam) < eps:
        # forward transform was log(x + 1)
        x[non_neg] = np.expm1(y[non_neg])
    else:
        # forward transform was ((x + 1)**lam - 1) / lam
        x[non_neg] = np.power(lam * y[non_neg] + 1, 1 / lam) - 1

    if abs(lam - 2) < eps:
        # forward transform was -log(1 - x)
        x[~non_neg] = -np.expm1(-y[~non_neg])
    else:
        # forward transform was -((1 - x)**(2 - lam) - 1) / (2 - lam)
        x[~non_neg] = 1 - np.power(-(2 - lam) * y[~non_neg] + 1, 1 / (2 - lam))

    return pd.Series(x, index = series.index, name = series.name)


def evaluate_for_submission(model, lam):
    """Forecast Water for the submission window and map it back to the original scale.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(start=..., end=...)``; its predictions
        are on the Yeo-Johnson-transformed scale.
    lam : float
        Lambda returned by ``scipy.stats.yeojohnson`` in ``prepare_data``.

    Returns
    -------
    pandas.Series
        Forecast in original Water units, rounded to one decimal. The series
        is also printed.

    Notes
    -----
    The previous implementation applied the *Box-Cox* inverse
    ``(lam * y + 1) ** (1 / lam)`` to Yeo-Johnson-transformed data. For
    non-negative values that omits the final ``- 1`` and therefore reported
    every forecast 1.0 too high; it also divided by zero for ``lam == 0``.
    """
    # forecast Water values for the rest of 2021 and 2022
    forecast = model.predict(start = start_index, end = end_index)

    # back-transform to original units
    forecast_trans = _inverse_yeojohnson(forecast, lam).round(1)
    print(f"Forecast:\n{forecast_trans}")
    return forecast_trans


def write_prediction_data(forecast_trans, output_path = 'drive/MyDrive/Tech Pet/Data/PredictionFile.csv'):
    """Write the forecast to a two-column CSV ('Date', 'Water').

    Parameters
    ----------
    forecast_trans : pandas.Series
        Forecast values; the index supplies the 'Date' column.
    output_path : str or path-like, optional
        Destination CSV. Defaults to the location the notebook has always
        written to; that default is a relative path, so it depends on the
        current working directory.
    """
    predictions = pd.DataFrame(forecast_trans).reset_index()

    # After reset_index the former index is column 0 and the values column 1;
    # rename by position because the incoming names are not fixed.
    predictions = predictions.rename(
        columns = {predictions.columns[0]: 'Date', predictions.columns[1]: 'Water'}
    )

    predictions.to_csv(output_path, index=False)
   
    

def __main__():
    """Run the Water pipeline end to end: load, split, fit, forecast, export."""
    lam, series = prepare_data('drive/MyDrive/Tech Pet/Data/data_cleaned.csv')
    train_part, test_part = split_series(series)
    fitted = apply_model(train_part, test_part)

    # Back-transformed forecast goes straight to the prediction CSV.
    write_prediction_data(evaluate_for_submission(fitted, lam))


# Execute the pipeline when the cell runs.
__main__()

AutoReg Model Test MAE: 1.428945473869008
Forecast:
2021-07-31    1175.2
2021-08-31    1177.9
2021-09-30    1180.9
2021-10-31    1181.8
2021-11-30    1180.9
2021-12-31    1181.0
2022-01-31    1178.7
2022-02-28    1177.9
2022-03-31    1176.4
2022-04-30    1175.8
2022-05-31    1172.5
2022-06-30    1169.6
2022-07-31    1166.2
2022-08-31    1165.1
2022-09-30    1164.1
2022-10-31    1164.3
2022-11-30    1163.3
2022-12-31    1162.9
Freq: M, dtype: float64


In [None]:
import pandas as pd
import numpy as np

from scipy.stats import boxcox

from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")


# Columns pulled from the cleaned CSV: the timestamp column and the target.
features = ['Date', 'Gas']

# First and last date of the forecast window handed to model.predict()
# in evaluate_for_submission().
start_index, end_index = map(pd.to_datetime, ("2021-07-01", "2022-12-31"))


def prepare_data(file_path):
    """Load the cleaned CSV and build the monthly, Box-Cox-transformed Gas series.

    Parameters
    ----------
    file_path : str or path-like
        CSV file that contains at least the columns named in the module-level
        ``features`` list ('Date' and 'Gas').

    Returns
    -------
    lam : float
        Lambda fitted by ``scipy.stats.boxcox``; required later to map
        forecasts back to the original scale.
    y : pandas.Series
        Box-Cox-transformed monthly mean of 'Gas', indexed by the resampled
        (month-end) dates.
    """
    data = pd.read_csv(file_path)

    # Build a fresh frame with a parsed Date column instead of assigning into
    # a column-subset of ``data`` (chained assignment / SettingWithCopyWarning).
    selected = data[features].assign(Date = lambda frame: pd.to_datetime(frame['Date']))
    indexed = selected.set_index('Date')

    # Month-end resampling. Newer pandas releases deprecate the 'M' alias in
    # favour of 'ME', while older releases reject 'ME' with a ValueError, so
    # try the new spelling first and fall back to the old one. The former
    # ``convention`` argument is dropped: it only applies to a PeriodIndex and
    # had no effect on this datetime index.
    try:
        monthly = indexed.resample('ME')
    except ValueError:
        monthly = indexed.resample('M')

    # Monthly mean; months without observations are back-filled from the next
    # month that has a value.
    data_resampled = monthly.mean().bfill(axis = 0)

    # Box-Cox power transform (lambda chosen by scipy). scipy's boxcox only
    # accepts strictly positive data.
    transformed_gas, lam = boxcox(data_resampled['Gas'])

    # Series y used for modelling, aligned to the monthly index.
    y = pd.Series(transformed_gas, index = data_resampled.index)
    return lam, y


def split_series(y, train_frac = 0.85):
    """Split a series chronologically into a leading train part and a trailing test part.

    Parameters
    ----------
    y : pandas.Series
        Series to split; order is preserved (no shuffling).
    train_frac : float, default 0.85
        Fraction of observations assigned to the training part. The cut-off
        position is ``int(len(y) * train_frac)``, i.e. rounded down.

    Returns
    -------
    (y_train, y_test) : tuple of pandas.Series
        The first ``cutoff`` observations and the remaining ones.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside the closed interval [0, 1].
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

    cutoff_test = int(len(y) * train_frac)
    return y.iloc[:cutoff_test], y.iloc[cutoff_test:]


def apply_model(y_train, y_test, lags = 59):
    """Fit an autoregressive model on ``y_train`` and report its MAE on ``y_test``.

    Parameters
    ----------
    y_train : pandas.Series
        Training series the AutoReg model is fitted on.
    y_test : pandas.Series
        Hold-out series; its first and last index values delimit the
        out-of-sample prediction window.
    lags : int, default 59
        Autoregressive order passed to ``AutoReg``.
        NOTE(review): the training series must be long enough for this many
        lags -- confirm against the data before lowering the sample size.

    Returns
    -------
    The fitted AutoReg results object.

    Notes
    -----
    When called from ``__main__`` both series are the power-transformed values
    produced by ``prepare_data``, so the printed MAE is on the transformed
    scale, not in original Gas units.
    """
    # ``old_names`` is no longer passed: statsmodels deprecated the argument,
    # its default is already False, and it only affected coefficient labels.
    model = AutoReg(y_train, lags = lags).fit()

    # Out-of-sample predictions spanning exactly the test period.
    y_pred_test = model.predict(start = y_test.index.min(), end = y_test.index.max())

    # print test MAE
    print(f"AutoReg Model Test MAE: {mean_absolute_error(y_test, y_pred_test)}")

    return model


def evaluate_for_submission(model, lam):
    """Forecast Gas for the submission window and map it back to the original scale.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(start=..., end=...)``; its predictions
        are on the Box-Cox-transformed scale.
    lam : float
        Lambda returned by ``scipy.stats.boxcox`` in ``prepare_data``.

    Returns
    -------
    pandas.Series
        Forecast in original Gas units, rounded to one decimal. The series is
        also printed.
    """
    # Local import: the notebook's import cell only pulls in scipy.stats.
    from scipy.special import inv_boxcox

    # forecast Gas values for the rest of 2021 and 2022
    forecast = pd.Series(model.predict(start = start_index, end = end_index))

    # Back-transform with scipy's inverse Box-Cox. Unlike the hand-written
    # exp(log(lam * y + 1) / lam) it also handles lam == 0 (plain exp) instead
    # of dividing by zero and returning NaN.
    restored = inv_boxcox(forecast.to_numpy(dtype = float), lam)
    forecast_trans = pd.Series(restored, index = forecast.index, name = forecast.name).round(1)

    print(f"Forecast:\n{forecast_trans}")
    return forecast_trans


def write_prediction_data(forecast_trans, output_path = 'drive/MyDrive/TechPet/Project 1/PredictionFile.csv'):
    """Write the forecast to a two-column CSV ('Date', 'Gas').

    Parameters
    ----------
    forecast_trans : pandas.Series
        Forecast values; the index supplies the 'Date' column.
    output_path : str or path-like, optional
        Destination CSV. Defaults to the location the notebook has always
        written to; that default is a relative path, so it depends on the
        current working directory.
    """
    predictions = pd.DataFrame(forecast_trans).reset_index()

    # After reset_index the former index is column 0 and the values column 1;
    # rename by position because the incoming names are not fixed.
    predictions = predictions.rename(
        columns = {predictions.columns[0]: 'Date', predictions.columns[1]: 'Gas'}
    )

    predictions.to_csv(output_path, index=False)
   
    

def __main__():
    """Run the Gas pipeline end to end: load, split, fit, forecast, export."""
    lam, series = prepare_data('drive/MyDrive/TechPet/Project 1/data_cleaned.csv')
    train_part, test_part = split_series(series)
    fitted = apply_model(train_part, test_part)

    # Back-transformed forecast goes straight to the prediction CSV.
    write_prediction_data(evaluate_for_submission(fitted, lam))


# Execute the pipeline when the cell runs.
__main__()