In [1]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from statsmodels.tsa.holtwinters import SimpleExpSmoothing

import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline 

import warnings
warnings.simplefilter('ignore')

In [2]:
# Widen the notebook's DataFrame rendering limits in a single call
# (set_option accepts alternating option-name / value pairs).
pd.set_option(
    "display.max_columns", 50,
    "display.max_rows", 100,
)

#### read the CSV files downloaded from Kaggle

In [3]:
# Load the daily and the hourly BTC/USD price files, in that order.
df_gemini_price_d, df_gemini_price_1h = map(
    pd.read_csv,
    ('datasets/kaggle/BTCUSD_day.csv', 'datasets/kaggle/BTCUSD_1hr.csv'),
)

#### convert columns to datetime

In [4]:
# Parse the 'Date' column of both price frames in place.
for price_frame in (df_gemini_price_d, df_gemini_price_1h):
    price_frame['Date'] = pd.to_datetime(price_frame['Date'])

#### verify the datetime range covered by each dataframe

In [5]:
# For each frame print a banner, then the latest and the earliest timestamp.
for frame_label, price_frame in (
    ('df_gemini_price_d', df_gemini_price_d),
    ('df_gemini_price_1h', df_gemini_price_1h),
):
    date_column = price_frame['Date']
    print('------------', frame_label, '------------')
    print(date_column.max())
    print(date_column.min())

------------ df_gemini_price_d ------------
2020-04-09 00:00:00
2015-10-08 00:00:00
------------ df_gemini_price_1h ------------
2020-04-09 00:00:00
2015-10-08 13:00:00


#### verify the number of rows in each dataframe

In [6]:
# Row count of each price frame. The banners name the frames that are
# actually printed (they previously read 'df_coinmarket_price_*').
print('------------', 'df_gemini_price_d', '------------')
print(df_gemini_price_d.shape[0])
print('------------', 'df_gemini_price_1h', '------------')
print(df_gemini_price_1h.shape[0])

------------ df_coinmarket_price_d ------------
1646
------------ df_coinmarket_price_1h ------------
39468


In [12]:
class ExponentialSmoothingForecast(object):
    """Grid-search a simple exponential smoothing model and score it on a hold-out tail.

    The first ``percentage_train_size`` share of ``df`` (by row position) is used to
    pick the smoothing level with the lowest AIC. Every remaining row is then
    predicted one at a time by refitting the model with that smoothing level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series to forecast. ``perform`` reads ``df['Close']`` and
        slices of the whole frame are handed to ``SimpleExpSmoothing``.
        NOTE(review): presumably a single-column ('Close') frame on a regular
        DatetimeIndex -- confirm with the caller.
    percentage_train_size : float
        Fraction of rows used for training; must be strictly between 0 and 1.
    alphas : iterable of float, optional
        Candidate smoothing levels for the grid search. Defaults to
        ``DEFAULT_ALPHAS`` (0.0, 0.1, ..., 0.9), the grid that used to be hard-coded.

    Raises
    ------
    ValueError
        If ``percentage_train_size`` is not strictly between 0 and 1.
    """

    # Default candidate smoothing levels: 0.0, 0.1, ..., 0.9.
    # NOTE(review): the upper end (1.0) is not part of this grid; when the best
    # value lands on 0.9 consider passing a wider/finer ``alphas``.
    DEFAULT_ALPHAS = tuple(a / 100 for a in range(0, 100, 10))

    def __init__(self, df, percentage_train_size, alphas=None):
        if not 0 < percentage_train_size < 1:
            raise ValueError(
                'percentage_train_size must be strictly between 0 and 1, got %r'
                % (percentage_train_size,)
            )

        self.df = df
        self.percentage_train_size = percentage_train_size
        self.alphas = list(self.DEFAULT_ALPHAS) if alphas is None else list(alphas)

    def perform(self):
        """Run grid search, predict every test row, plot and print the test metrics.

        Returns
        -------
        (pandas.DataFrame, pandas.DataFrame)
            ``df_train`` with columns ``date_train``, ``y_train``, ``y_train_predict``
            and ``df_test`` with columns ``date_test``, ``y_test``, ``y_test_predict``.
        """
        best_params, train_size, y_train_predict = self.grid_search_model()

        # Positional split: first ``train_size`` rows train, the rest test.
        train_part = self.df.iloc[:train_size]
        test_part = self.df.iloc[train_size:]

        # One model refit per test row; collect the pieces and concatenate once
        # instead of growing an array with np.append on every iteration.
        pred_chunks = [
            np.asarray(
                self.prediction(best_params=best_params, current_datetime=current_datetime),
                dtype=float,
            ).ravel()
            for current_datetime in test_part.index
        ]
        array_pred = np.concatenate(pred_chunks) if pred_chunks else np.array([])

        df_train = pd.DataFrame({
            'date_train': train_part.index,
            'y_train': train_part['Close'].values,
            'y_train_predict': y_train_predict,
        })
        df_test = pd.DataFrame({
            'date_test': test_part.index,
            'y_test': test_part['Close'].values,
            'y_test_predict': array_pred,
        })

        fig, ax = plt.subplots(figsize=(18, 10))
        ax.plot(df_test['date_test'], df_test['y_test'], label='test', color="blue")
        ax.plot(df_test['date_test'], df_test['y_test_predict'], '--',
                label='test predict', color="red")
        ax.set(title='Simple exponential smoothing: test set',
               xlabel='Date', ylabel='Close')
        ax.legend()
        plt.show()

        mae, mape, mse, rmse, rmspe, r2, pearson = self.perform_metrics(
            df_test['y_test'], df_test['y_test_predict'])
        print('mae', mae,
              'mape', mape,
              'mse', mse,
              'rmse', rmse,
              'rmspe', rmspe,
              'r2', r2,
              'pearson', pearson)

        return df_train, df_test

    def prediction(self, best_params, current_datetime):
        """Refit the model with the chosen smoothing level and predict one timestamp.

        Parameters
        ----------
        best_params : dict
            Must contain the key ``'alpha'`` (the smoothing level to use).
        current_datetime : index label
            Label of the row to predict.

        Returns
        -------
        The object returned by ``predict(start=current_datetime, end=current_datetime)``.
        """
        alpha = best_params['alpha']

        # Label-based slice: it is inclusive, so the row at ``current_datetime``
        # is part of the sample the model is fitted on.
        # NOTE(review): the value returned below is therefore presumably the
        # in-sample fitted value for that row, not an out-of-sample forecast --
        # confirm against the statsmodels documentation.
        history = self.df.loc[:current_datetime]
        fit_model = SimpleExpSmoothing(history).fit(smoothing_level=alpha)

        return fit_model.predict(start=current_datetime, end=current_datetime)

    def grid_search_model(self):
        """Pick the smoothing level with the lowest AIC on the training rows.

        Returns
        -------
        (dict, int, fitted values)
            ``{'alpha': best_alpha}``, the number of training rows, and
            ``fittedvalues`` of the best fit.

        Raises
        ------
        RuntimeError
            If no candidate produces an AIC below infinity (e.g. every AIC is NaN
            or the candidate list is empty).
        """
        train_size = int(len(self.df) * self.percentage_train_size)
        model = SimpleExpSmoothing(self.df.iloc[:train_size])

        best_aic = np.inf
        best_fit = None
        best_params = None

        for alpha in self.alphas:
            fit_model = model.fit(smoothing_level=alpha)

            if fit_model.aic < best_aic:
                best_aic = fit_model.aic
                best_fit = fit_model
                best_params = {'alpha': alpha}

        if best_fit is None:
            raise RuntimeError(
                'grid search found no smoothing level with a finite AIC '
                '(candidates: %r)' % (self.alphas,)
            )

        print('AIC', best_aic, 'best_params', best_params)

        return best_params, train_size, best_fit.fittedvalues

    def perform_metrics(self, y_truth, y_forecasted):
        """Compute error metrics between observed and forecast values.

        Parameters
        ----------
        y_truth, y_forecasted : array-like of numbers
            Same length; lists, numpy arrays and pandas Series are accepted.

        Returns
        -------
        tuple
            ``(mae, mape, mse, rmse, rmspe, r2, pearson)``, each rounded to 4
            decimals. ``mape`` and ``rmspe`` are percentages and divide by
            ``y_truth``. ``pearson`` is ``None`` when the correlation cannot be
            computed.
        """
        # Work on plain float arrays so lists are accepted and the element-wise
        # arithmetic below is positional.
        y_truth = np.asarray(y_truth, dtype=float)
        y_forecasted = np.asarray(y_forecasted, dtype=float)
        relative_error = (y_truth - y_forecasted) / y_truth

        mae = round(mean_absolute_error(y_truth, y_forecasted), 4)
        mape = round(np.mean(np.abs(relative_error)) * 100, 4)
        raw_mse = mean_squared_error(y_truth, y_forecasted)
        mse = round(raw_mse, 4)
        # Take the root of the unrounded MSE; rooting the rounded value reports
        # 0.0 whenever the MSE is below 5e-5.
        rmse = round(raw_mse ** 0.5, 4)
        rmspe = round(np.sqrt(np.mean(np.square(relative_error))) * 100, 4)
        r2 = round(r2_score(y_truth, y_forecasted), 4)
        try:
            corr, _ = pearsonr(y_truth, y_forecasted)
            pearson = round(corr, 4)
        except Exception:
            # Best effort: keep the other metrics when the correlation is
            # undefined, without swallowing KeyboardInterrupt/SystemExit.
            pearson = None

        return mae, mape, mse, rmse, rmspe, r2, pearson
    

In [None]:
### MAIN

# Hourly candles are used for this run; for the daily run start from
# df_gemini_price_d.copy() instead and use 'D' as the frequency.
df = (
    df_gemini_price_1h.copy()
    .set_index('Date')
    .sort_values(by = ['Date'])
    .loc[:, ['Close']]
    .asfreq('H')  # D or H
)

# Train on the first 80% of the rows, evaluate on the remaining 20%.
exponential_smoothing_forecast = ExponentialSmoothingForecast(
    df = df,
    percentage_train_size = 0.80,
)
df_train, df_test = exponential_smoothing_forecast.perform()

AIC 265223.8722569918 best_params {'alpha': 0.9}


In [14]:
#ES 1d
#alpha = 0.9
#mae 255.0775 mape 2.9498 mse 158088.9172 rmse 397.604 rmspe 5.2742 r2 0.9311 pearson 0.9654

In [None]:
#ES 1h
#AIC 265223.8722569918 best_params {'alpha': 0.9}
