In [1]:
import numpy as np
import pandas as pd

from datetime import datetime, timedelta

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor

import plotly.express as px
import plotly.graph_objects as go

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
df = pd.read_csv("dataset/ngp.csv")

In [3]:
dateRange = pd.DataFrame(pd.date_range(start='2000-08-30',end='2020-12-31'),columns=["all_dates"])

In [4]:
df['Date'] = pd.to_datetime(df['Date'])

In [5]:
data = df[["Date", "Close"]]

In [6]:
def date_decomposition(data_frame, date_col):
    data_frame = data_frame.copy()
    
    data_frame['day_of_week'] = data_frame[date_col].apply(lambda x: x.day_of_week)
    data_frame['month'] = data_frame[date_col].apply(lambda x: x.month)
    data_frame['year'] = data_frame[date_col].apply(lambda x: x.year)
    data_frame['day'] = data_frame[date_col].apply(lambda x: x.day)
    
    return data_frame   

In [7]:
data = date_decomposition(data, 'Date')

In [8]:
data = data.set_index('Date')

In [9]:
data.head()

Unnamed: 0_level_0,Close,day_of_week,month,year,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-08-30,4.805,2,8,2000,30
2000-08-31,4.78,3,8,2000,31
2000-09-01,4.835,4,9,2000,1
2000-09-05,4.96,1,9,2000,5
2000-09-06,5.065,2,9,2000,6


In [10]:
#taking log of the close price
data['Close'] = np.log(data['Close'])

In [11]:
train_end = datetime(2020, 12, 31)

In [12]:
train_data = data[:train_end]
val_test_data = data[train_end+timedelta(1):]

In [13]:
val_test_len = val_test_data.shape[0]//2

In [14]:
val_test_data = val_test_data.reset_index()
val_test_data.index = list(val_test_data.index)

In [15]:
val_data= val_test_data[:val_test_len]
test_data = val_test_data[val_test_len:]

In [16]:
train_data = train_data.reset_index()

In [17]:
len(dateRange), len(train_data)

(7429, 5106)

In [18]:
train_data = pd.merge(dateRange,train_data,how='left',left_on="all_dates",right_on="Date")

In [19]:
train_data.drop("Date",axis=1,inplace=True)

In [20]:
train_data.head(5)

Unnamed: 0,all_dates,Close,day_of_week,month,year,day
0,2000-08-30,1.569657,2.0,8.0,2000.0,30.0
1,2000-08-31,1.564441,3.0,8.0,2000.0,31.0
2,2000-09-01,1.575881,4.0,9.0,2000.0,1.0
3,2000-09-02,,,,,
4,2000-09-03,,,,,


In [21]:
train_data = date_decomposition(train_data, 'all_dates')

In [22]:
train_data.head()

Unnamed: 0,all_dates,Close,day_of_week,month,year,day
0,2000-08-30,1.569657,2,8,2000,30
1,2000-08-31,1.564441,3,8,2000,31
2,2000-09-01,1.575881,4,9,2000,1
3,2000-09-02,,5,9,2000,2
4,2000-09-03,,6,9,2000,3


In [23]:
train_data.rename(columns={'all_dates':'Date'}, inplace=True)

In [24]:
def get_scores(model, train, val_set, test_set):
    X_train = train.drop(['Date', 'Close'], axis=1)
    y_train = train['Close']
    
    X_val = val_set.drop(['Date', 'Close'], axis=1)
    y_val = val_set['Close']
    
    X_test = test_set.drop(['Date', 'Close'], axis=1)
    y_test = test_set['Close']
    
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    
    train_mse = mean_squared_error(y_train, y_train_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    
    train_rmse = np.sqrt(train_mse)
    val_rmse = np.sqrt(val_mse)
    test_rmse = np.sqrt(test_mse)
    
    train_r2 = r2_score(y_train, y_train_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    print(f"Train MSE={train_mse}, Train RMSE={train_rmse}")
    print(f"Val MSE={val_mse}, Val RMSE={val_rmse}")
    print(f"Test MSE={test_mse}, Train RMSE={test_rmse}")
    print('#'*50)
    print(f"Train R2 Score={train_r2}")
    print(f"Val R2 Score={val_r2}")
    print(f"Test R2 Score={test_r2}")

## Rolling Mean Imputation

In [25]:
def moving_avg(series, window):
    series = list(series)
    
    rolling_mean=list(series[:window])
    
    out = list(series[:window])
    
    for i in range(window,len(series)-1):
    
        mean = np.mean(rolling_mean)          

        if np.isnan(series[i]):
            rolling_mean.append(mean)
            
        else:
            rolling_mean.append(series[i])
            
        rolling_mean = rolling_mean[1:]
        
        
        
        out.append(mean)
        
    out.append(np.mean(rolling_mean))
    
    return np.array(out)    

In [26]:
rolling_mean_close = pd.DataFrame(moving_avg(train_data["Close"],3),columns=["rolling"])

In [27]:
rolling_mean_close.head()

Unnamed: 0,rolling
0,1.569657
1,1.564441
2,1.575881
3,1.569993
4,1.570105


In [28]:
len(rolling_mean_close), len(train_data["Close"])

(7429, 7429)

In [29]:
close_val = train_data["Close"].values
rolling_val = rolling_mean_close.values

## Rolling Mean Imputation

In [30]:
for i in range(len(close_val)):
    if np.isnan(close_val[i]):
        close_val[i]= rolling_val[i]

In [31]:
train_data["Close"] = rolling_val

In [33]:
train_data[["Date", "Close"]].to_csv("dataset/rolling_mean_imputed.csv", index=False)

In [34]:
rf = RandomForestRegressor(n_jobs=-1)

In [35]:
get_scores(rf, train_data, val_data, test_data)

Train MSE=4.5552366394300445e-05, Train RMSE=0.006749249320798606
Val MSE=0.3461483896758579, Val RMSE=0.5883437682816551
Test MSE=1.2939251817059934, Train RMSE=1.1375083215985689
##################################################
Train R2 Score=0.9997781973034777
Val R2 Score=-3.9682738725965354
Test R2 Score=-14.046665792519613


## KNN Imputation

In [37]:
from sklearn.impute import KNNImputer

In [38]:
X_train = train_data.drop(['Date', 'Close'], axis=1)
y_train = train_data['Close']

In [39]:
imputer = KNNImputer()

In [43]:
y_train_imputed = imputer.fit_transform(y_train.values.reshape(-1,1), X_train)

In [44]:
imputed_train = train_data.copy()

In [45]:
imputed_train['Close'] = y_train_imputed

In [46]:
rf = RandomForestRegressor(n_jobs=-1)

In [47]:
get_scores(rf, imputed_train, val_data, test_data)

Train MSE=4.640857867072009e-05, Train RMSE=0.006812384213380811
Val MSE=0.34546179362501955, Val RMSE=0.587759979604787
Test MSE=1.292624068551646, Train RMSE=1.136936264067448
##################################################
Train R2 Score=0.9997740282513135
Val R2 Score=-3.958419147507094
Test R2 Score=-14.031535539960572


### KNN imputer and rolling mean imputation both have given almost similar results