In [33]:
import numpy as np
import pandas as pd

from datetime import datetime, timedelta

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
df = pd.read_csv("dataset/ngp.csv")

In [3]:
dateRange = pd.DataFrame(pd.date_range(start='2000-08-30',end='2020-12-31'),columns=["All_date"])

In [4]:
df['Date'] = pd.to_datetime(df['Date'])

In [5]:
data = df[["Date", "Close"]]

In [6]:
data['day_of_week'] = data['Date'].apply(lambda x: x.day_of_week)
data['month'] = data['Date'].apply(lambda x: x.month)
data['year'] = data['Date'].apply(lambda x: x.year)
data['day'] = data['Date'].apply(lambda x: x.day)

In [7]:
data = data.set_index('Date')

In [8]:
data.head()

Unnamed: 0_level_0,Close,day_of_week,month,year,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-08-30,4.805,2,8,2000,30
2000-08-31,4.78,3,8,2000,31
2000-09-01,4.835,4,9,2000,1
2000-09-05,4.96,1,9,2000,5
2000-09-06,5.065,2,9,2000,6


In [9]:
train_end = datetime(2020, 12, 31)

In [10]:
train_data = data[:train_end]
val_test_data = data[train_end+timedelta(1):]

In [11]:
val_test_len = val_test_data.shape[0]//2

In [12]:
val_test_data = val_test_data.reset_index()
val_test_data.index = list(val_test_data.index)

In [13]:
val_data= val_test_data[:val_test_len]
test_data = val_test_data[val_test_len:]

In [14]:
train_data = train_data.reset_index()

In [15]:
len(dateRange), len(train_data)

(7429, 5106)

In [16]:
new_df= pd.merge(dateRange,train_data,how='left',left_on="All_date",right_on="Date")

In [17]:
new_df.drop("Date",axis=1,inplace=True)

In [18]:
new_df.head(5)

Unnamed: 0,All_date,Close,day_of_week,month,year,day
0,2000-08-30,4.805,2.0,8.0,2000.0,30.0
1,2000-08-31,4.78,3.0,8.0,2000.0,31.0
2,2000-09-01,4.835,4.0,9.0,2000.0,1.0
3,2000-09-02,,,,,
4,2000-09-03,,,,,


In [19]:
def moving_avg(series, window):
    series = list(series)
    rolling_mean=list(series[:window])
    out = list(series[:window])
    for i in range(window,len(series)-1):
    
        mean = np.mean(rolling_mean)          

        if np.isnan(series[i]):
            
            rolling_mean.append(mean)
        else:
            rolling_mean.append(series[i])
        rolling_mean = rolling_mean[1:]
        
        
        
        out.append(mean)
    out.append(np.mean(rolling_mean))
    return np.array(out)
        
        

In [20]:
New_Close = pd.DataFrame(moving_avg(new_df["Close"],3),columns=["rolling"])

In [35]:
New_Close.head()

Unnamed: 0,rolling
0,4.805
1,4.78
2,4.835
3,4.806667
4,4.807222


In [48]:
New_Close.info()

NameError: name 'New_Close' is not defined

In [22]:
len(New_Close)

7429

In [23]:
len(new_df["Close"])

7429

In [24]:
new_df2 = pd.concat([new_df,New_Close],axis=1)

In [25]:
Close_arr = np.array(new_df2["Close"])
rolling_arr = np.array(new_df2["rolling"])

In [26]:
for i in range(len(Close_arr)):
    if np.isnan(Close_arr[i]):
        Close_arr[i]= rolling_arr[i]

In [27]:
new_df2["Close"]= Close_arr

In [28]:
new_df2.drop("rolling",axis=1,inplace=True)

In [29]:
def get_scores(model, train, val_set, test_set):
    X_train = train.drop(['Date', 'Close'], axis=1)
    y_train = train['Close']
    
    X_val = val_set.drop(['Date', 'Close'], axis=1)
    y_val = val_set['Close']
    
    X_test = test_set.drop(['Date', 'Close'], axis=1)
    y_test = test_set['Close']
    
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    
    train_mse = mean_squared_error(y_train, y_train_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    
    train_rmse = np.sqrt(train_mse)
    val_rmse = np.sqrt(val_mse)
    test_rmse = np.sqrt(test_mse)
    
    print(f"Train MSE={train_mse}, Train RMSE={train_rmse}")
    print(f"Val MSE={val_mse}, Val RMSE={val_rmse}")
    print(f"Test MSE={test_mse}, Train RMSE={test_rmse}")


In [31]:
rf = RandomForestRegressor(n_jobs=-1)

In [34]:
get_scores(rf, train_data, val_data, test_data)

Train MSE=0.004747573711840336, Train RMSE=0.06890263936773638
Val MSE=3.0137288688828345, Val RMSE=1.736009466818322
Test MSE=19.93360350781842, Train RMSE=4.46470643019431
