# DATA 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Importing research data
# Source page: https://github.com/WisnuHanif/reactor_data/blob/main/reactor_data.csv
# (the bare URL used to sit on its own line, which is a syntax error; it is kept here as a comment)
prep0 = pd.read_csv('https://raw.githubusercontent.com/WisnuHanif/reactor_data/main/reactor_data.csv')
prep0.head()

Unnamed: 0,Time,Running_cycle,FI-001,FI-002,TC-001,TC-002,DT-001,DT-002,DP-001,DP-002,...,TI-034,TI-035,TI-036,TI-037,TI-038,TI-039,TI-040,TI-041,TI-042,CONVERSION
0,6/29/2004 4:00,1,57.376325,669.985139,319.496265,270.298088,68.15938,42.059748,16.935505,0.96733,...,308.465166,293.210571,288.108362,312.395056,305.310342,303.123348,310.085243,309.464836,308.546075,93.200705
1,6/29/2004 4:00,2,57.415584,670.175315,319.506829,270.557335,68.149101,43.819212,16.906026,0.97428,...,310.060319,294.679554,289.054518,314.40908,307.335927,304.765668,311.408695,310.634102,309.389468,93.128145
2,6/29/2004 4:00,3,57.454843,670.365491,319.517393,270.816583,68.138821,45.578677,16.876548,0.981229,...,311.655473,296.148536,290.000675,316.423104,309.361511,306.407989,312.732147,311.803369,310.232862,93.055586
3,6/30/2004 0:00,4,57.543094,670.622632,319.553869,271.363961,68.380253,49.568133,16.811533,1.00076,...,315.28032,299.371886,292.08263,320.951668,313.762146,309.971053,315.635902,314.406282,312.10774,92.830864
4,6/30/2004 0:00,5,57.470024,670.828064,319.515243,270.964477,67.735901,45.934814,16.879419,0.977966,...,312.047943,296.581512,290.251892,316.9151,309.949554,306.832458,313.039825,312.000702,310.427032,93.200151


In [4]:
#Identity variables name
prep0.columns

Index(['Time', 'Running_cycle', 'FI-001', 'FI-002', 'TC-001', 'TC-002',
       'DT-001', 'DT-002', 'DP-001', 'DP-002', 'DP-003', 'DP-004', 'PI-001',
       'PI-002', 'PI-003', 'PI-004', 'AI-001', 'AI-002', 'AI-003', 'AI-004',
       'AI-005', 'AI-006', 'AI-007', 'AI-008', 'AI-009', 'AI-010', 'RX-001',
       'TI-001', 'TI-002', 'TI-005', 'TI-006', 'TI-007', 'TI-008', 'TI-009',
       'TI-010', 'TI-011', 'TI-012', 'TI-013', 'TI-014', 'TI-015', 'TI-018',
       'TI-019', 'TI-020', 'TI-021', 'TI-022', 'TI-023', 'TI-024', 'TI-025',
       'TI-026', 'TI-027', 'TI-028', 'TI-029', 'TI-030', 'TI-031', 'TI-032',
       'TI-033', 'TI-034', 'TI-035', 'TI-036', 'TI-037', 'TI-038', 'TI-039',
       'TI-040', 'TI-041', 'TI-042', 'CONVERSION'],
      dtype='object')

# DATA PREPARATION, CLEANING, AND FEATURE EXTRACTION

In [None]:
# Drop the columns that are not carried forward: 'FI-001', 'Running_cycle' and 'Time'.
unused_columns = ['FI-001', 'Running_cycle', 'Time']
prep1 = prep0.drop(columns=unused_columns)
prep1

In [None]:
#Coerce every remaining column to a numeric dtype, then report the dtypes and the frame shape
prep2 = prep1.apply(pd.to_numeric)
print(prep2.dtypes, prep2.shape)

In [None]:
#Check if there's missing value
prep2.isnull().sum()

In [None]:
#Inspect the distribution of flow rate 'FI-002' to spot periods where the plant is not running
#NOTE(review): this comment used to name 'FI-001', but the code plots 'FI-002' - confirm which tag is intended
sns.boxplot(data=prep2,x=prep2['FI-002'])

In [None]:
#Remove shut-down data by flagging 'FI-002' outliers with the interquartile range (IQR) method

from numpy import percentile
# calculate interquartile range
q25_a, q75_a = percentile(prep2['FI-002'], [25, 75])
iqr_a = q75_a - q25_a
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25_a, q75_a, iqr_a))
# calculate the outlier cutoff (1.5 * IQR beyond each quartile)
cut_off_a = iqr_a * 1.5
lower_a, upper_a = q25_a - cut_off_a, q75_a + cut_off_a
print('Lower whisker=%.2f, Upper whisker=%.2f' % (lower_a, upper_a))
# identify outliers: strictly outside the whiskers
shut_down_data = prep2[(prep2['FI-002']<lower_a)|(prep2['FI-002']>upper_a)]
print('Shut down data: %d' % len(shut_down_data))
# keep everything else; the bounds are inclusive so that a row sitting exactly
# on a whisker is retained instead of silently vanishing from both sets
shut_down_removed = prep2[(prep2['FI-002']>=lower_a)&(prep2['FI-002']<=upper_a)]
print('Non-Shut down data: %d' % len(shut_down_removed))

In [None]:
#Re-plot 'FI-002' after filtering to check whether any outliers remain
#NOTE(review): this comment used to name 'FI-001', but the code plots 'FI-002' - confirm which tag is intended
sns.boxplot(data=shut_down_removed, x=shut_down_removed['FI-002'])

In [None]:
prep3 = shut_down_removed
prep3.shape

In [None]:
# Mask extreme values column by column: any value that is not strictly inside
# its column's 1st-99th percentile band becomes NaN. No rows are dropped.
lb = prep3.quantile(0.01)
ub = prep3.quantile(0.99)

inside_band = prep3.gt(lb) & prep3.lt(ub)
prep4 = prep3.where(inside_band)
prep4

In [None]:
prep4.info()

In [None]:
#Check deleted value position
import missingno as mno
mno.matrix(prep4, figsize = (20, 6))

In [None]:
#Correlation matrix between variables before missing value imputation
#corr = prep4.corr()
#corr.style.background_gradient(cmap='coolwarm')

In [None]:
#corr.values[np.triu_indices_from(corr.values,1)].sum()

In [None]:
#Fill the values that were masked as outliers by linear interpolation.
# limit_direction='both' also fills gaps at the very start of a column; with
# 'forward' only, leading NaNs survive and later end up in the target vector.
prep5 = prep4.interpolate(method ='linear', limit_direction ='both')
prep5.head()

In [None]:
prep5.isnull().sum()

In [None]:
prep5.describe().transpose()

In [None]:
#Correlation matrix after data imputation
corr2 = prep5.corr()
corr2.style.background_gradient(cmap='coolwarm')

In [None]:
#Visualization plot for all variables
#group_1 = prep5.iloc[:,0:9]
#group_2 = prep5.iloc[:,9:18]
#group_3 = prep5.iloc[:,18:27]
#group_4 = prep5.iloc[:,27:36]
#group_5 = prep5.iloc[:,36:45]
#group_6 = prep5.iloc[:,45:54]
#group_7 = prep5.iloc[:,54:64]

In [None]:
#Plot for group 1
#group_1.plot(subplots =True, sharex = True, figsize = (30,80))
#Plot for group 2
#group_2.plot(subplots =True, sharex = True, figsize = (30,80))
#Plot for group 3
#group_3.plot(subplots =True, sharex = True, figsize = (30,80))
#Plot for group 4
#group_4.plot(subplots =True, sharex = True, figsize = (30,80))
#Plot for group 5
#group_5.plot(subplots =True, sharex = True, figsize = (30,80))
#Plot for group 6
#group_6.plot(subplots =True, sharex = True, figsize = (30,80))
#Plot for group 7
#group_7.plot(subplots =True, sharex = True, figsize = (40,40))

In [None]:
# Pearson correlation of every feature with the target, strongest positive first.
# The target's self-correlation is removed by label, so the result does not
# depend on where 'CONVERSION' lands after sorting.
select_corr = prep5.corr()["CONVERSION"].drop("CONVERSION").sort_values(ascending=False)

# absolute value, so strong negative correlations count as well
abs_corr = select_corr.abs()

# arbitrary threshold for features to keep
CORR_THRESHOLD = 0.4
selected_features = abs_corr[abs_corr > CORR_THRESHOLD]
selected_features

In [None]:
len(selected_features)

In [None]:
# Keep only the features that passed the correlation threshold (low-correlation features are dropped).
# The backward linear interpolation fills NaNs that may remain at the start of a column;
# the null count below verifies the result.
prep6 = prep5[selected_features.index].interpolate(method ='linear', limit_direction ='backward')
prep6.isnull().sum()

In [None]:
y_lasso = prep5["CONVERSION"]
X_lasso = prep6

In [None]:
# Embedded feature selection: fit a cross-validated Lasso and keep its
# coefficients so that irrelevant / collinear features can be removed.
from sklearn.linear_model import LassoCV
reg = LassoCV().fit(X_lasso, y_lasso)
print(f"Best alpha using built-in LassoCV: {reg.alpha_:f}")
# score() is evaluated on the same data the model was fitted on
print(f"Best score using built-in LassoCV: {reg.score(X_lasso, y_lasso):f}")
coef = pd.Series(data=reg.coef_, index=X_lasso.columns)

In [None]:
# Count how many coefficients Lasso kept (non-zero) versus shrank to exactly zero.
n_kept = (coef != 0).sum()
n_eliminated = (coef == 0).sum()
print(f"Lasso picked {n_kept} variables and eliminated the other {n_eliminated} variables")

In [None]:
#Visualize the Lasso coefficients, sorted from most negative to most positive
imp_coef = coef.sort_values()
# Size this figure locally; writing to plt.rcParams would silently resize
# every figure created later in the notebook.
imp_ax = imp_coef.plot(kind = "barh", figsize = (8.0, 10.0))
imp_ax.set_title("Feature importance using Lasso Model")

In [None]:
# Keep the features whose Lasso coefficient is non-zero, order the columns by
# name, and re-attach the target column from prep5.
abs_coef = coef.abs()
relevant_features = abs_coef[abs_coef > 0]
prep7 = (
    prep6.loc[:, relevant_features.index]
    .sort_index(axis=1, ascending=True)
    .assign(CONVERSION=prep5["CONVERSION"])
)
prep7.shape

In [None]:
#Showing correlation between relevant features
corr3 = prep7.corr().style.background_gradient(cmap='coolwarm')
corr3

In [None]:
#Drop predictors that are highly correlated with other predictors
# errors='ignore' keeps the cell re-runnable: a second run (or a Lasso run that
# did not select one of these tags) no longer raises KeyError.
prep7 = prep7.drop(columns=['TI-007','TI-011', 'TI-014'], errors='ignore')

In [None]:
#Data scaling with min-max normalization
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the chronological training portion only, so the min/max of
# the hold-out test rows do not leak into the scaling. The fraction must match
# the 85% / 15% train-test split performed further down.
TRAIN_FRACTION = 0.85
n_fit_rows = int(len(prep7) * TRAIN_FRACTION)

scaler = MinMaxScaler()
scaler.fit(prep7.iloc[:n_fit_rows])
# transform every row with the training-range parameters
scaled_data = pd.DataFrame(scaler.transform(prep7), columns = prep7.columns)
print(scaled_data)

In [None]:
scaled_data.describe()

In [None]:
# Split the scaled frame into predictors and target, both as DataFrame/Series
# (X_df, y_df) and as plain float arrays (X, y).
target_column = 'CONVERSION'

X_df = scaled_data.drop(target_column, axis = 1)
y_df = scaled_data[target_column]

n_features = len(scaled_data.columns) - 1
X = X_df.values.astype(float).reshape(-1, n_features)
y = y_df.values.astype(float)

print(X.shape, y.shape)

In [None]:
#Create function to return conversion scale for later use
def return_conversion_scale(variable_plot, minimum=None, maximum=None):
    """Map min-max scaled conversion values back to the original conversion units.

    This is the inverse of the min-max scaling applied to the 'CONVERSION'
    column: ``value * (maximum - minimum) + minimum``. The input is no longer
    re-normalised by its own minimum and maximum, because doing so stretched
    every array to the full conversion range regardless of its real values
    (and divided by zero for constant input).

    Parameters
    ----------
    variable_plot : array-like
        Scaled conversion values (actual or predicted).
    minimum, maximum : float, optional
        Bounds of the original conversion range. When omitted they are read
        from the fitted notebook-level ``scaler`` for the 'CONVERSION' column,
        using the column order of ``scaled_data``.

    Returns
    -------
    numpy.ndarray
        Values in original conversion units, same shape as the input.
    """
    if minimum is None or maximum is None:
        # Position of the target among the columns the scaler was fitted on.
        target_pos = list(scaled_data.columns).index('CONVERSION')
        if minimum is None:
            minimum = scaler.data_min_[target_pos]
        if maximum is None:
            maximum = scaler.data_max_[target_pos]
    values = np.asarray(variable_plot, dtype=float)
    return values * (maximum - minimum) + minimum

In [None]:
# Overall plan: 70% training, 15% validation, 15% testing.
# Step 1 (this cell): chronological hold-out. The first 85% of the samples
# are used for model building, the last 15% are kept for the final test.
y_tr_size = int(len(y) * 0.85)
X_tr_size = int(len(X) * 0.85)

y_tr, y_test = y[:y_tr_size], y[y_tr_size:]
X_tr, X_test = X[:X_tr_size], X[X_tr_size:]

print(f'Observations: {len(y)}')
print(f'Training Observations: {len(y_tr)}')
print(f'Testing Observations: {len(y_test)}')

# Plot the target: training part first, then the test part shifted right by
# padding it with None for every training sample.
plt.subplots(figsize=(8,4))
plt.plot(y_tr)
plt.plot([None] * len(y_tr) + list(y_test))
plt.show()

In [None]:
# Each label now matches the array it reports (features = X, targets = y).
print('Training Features Shape:', X_tr.shape)
print('Training Targets Shape:', y_tr.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Targets Shape:', y_test.shape)

In [None]:
# Step 2: create the time-series cross-validation iterator used to carve a
# validation set out of the training data, and visualise its folds.
from sklearn.model_selection import TimeSeriesSplit

n_splits = 5
ts_split = TimeSeriesSplit(n_splits)

fig, ax = plt.subplots(figsize=(10,5))
# Axis cosmetics do not depend on the fold, so they are applied once.
ax.set(ylim=[n_splits, -.2], yticks=np.arange(n_splits), title='TimeSeriesSplit Behavior', xlabel='Data index', ylabel='CV iteration')

train_color = [plt.cm.coolwarm(.1)]
val_color = [plt.cm.coolwarm(.9)]
for ii, (tr, tt) in enumerate(ts_split.split(X_tr, y_tr)):
    # One horizontal band per fold: training indices, then validation indices
    l1 = ax.scatter(tr, [ii] * len(tr), c=train_color, marker='_', lw=15)
    l2 = ax.scatter(tt, [ii] * len(tt), c=val_color, marker='_', lw=15)

ax.legend([l1, l2], ['Training', 'Validation']);

In [None]:
# Print the indices of every fold. The loop variables keep the values of the
# LAST fold, which is the one used as the train / validation split below.
for train_index, test_index in ts_split.split(X_tr, y_tr):
    print("TRAIN:", train_index, "TEST:", test_index)

X_train, y_train = X_tr[train_index], y_tr[train_index]
X_val, y_val = X_tr[test_index], y_tr[test_index]

In [None]:
print('Training Features Shape:', X_train.shape)
print('Training Targets Shape:', y_train.shape)
print('Validation Features Shape:', X_val.shape)
print('Validation Targets Shape:', y_val.shape)

In [None]:
#Measurement metrics for inverted scale
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score 
from sklearn.linear_model import LinearRegression
from math import log
    
def model_metrics(actual, predict):
    """Print regression error metrics plus AIC / BIC for a set of predictions.

    Both inputs are first passed through ``return_conversion_scale`` and every
    metric is computed on the converted values. Nothing is returned; all
    results are printed.

    Parameters
    ----------
    actual : array-like
        Observed target values (the callers in this notebook pass slices of
        the min-max scaled 'CONVERSION' column).
    predict : array-like
        Model predictions for the same observations.
    """
    ac = return_conversion_scale(actual)
    pr = return_conversion_scale(predict)
    print('Mean Absolute Error:', metrics.mean_absolute_error(ac, pr))
    print('Mean Squared Error:', metrics.mean_squared_error(ac, pr))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(ac, pr)))
    print('Coefficient of Determination:', r2_score(ac, pr)) 
    
    # calculate aic for regression
    # An auxiliary linear regression of the actual values on the predictions is
    # fitted here; AIC and BIC below are based on ITS error and parameter count.
    lr_model = LinearRegression()
    lr_model.fit(pr.reshape(-1, 1), ac.reshape(-1, 1))
    # number of parameters
    # NOTE(review): this counts the parameters of the auxiliary linear fit,
    # not those of the model that produced `predict` - confirm this is intended
    num_params = len(lr_model.coef_) + 1
    # predict the training set
    yhat = lr_model.predict(pr.reshape(-1, 1))
    # calculate the error
    mse = metrics.mean_squared_error(ac, yhat)
    # calculate the aic: n * ln(mse) + 2 * k
    # (math.log raises ValueError when mse is exactly 0)
    aic = len(ac) * log(mse) + 2 * num_params
    print('Akaike Information Criterion: %.3f' % aic)
    
    # calculate bic for regression: n * ln(mse) + k * ln(n)
    bic = len(ac) * log(mse) + num_params * log(len(ac))
    print('Bayesian Information Criterion: %.3f' % bic)

# MACHINE LEARNING MODEL : RANDOM FOREST REGRESSION

In [None]:
feature_list = list(X_df.columns)
feature_names = X_df.columns

In [None]:
#Run Random Forest Regressor without hyperparameter tuning (defaults)
from sklearn.ensemble import RandomForestRegressor
# A fixed random_state makes the bootstrap samples, and therefore the scores
# and feature importances, reproducible across runs.
regressor = RandomForestRegressor(n_estimators = 100, oob_score = True, random_state = 42)
regressor.fit(X_train, y_train)

In [None]:
#Generate Regressor score and OOB Score of the model
#print("\nRegressor Score " + str(regressor.score(X_train, y_train)), "\nOOB Score " + str(regressor.oob_score_)) # change this

In [None]:
rfr_model_pred = regressor.predict(X_val)
rfr_model_pred

In [None]:
# evaluate predictions
model_metrics(y_val, rfr_model_pred)

In [None]:
#test_targets = test_targets.values

In [None]:
#Plot actual vs prediction (validation set, default random forest)
with plt.style.context('default'):
    # plt.subplots already creates the figure; a separate plt.figure() call
    # would only leave an extra empty figure behind.
    plt.subplots(figsize=(8,4))
    plt.plot(return_conversion_scale(y_val), label = "Actual Conversion")
    plt.plot(return_conversion_scale(rfr_model_pred), label = "Prediction")
    plt.grid()
    plt.title('Random Forrest Regression Prediction')
    plt.xlabel('Time')
    plt.ylabel('Conversion (%)')
    plt.legend(loc='best')
    plt.show()

In [None]:
# Feature importances of the default random forest, most important first.
importance_table = pd.DataFrame({'feature': feature_list,
                                 'importance': regressor.feature_importances_})
fi = importance_table.sort_values(by='importance', ascending=False)
fi

In [None]:
#Random Search with Cross Validation: define the hyperparameter search space
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest (10, 15, ..., 200)
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 39)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor;
# the string 'auto' is no longer accepted by current scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until leaves are pure / too small)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4, 5, 6, 7, 8, 9, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# First create the base model to tune (seeded so the search is reproducible).
rf = RandomForestRegressor(random_state=42)
# Randomized search: sample 100 parameter combinations, score each one with the
# time-series splitter `ts_split` (ordered folds, not shuffled k-fold), and use
# all available cores. random_state fixes which combinations are sampled.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_squared_error',
                              cv = ts_split, verbose=2, n_jobs=-1,
                              return_train_score=True, random_state=42)

# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_

In [None]:
rf_random.cv_results_

In [None]:
rf_random.best_estimator_

In [None]:
best_random = rf_random.best_estimator_
best_pred = best_random.predict(X_val)
best_pred

In [None]:
#Evaluate the Best Random Search Model

model_metrics(y_val, best_pred)

In [None]:
#a = pd.DataFrame(rf_random.cv_results_).sort_values(by=['rank_test_score'])
#a

In [None]:
#Plot actual vs prediction (validation set, tuned random forest)
with plt.style.context('default'):
    # plt.subplots already creates the figure; a separate plt.figure() call
    # would only leave an extra empty figure behind.
    plt.subplots(figsize=(8,4))
    plt.plot(return_conversion_scale(y_val), label = "Actual Conversion")
    plt.plot(return_conversion_scale(best_pred), label = "Prediction")
    plt.grid()
    plt.title('Random Forrest Regression Prediction (Best Parameter Tuning)')
    plt.xlabel('Time')
    plt.ylabel('Conversion (%)')
    plt.legend(loc='best')
    plt.show()

In [None]:
# Training curves: re-fit the best estimator from the randomized search for
# each number of trees (10, 15, ..., 200), varying nothing else.
from sklearn.model_selection import GridSearchCV
tree_grid = {'n_estimators': list(range(10, 201, 5))}

# Exhaustive search over the tree counts, scored by negative MAE on the
# time-series folds.
tree_grid_search = GridSearchCV(
    best_random,
    param_grid=tree_grid,
    scoring='neg_mean_absolute_error',
    cv=ts_split,
    n_jobs=-1,
    verbose=3,
)
tree_grid_search.fit(X_train, y_train)

In [None]:
tree_grid_search.best_params_

In [None]:
train_scores = pd.DataFrame(rf_random.cv_results_)
train_scores.head()

In [None]:
def plot_results(model, param = 'n_estimators', name = 'Num Trees'):
    """Plot mean CV test score and mean fit time against one searched parameter.

    Parameters
    ----------
    model : fitted search object exposing ``cv_results_``
        Read keys: 'mean_test_score', 'mean_fit_time', 'param_<param>'.
    param : str
        Name of the searched hyperparameter.
    name : str
        Human-readable label used for the x-axis and the titles.

    Notes
    -----
    The y-limits are fixed: -0.2..0 for the score panel and 0..10 seconds for
    the timing panel. Only the test score is drawn; train scores are not.
    """
    results = model.cv_results_
    test_scores = results['mean_test_score']
    train_time = results['mean_fit_time']
    param_values = list(results['param_%s' % param])

    fig, (score_ax, time_ax) = plt.subplots(1, 2, figsize=(10, 6))

    # Left panel: score versus parameter value
    score_ax.plot(param_values, test_scores, 'go-', label = 'test')
    score_ax.set_ylim(bottom = -0.2, top = 0)
    score_ax.legend()
    score_ax.grid()
    score_ax.set_xlabel(name)
    score_ax.set_ylabel('Neg Mean Absolute Error')
    score_ax.set_title('Score vs %s' % name)

    # Right panel: fit time versus parameter value
    time_ax.plot(param_values, train_time, 'ro-')
    time_ax.set_ylim(bottom = 0.0, top = 10.0)
    time_ax.grid()
    time_ax.set_xlabel(name)
    time_ax.set_ylabel('Train Time (sec)')
    time_ax.set_title('Training Time vs %s' % name)

    fig.tight_layout(pad = 4)

In [None]:
plot_results(tree_grid_search)

In [None]:
RFR_final = rf_random
RFR_predict = RFR_final.predict(X_test)

In [None]:
#measure model performance
model_metrics(y_test, RFR_predict)

In [None]:
#Plot actual vs prediction (hold-out test set, tuned random forest)
with plt.style.context('default'):
    # plt.subplots already creates the figure; a separate plt.figure() call
    # would only leave an extra empty figure behind.
    plt.subplots(figsize=(8,4))
    plt.plot(return_conversion_scale(y_test), label = "Actual Conversion")
    plt.plot(return_conversion_scale(RFR_predict), label = "Prediction")
    plt.grid()
    plt.title('Random Forrest Regression Prediction (Best Parameter Tuning)')
    plt.xlabel('Time')
    plt.ylabel('Conversion (%)')
    plt.legend(loc='best')
    plt.show()

# MACHINE LEARNING MODEL : SUPPORT VECTOR REGRESSION

In [None]:
from sklearn.svm import SVR
svr_rbf = SVR()
svr_rbf.fit(X_train, y_train)

In [None]:
svr_pred = svr_rbf.predict(X_val)
svr_pred

In [None]:
#Evaluating SVR performance
model_metrics(y_val, svr_pred)

In [None]:
#Plot actual vs prediction (validation set, default SVR)
with plt.style.context('default'):
    # plt.subplots already creates the figure; a separate plt.figure() call
    # would only leave an extra empty figure behind.
    plt.subplots(figsize=(8,4))
    plt.plot(return_conversion_scale(y_val), label = "Actual Conversion")
    plt.plot(return_conversion_scale(svr_pred), label = "Prediction")
    plt.grid()
    plt.title('Support Vector Regression Prediction')
    plt.xlabel('Time')
    plt.ylabel('Conversion (%)')
    plt.legend(loc='best')
    plt.show()

In [None]:
#SVR polynomial kernel with 3 degree
#svr_poly3 = SVR(kernel='poly', gamma='auto', degree=3)
#SVR polynomial kernel with 4 degree
#svr_poly4 = SVR(kernel='poly', gamma='auto', degree=4)
#SVR polynomial kernel with 5 degree
#svr_poly5 = SVR(kernel='poly', gamma='auto', degree=5)

In [None]:
#svrs = [svr_rbf, svr_poly3, svr_poly4, svr_poly5]
#kernel_label = ['rbf', '3 degree Polynomial', '4 degree Polynomial', '5 degree Polynomial']
#fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(10, 10), sharey=True)
#for ix, svr in enumerate(svrs):
#    svr.fit(X_train, y_train)
#    svr_poly_pred = svr.predict(X_val)
#    print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, svr_poly_pred))
#    print('Mean Squared Error:', metrics.mean_squared_error(y_val, svr_poly_pred))
#    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, svr_poly_pred)))
#    print('Coefficient of Determination:', r2_score(y_val, svr_poly_pred)) 
#    with plt.style.context('ggplot'):
#        axes[ix].plot(y_val, label = "Actual Conversion")
#        axes[ix].plot(svr_poly_pred,
#                  label='{} model'.format(kernel_label[ix]))
#        axes[ix].legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),
#                    ncol=1, fancybox=True, shadow=True)
#        axes[ix].legend(loc='best')
#fig.text(0.5, 0.04, 'Time', ha='center', va='center')
#fig.text(0.06, 0.5, 'Conversion', ha='center', va='center', rotation='vertical')
#fig.suptitle("Support Vector Regression Model", fontsize=14)
#plt.show()

In [None]:
#Set parameters for grid search : 1st round (kernel choice)
kernel = ['poly', 'rbf', 'sigmoid']
degree = [2, 3, 4, 5]

# `degree` only affects the polynomial kernel. Crossing it with 'rbf' and
# 'sigmoid' would refit identical models once per degree, so the grid is
# split into one sub-grid for 'poly' and one for the remaining kernels.
param_grid = [
    {'kernel': ['poly'], 'degree': degree},
    {'kernel': [k for k in kernel if k != 'poly']},
]

print(param_grid)

In [None]:
#1st round: exhaustive grid search (not a randomized search) over `param_grid` to pick the SVR kernel settings
from sklearn.model_selection import GridSearchCV
# Create the grid search model and fit to the training data.
# Candidates are scored by negative mean absolute error on the `ts_split` folds.
svr_grid_first = GridSearchCV(SVR(), param_grid=param_grid, verbose = 3, n_jobs=-1, cv = ts_split,
                                scoring = 'neg_mean_absolute_error')
svr_grid_first.fit(X_train, y_train)

In [None]:
svr_grid_first.best_params_

In [None]:
best_grid_first = svr_grid_first.best_estimator_

In [None]:
#Grid search to find best hyperparameters : 2nd round
# Tunes gamma, C and epsilon starting from the best estimator of the 1st round.
gamma = ['scale', 'auto']
C = [0.001, 0.01, 0.1, 1, 10]
epsilon = [0.0001, 0.001, 0.01, 0.1]

# Create the parameter grid (searched exhaustively by GridSearchCV below)
param_grid_final = {'gamma': gamma,
               'C': C,
               'epsilon': epsilon}

# Create the grid search model and fit to the training data
svr_grid_final = GridSearchCV(best_grid_first, param_grid=param_grid_final, verbose = 3, n_jobs=-1, cv = ts_split,
                                scoring = 'neg_mean_absolute_error')
svr_grid_final.fit(X_train, y_train)

In [None]:
svr_grid_final.best_params_

In [None]:
#Predict hyper parameter tuned SVR
best_svr_pred = svr_grid_final.predict(X_val)

In [None]:
#Evaluating hyper parameter tuned SVR performance

model_metrics(y_val, best_svr_pred)

In [None]:
#Plot actual vs prediction (validation set, tuned SVR)
with plt.style.context('default'):
    # plt.subplots already creates the figure; a separate plt.figure() call
    # would only leave an extra empty figure behind.
    plt.subplots(figsize=(8,4))
    plt.plot(return_conversion_scale(y_val), label = "Actual Conversion")
    plt.plot(return_conversion_scale(best_svr_pred), label = "Prediction")
    plt.grid()
    plt.title('Support Vector Regression Prediction')
    plt.xlabel('Time')
    plt.ylabel('Conversion (%)')
    plt.legend(loc='best')
    plt.show()

In [None]:
#Testing the final model 
SVR_final = svr_grid_final
SVR_predict = SVR_final.predict(X_test)

In [None]:
#Evaluating final model SVR performance

model_metrics(y_test, SVR_predict)

In [None]:
#Plot actual vs prediction (hold-out test set, tuned SVR)
with plt.style.context('default'):
    # plt.subplots already creates the figure; a separate plt.figure() call
    # would only leave an extra empty figure behind.
    plt.subplots(figsize=(8,4))
    plt.plot(return_conversion_scale(y_test), label = "Actual Conversion")
    plt.plot(return_conversion_scale(SVR_predict), label = "Prediction")
    plt.grid()
    plt.title('Support Vector Regression Prediction')
    plt.xlabel('Time')
    plt.ylabel('Conversion (%)')
    plt.legend(loc='best')
    plt.show()

In [None]:
svr_rbf.get_params()

# RNN - LSTM

In [None]:
#Import RNN-LSTM library
import tensorflow as tf
from keras.wrappers.scikit_learn import KerasRegressor
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import GridSearchCV

In [None]:
# The LSTM layers are fed 3-D input (samples, timesteps, features).
# Insert a length-1 timestep axis into the 2-D feature matrices.
X_train_adj = X_train[:, np.newaxis, :]
X_val_adj = X_val[:, np.newaxis, :]
print(X_train_adj.shape, X_val_adj.shape, y_train.shape, y_val.shape )

In [None]:
# Build the baseline network: two stacked LSTM layers (75 and 30 units),
# dropout of 0.2, and a single-unit dense output; trained with MSE loss / Adam.
lstm_input_shape = (X_train_adj.shape[1], X_train_adj.shape[2])
model_lstm = tf.keras.Sequential([
    tf.keras.layers.LSTM(75, return_sequences = True, input_shape = lstm_input_shape),
    tf.keras.layers.LSTM(units=30),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=1),
])
model_lstm.compile(loss = 'mse', optimizer = 'adam')
model_lstm.summary()

In [None]:
#Fit the model
history_lstm = model_lstm.fit(X_train_adj, y_train, epochs = 100, batch_size=10, validation_data = (X_val_adj, y_val), shuffle=False)

In [None]:
# Training vs. validation loss per epoch
loss_history = history_lstm.history
plt.plot(loss_history['loss'], label='Train')
plt.plot(loss_history['val_loss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
lstm_first_pred = model_lstm(X_val_adj)

In [None]:
#Convert Tensor value to array
lstm_first_pred_a = lstm_first_pred.numpy()

In [None]:
model_metrics(y_val, lstm_first_pred_a)

In [None]:
#Plot actual vs prediction (validation set, baseline LSTM)
with plt.style.context('default'):
    # plt.subplots already creates the figure; a separate plt.figure() call
    # would only leave an extra empty figure behind.
    plt.subplots(figsize=(8,4))
    plt.plot(return_conversion_scale(y_val), label = "Actual Conversion")
    # Use the NumPy copy of the prediction, the same array the metrics cell
    # uses, rather than the raw tensor returned by the model call.
    plt.plot(return_conversion_scale(lstm_first_pred_a), label = "Prediction")
    plt.grid()
    plt.title('LSTM')
    plt.xlabel('Time')
    plt.ylabel('Conversion (%)')
    plt.legend(loc='best')
    plt.show()

In [None]:
#Create LSTM function model
def create_model():
    """Build and compile the stacked-LSTM regressor used for hyperparameter search.

    The input shape (timesteps, features) is read from the notebook-level
    ``X_train_adj`` array. The network is LSTM(75, returning sequences) ->
    LSTM(30) -> Dropout(0.2) -> Dense(1), compiled with MSE loss, the Adam
    optimizer and a mean-squared-error metric. The model summary is printed
    on every call.

    Returns
    -------
    tf.keras.Sequential
        The compiled, untrained model.
    """
    layers = tf.keras.layers
    input_shape = (X_train_adj.shape[1], X_train_adj.shape[2])

    model = tf.keras.Sequential([
        layers.LSTM(75, return_sequences = True, input_shape = input_shape),
        layers.LSTM(units=30),
        layers.Dropout(0.2),
        layers.Dense(units=1),
    ])
    model.compile(loss = 'mse', optimizer = 'adam', metrics=[tf.keras.metrics.MeanSquaredError()])
    model.summary()
    return model

In [None]:
model_tuning = KerasRegressor(build_fn=create_model, verbose=0)

In [None]:
# define the grid search parameters: every batch size is combined with every epoch count
batch_size = [32, 64, 128, 256]
epochs = [10, 20, 50, 100, 1000]

# NOTE: this rebinds `param_grid`, which earlier in the notebook held the SVR kernel grid
param_grid = dict(batch_size=batch_size, epochs=epochs)
# No `scoring` is passed here; candidates are evaluated on the `ts_split` folds.
# NOTE(review): n_jobs=-1 runs the Keras fits in parallel worker processes - confirm this works in the target environment
lstm_grid = GridSearchCV(estimator=model_tuning, param_grid=param_grid, n_jobs=-1, cv=ts_split)
grid_result = lstm_grid.fit(X_train_adj, y_train)

In [None]:
# Report the best configuration, then mean (std) test score for every candidate.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

In [None]:
pd.DataFrame(grid_result.cv_results_)

In [None]:
best_lstm = grid_result.best_estimator_
tune_predict = best_lstm.predict(X_val_adj)
model_metrics(y_val, tune_predict)