In [16]:
# explore the number of selected features for RFE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

import warnings
warnings.filterwarnings("ignore")

#------------------------------------------------------------------------------------------------------------------------------

def corfeatures(data, threshold=0.95, target='Profit'):
    """Drop features that are highly correlated with an earlier feature.

    The pairwise correlation matrix of every column except ``target`` is
    computed with ``DataFrame.corr()``. A column is discarded when its absolute
    correlation with any column positioned before it in that matrix is strictly
    greater than ``threshold``. Earlier columns take part in the comparison
    even when they are themselves discarded.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset holding the feature columns and the ``target`` column.
    threshold : float, default 0.95
        Absolute correlation above which the later column of a pair is dropped.
    target : str, default 'Profit'
        Column excluded from the correlation analysis.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant columns; ``data`` is not modified.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    correlation_matrix = data.drop([target], axis=1).corr()
    too_similar = correlation_matrix.abs().values > threshold
    # Only the strict lower triangle matters: column i is compared with the
    # columns j < i, never with itself or with later columns.
    is_redundant = np.tril(too_similar, k=-1).any(axis=1)
    correlated_features = list(correlation_matrix.columns[is_redundant])
    return data.drop(correlated_features, axis=1)

def sol(l, X, y, select_features=False):
    """Fit cross-validated recursive feature elimination (RFECV) for one estimator.

    Parameters
    ----------
    l : estimator
        Regressor handed to ``RFECV`` as its ``estimator``.
    X : pandas.DataFrame or array
        Feature matrix.
    y : array-like
        Target values.
    select_features : bool, default False
        When False, ``X`` is returned untouched, exactly as before this
        parameter existed. When True, only the columns RFECV kept are returned.

    Returns
    -------
    ``X`` itself, or its RFECV-selected columns when ``select_features`` is True.
    """
    # n_splits is passed by keyword: RepeatedKFold only accepts keyword
    # arguments in current scikit-learn releases, so RepeatedKFold(10) fails there.
    rfecv = RFECV(estimator=l, step=1, cv=RepeatedKFold(n_splits=10), scoring='r2')
    rfecv.fit(X, y)
    if not select_features:
        # NOTE: the selection is fitted but not applied by default; the
        # notebook's loops predict on test frames that still carry every column.
        return X
    f = rfecv.get_support(indices=True)  # integer positions of the kept features
    if hasattr(X, 'iloc'):
        return X.iloc[:, f]
    return X[:, f]

#------------------------------------------------------------------------------------------------------------------------------

def corfeatures1(data, test, threshold=0.95, target='Profit'):
    """Drop highly correlated features from a training frame and a test frame.

    The redundant columns are determined from ``data`` only: the correlation
    matrix of every column except ``target`` is computed with
    ``DataFrame.corr()``, and a column is marked when its absolute correlation
    with any column positioned before it is strictly greater than
    ``threshold``. The same columns are then removed from both frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Training dataset holding the feature columns and the ``target`` column.
    test : pandas.DataFrame
        Second dataset from which the same columns are removed.
    threshold : float, default 0.95
        Absolute correlation above which the later column of a pair is dropped.
    target : str, default 'Profit'
        Column of ``data`` excluded from the correlation analysis.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Reduced copies of ``data`` and ``test``; the inputs are not modified.

    Raises
    ------
    KeyError
        If ``target`` is missing from ``data`` or a column to drop is missing
        from ``test``.
    """
    correlation_matrix = data.drop([target], axis=1).corr()
    too_similar = correlation_matrix.abs().values > threshold
    # Strict lower triangle: column i is only compared with the columns j < i.
    is_redundant = np.tril(too_similar, k=-1).any(axis=1)
    correlated_features = list(correlation_matrix.columns[is_redundant])
    newdata = data.drop(correlated_features, axis=1)
    newtest = test.drop(correlated_features, axis=1)
    return newdata, newtest

In [None]:
#Split provided dataset into train & test dataset

data = pd.read_excel("ml_demand-2.xlsx")

newdata = corfeatures(data)

x = newdata.drop(['Profit'], axis = 1)
y = newdata['Profit'].copy()

reg = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]

for i in reg:
    print()
    print(i)
    x_new = sol(i, x, y)
    print()

    # Splitting data. The fixed random_state keeps the train/test partition
    # identical on every run and for every regressor, so the printed scores
    # are measured on the same held-out rows.
    x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size=0.25, random_state=42)

    d = i
    d.fit(x_train, y_train)
    predictions = d.predict(x_test)
    # Printed in order: R^2, mean absolute error, mean squared error
    print(r2_score(y_test, predictions))
    print(mean_absolute_error(y_test, predictions))
    print(mean_squared_error(y_test, predictions))
    print("-------------------------------------------------------------------------------------------------------------")
    print()

    

In [None]:
#Separate datasets for train & test

#Read train and test dataset
data=pd.read_excel("full.xlsx")
trnst = pd.read_excel("disruptns.xlsx")

#Removing correlated features
newdata, test = corfeatures1(data, trnst)

#define x & y for train
x = newdata.drop(['Profit'], axis = 1 )
y = newdata['Profit'].copy()

#define x & y for test
test_x = test.drop(['Profit'], axis = 1)
test_y = test['Profit'].copy()

reg = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]

for i in reg:
    print()
    print(i)
    x_new = sol(i, x, y)
    print()

    d = i
    d.fit(x_new, y)
    # Index the test frame by the training columns so the model is given the
    # same features, in the same order, that it was fitted on.
    predictions = d.predict(test_x[x_new.columns])
    # Printed in order: R^2, mean absolute error, mean squared error
    print(r2_score(test_y, predictions))
    print(mean_absolute_error(test_y, predictions))
    print(mean_squared_error(test_y, predictions))
    print("-------------------------------------------------------------------------------------------------------------")
    print()

In [None]:
# Deep Neural Network Regression

#Reading data
data=pd.read_excel("full.xlsx")
trnst = pd.read_excel("disruptns.xlsx")

#Removing correlated features
newdata, test = corfeatures1(data, trnst)

#define x & y for train
x = newdata.drop(['Profit'], axis = 1 )
y = newdata['Profit'].copy()

#define x & y for test
test_x = test.drop(['Profit'], axis = 1)
test_y = test['Profit'].copy()

NN_model = Sequential()

# The Input Layer : one input per training feature
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = x.shape[1], activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer : a single linear unit for the regression target
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()

# A weights file is written whenever the validation loss improves.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

#Fitting the model
NN_model.fit(x, y, epochs=500, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)

# The network reads its inputs purely by position, so index the test frame by
# the training columns to guarantee the same feature order as during fitting.
# NOTE(review): the weights used here are those of the last epoch; none of the
# saved checkpoint files is loaded back before predicting.
predictions = NN_model.predict(test_x[x.columns])
print(r2_score(test_y, predictions))
print(mean_absolute_error(test_y, predictions))

In [20]:
#Print predictions

#Read train and test dataset
data=pd.read_excel("full.xlsx")
trnst = pd.read_excel("progrm.xlsx")

#Removing correlated features
newdata, test = corfeatures1(data, trnst)

#define x & y for train
x = newdata.drop(['Profit'], axis = 1 )
y = newdata['Profit'].copy()

#define x for test (no target is read from this file; only predictions are printed)
test_x = test

reg = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]

for i in reg:
    print()
    print(i)
    x_new = sol(i, x, y)
    print()

    d = i
    d.fit(x_new, y)
    # Index the test frame by the training columns so the model is given the
    # same features, in the same order, that it was fitted on; any other
    # column present in the file is left out.
    predictions = d.predict(test_x[x_new.columns])
    print(predictions)
    print("-------------------------------------------------------------------------------------------------------------")
    print()
    




LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

[-24861.17902099  47059.82521258]
-------------------------------------------------------------------------------------------------------------


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

[-22510.          49727.00319645]
-------------------------------------------------------------------------------------------------------------


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split