## <span id="2"></span> ** Importing Libraries and Reading the Dataset **

In [1]:
import timeit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import cross_val_score
from collections import Counter
from IPython.core.display import display, HTML
sns.set_style('darkgrid')
from datetime import datetime

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
drivePath = "drive/MyDrive/Implementation/Testing_Diff_Ratios"

In [7]:
filepath = 'drive/MyDrive/Implementation/Prepared_dataset.csv'
dataset = pd.read_csv(filepath,  index_col = 0)


### <span id="4"></span> ** Data Preprocessing **

Are there missing values? There isn't any missing values as shown below.

In [8]:
X = dataset.drop(["consumption_unit_total"], axis=1)
y = dataset[["consumption_unit_total"]].values.reshape(-1,1)


In [9]:
X

Unnamed: 0,community,temperature,NumOfConnected-COMMELEC,NumOfConnected-FREENRESIE,NumOfConnected-GOVTELEC,NumOfConnected-INDTELEC,NumOfConnected-RESIEXPE,NumOfConnected-TOTAL,Expat_Ratio,Multi.Storey.Building,Multi.Storey.Ratio.Building,Investment.Villa,Public.Building,Industrial.Building,Total.Buildings,population,population_density,month_int
208,112,33.7,3027,0,22,0,1060,4119,0.990654,204.0,0.0,0.0,185.0,0.0,401.0,6621.0,16.511222,1.500000
209,113,33.7,3191,0,4,0,1518,4719,0.996063,307.0,0.0,0.0,55.0,0.0,383.0,14963.0,39.067885,1.500000
210,114,33.7,3348,0,4,0,597,3950,0.998328,174.0,0.0,0.0,157.0,0.0,332.0,2563.0,7.719880,1.500000
211,115,33.7,1326,0,8,0,294,1628,1.000000,73.0,0.0,0.0,33.0,0.0,106.0,3750.0,35.377358,1.500000
212,116,33.7,2799,0,9,0,2422,5240,0.996298,250.0,0.0,0.0,20.0,0.0,281.0,18512.0,65.879004,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6953,945,23.6,1,1,1,0,0,11,0.000000,1.0,0.0,0.0,5.0,0.0,48.0,417.0,8.687500,0.133975
6954,951,23.6,15,3,2,0,3,35,0.200000,1.0,0.0,0.0,3.0,0.0,44.0,607.0,13.795455,0.133975
6955,961,23.6,31,2,11,0,3,99,0.054545,2.0,0.0,13.0,5.0,0.0,92.0,665.0,7.228261,0.133975
6956,967,23.6,7,0,1,0,0,12,0.000000,0.0,0.0,0.0,5.0,0.0,14.0,7.0,0.500000,0.133975


In [10]:
# Splitting the dataset into the Training set and Test set
## Here we have two ways, Either Go RANDOM or SEQUENTIAL SPLIT
from sklearn.model_selection import train_test_split
split_method="random"
if (split_method=="random"):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)
else:
    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, shuffle=False)

In [11]:
print("Shape of X_train: ",X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of y_test",y_test.shape)

Shape of X_train:  (3955, 18)
Shape of X_test:  (1695, 18)
Shape of y_train:  (3955, 1)
Shape of y_test (1695, 1)


### <span id="15"></span> ** Varying Train Test Ratio **

In [None]:
all_results=pd.DataFrame()

In [None]:
all_results


In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
    # Fitting the Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor

In [None]:
# Fitting the Random Forest Regression to the dataset

for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)
    
    model_xgb = xgb.XGBRegressor()

    start = timeit.default_timer() ##
    model_xgb.fit(X_train, y_train)
    stop = timeit.default_timer() ##
    time_xgb=(stop - start)*1000 ##


    # Predicting Cross Validation Score the Test set results
    cv_xgb = cross_val_score(estimator = model_xgb, X = X_train, y = y_train, cv = 10)

    # Predicting R2 Score the Train set results
    y_pred_xgb_train = model_xgb.predict(X_train)
    r2_score_xgb_train = r2_score(y_train, y_pred_xgb_train)

    # Predicting R2 Score the Test set results
    y_pred_xgb_test = model_xgb.predict(X_test)
    r2_score_xgb_test = r2_score(y_test, y_pred_xgb_test)

    # Predicting RMSE the Test set results
    rmse_xgb =  (np.sqrt(mean_squared_error(y_test,y_pred_xgb_test)))
    mae_xgb = mean_absolute_error(y_test,y_pred_xgb_test)
    med_ae_xgb = median_absolute_error(y_test,y_pred_xgb_test)
    


    print('CV: ', cv_xgb.mean())
    print('R2_score (train): ', r2_score_xgb_train)
    print('R2_score (test): ', r2_score_xgb_test)
    print("RMSE: ", rmse_xgb)

    mae_xgb = mean_absolute_error(y_test, y_pred_xgb_test)
    med_ae_xgb = median_absolute_error(y_test, y_pred_xgb_test)
    print("MAE: ", mae_xgb)
    print("MAE: ", med_ae_xgb)
    one_results=pd.DataFrame([["XGB", test_size  , rmse_xgb, r2_score_xgb_train,
                               r2_score_xgb_test, cv_xgb.mean(),mae_xgb, med_ae_xgb, time_xgb]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)', 'R2_Score(test)', 'Cross-Validation', 'Mean Absolute Error (Normalized)', 'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])


CV:  0.9487350486154362
R2_score (train):  0.9972211074523198
R2_score (test):  0.9639301806122419
RMSE:  0.030380038198565735
MAE:  0.015061005024854175
MAE:  0.006694710241935655
CV:  0.944799355106986
R2_score (train):  0.9973300985123977
R2_score (test):  0.9646931835947838
RMSE:  0.031230519209569752
MAE:  0.016065375666082047
MAE:  0.007181402402678276
CV:  0.9360636725652259
R2_score (train):  0.99805603201046
R2_score (test):  0.9587048382220679
RMSE:  0.03464366210995074
MAE:  0.0172404136751806
MAE:  0.007114539798326935
CV:  0.9398776501304191
R2_score (train):  0.9985373793217692
R2_score (test):  0.9378576700896972
RMSE:  0.042148077372700016
MAE:  0.018175505611400678
MAE:  0.007310974539258459
CV:  0.9462225755955341
R2_score (train):  0.9987502136130872
R2_score (test):  0.9364067171693886
RMSE:  0.0430671272162261
MAE:  0.018592768581151543
MAE:  0.007632485702648134
CV:  0.9382585981187164
R2_score (train):  0.9991232146805291
R2_score (test):  0.9342316174780655
RMSE

In [None]:
# Fitting the Random Forest Regression to the dataset

for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)
    estimators = [('Randomforest',  RandomForestRegressor(n_estimators = 75, random_state = 0)),
                  ('ANN', MLPRegressor(alpha=0.05, random_state=1, max_iter=50, verbose=False, 
                                       hidden_layer_sizes=(100,50,50), n_iter_no_change=20)),
                  ('lr', LinearRegression())]

    final_estimator = GradientBoostingRegressor(
        n_estimators=25, subsample=0.5, min_samples_leaf=10, 
        random_state=42)
    model_stk = StackingRegressor(
        estimators=estimators,
        final_estimator=final_estimator)

    
    start = timeit.default_timer() ##
    model_stk.fit(X_train, y_train.ravel())
    stop = timeit.default_timer() ##
    time_rf=(stop - start)*1000 ##

    from sklearn.metrics import r2_score

    # Predicting Cross Validation Score
    cv_rf = cross_val_score(estimator = model_stk, X = X_train, y = y_train.ravel(), cv = 10)

    # Predicting R2 Score the Train set results
    y_pred_rf_train = model_stk.predict(X_train)
    r2_score_rf_train = r2_score(y_train, y_pred_rf_train)

    # Predicting R2 Score the Test set results
    y_pred_rf_test = model_stk.predict(X_test)
    r2_score_rf_test = r2_score(y_test, y_pred_rf_test)

    # Predicting RMSE the Test set results
    rmse_rf = (np.sqrt(mean_squared_error(y_test, y_pred_rf_test)))
    print('CV: ', cv_rf.mean())
    print('R2_score (train): ', r2_score_rf_train)
    print('R2_score (test): ', r2_score_rf_test)
    print("RMSE: ", rmse_rf)

    mae_rf = mean_absolute_error(y_test, y_pred_rf_test)
    med_ae_rf = median_absolute_error(y_test, y_pred_rf_test)
    print("MAE: ", mae_rf)
    print("MAE: ", med_ae_rf)
    one_results=pd.DataFrame([["STK", test_size  , rmse_rf, r2_score_rf_train,
                               r2_score_rf_test, cv_rf.mean(),mae_rf, med_ae_rf, time_rf]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)', 'R2_Score(test)', 'Cross-Validation', 'Mean Absolute Error (Normalized)', 'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])






CV:  0.9295311264230284
R2_score (train):  0.9759456879883864
R2_score (test):  0.9567339350573945
RMSE:  0.033272852185143066
MAE:  0.01938905097316027
MAE:  0.011100444403948525






CV:  0.9280848921097086
R2_score (train):  0.9735468628522918
R2_score (test):  0.952530533816599
RMSE:  0.03621238247927812
MAE:  0.02072428332797743
MAE:  0.01148938510989637






CV:  0.9253979761853645
R2_score (train):  0.9754279305632012
R2_score (test):  0.94462162301132
RMSE:  0.04011846611882886
MAE:  0.02189164145058499
MAE:  0.011647604901353414






CV:  0.9247627620973777
R2_score (train):  0.973925933883074
R2_score (test):  0.9230650712952426
RMSE:  0.04689707758936859
MAE:  0.02349265425742808
MAE:  0.01175951458805203






CV:  0.9255807129985453
R2_score (train):  0.9723394291112369
R2_score (test):  0.9262557594925247
RMSE:  0.046377179666962885
MAE:  0.02405170156816506
MAE:  0.011784304073908361






CV:  0.9218773942656536
R2_score (train):  0.9776193428407084
R2_score (test):  0.9210669571991656
RMSE:  0.047559165617444076
MAE:  0.025233765839383908
MAE:  0.012661645497038874






CV:  0.9211553430486663
R2_score (train):  0.9762720624305588
R2_score (test):  0.9123431120479607
RMSE:  0.0499977367717133
MAE:  0.02693085489233003
MAE:  0.013407027303753705




In [None]:
all_results

Unnamed: 0,Model,Test Data Ratio,RMSE,R2_Score(training),R2_Score(test),Cross-Validation,Mean Absolute Error (Normalized),Median Absolute Error (Normalized),Time Taken (ms)
0,XGB,10,0.03038,0.997221,0.96393,0.948735,0.015061,0.006695,529.195694
0,XGB,20,0.031231,0.99733,0.964693,0.944799,0.016065,0.007181,383.65984
0,XGB,30,0.034644,0.998056,0.958705,0.936064,0.01724,0.007115,448.246866
0,XGB,40,0.042148,0.998537,0.937858,0.939878,0.018176,0.007311,611.542809
0,XGB,50,0.043067,0.99875,0.936407,0.946223,0.018593,0.007632,274.917241
0,XGB,60,0.043412,0.999123,0.934232,0.938259,0.019629,0.00768,214.626284
0,XGB,70,0.045846,0.999498,0.926295,0.932086,0.021632,0.008849,183.478892
0,STK,10,0.033273,0.975946,0.956734,0.929531,0.019389,0.0111,21859.706185
0,STK,20,0.036212,0.973547,0.952531,0.928085,0.020724,0.011489,20201.359648
0,STK,30,0.040118,0.975428,0.944622,0.925398,0.021892,0.011648,19320.7775


In [None]:
# Fitting the Random Forest Regression to the dataset

for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)
    
  
        
    
    
    model_lgb = LGBMRegressor()
    start = timeit.default_timer() ##
    model_lgb.fit(X_train, y_train.ravel())
    stop = timeit.default_timer() ##
    time_rf=(stop - start)*1000 ##

    from sklearn.metrics import r2_score

    # Predicting Cross Validation Score
    cv_rf = cross_val_score(estimator = model_lgb, X = X_train, y = y_train.ravel(), cv = 10)

    # Predicting R2 Score the Train set results
    y_pred_rf_train = model_lgb.predict(X_train)
    r2_score_rf_train = r2_score(y_train, y_pred_rf_train)

    # Predicting R2 Score the Test set results
    y_pred_rf_test = model_lgb.predict(X_test)
    r2_score_rf_test = r2_score(y_test, y_pred_rf_test)

    # Predicting RMSE the Test set results
    rmse_rf = (np.sqrt(mean_squared_error(y_test, y_pred_rf_test)))
    print('CV: ', cv_rf.mean())
    print('R2_score (train): ', r2_score_rf_train)
    print('R2_score (test): ', r2_score_rf_test)
    print("RMSE: ", rmse_rf)

    mae_rf = mean_absolute_error(y_test, y_pred_rf_test)
    med_ae_rf = median_absolute_error(y_test, y_pred_rf_test)
    print("MAE: ", mae_rf)
    print("MAE: ", med_ae_rf)
    one_results=pd.DataFrame([["LightGB", test_size  , rmse_rf, r2_score_rf_train,
                               r2_score_rf_test, cv_rf.mean(),mae_rf, med_ae_rf, time_rf]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)', 'R2_Score(test)', 'Cross-Validation', 'Mean Absolute Error (Normalized)', 'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])


CV:  0.9544067260679976
R2_score (train):  0.981779148032281
R2_score (test):  0.973145992996695
RMSE:  0.026213250676530228
MAE:  0.01411836517314859
MAE:  0.0074587922998767126
CV:  0.9511874170153038
R2_score (train):  0.9814346333717342
R2_score (test):  0.9673206260469553
RMSE:  0.03004600871416539
MAE:  0.01612486295566913
MAE:  0.007784055024223288
CV:  0.9450954340169613
R2_score (train):  0.9810108866128865
R2_score (test):  0.961649296752014
RMSE:  0.03338572980763741
MAE:  0.01707218490546305
MAE:  0.008005642103249042
CV:  0.9485323581751572
R2_score (train):  0.9836206483951083
R2_score (test):  0.9351353638712832
RMSE:  0.04306138514657677
MAE:  0.018559626795427702
MAE:  0.008036582866908906
CV:  0.9480619271583605
R2_score (train):  0.984929979676911
R2_score (test):  0.9392785718898249
RMSE:  0.042083443539975625
MAE:  0.019448152644860675
MAE:  0.008487404933109469
CV:  0.9389875801206224
R2_score (train):  0.9841587762278698
R2_score (test):  0.9335667275677864
RMSE:

In [None]:
# Fitting the Random Forest Regression to the dataset

for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)
    from sklearn.ensemble import RandomForestRegressor
    
    model_gbm= GradientBoostingRegressor(learning_rate=0.1,random_state=1)
    start = timeit.default_timer() ##
    model_gbm.fit(X_train, y_train.ravel())
    stop = timeit.default_timer() ##
    time_rf=(stop - start)*1000 ##

    from sklearn.metrics import r2_score

    # Predicting Cross Validation Score
    cv_rf = cross_val_score(estimator = model_gbm, X = X_train, y = y_train.ravel(), cv = 10)

    # Predicting R2 Score the Train set results
    y_pred_rf_train = model_gbm.predict(X_train)
    r2_score_rf_train = r2_score(y_train, y_pred_rf_train)

    # Predicting R2 Score the Test set results
    y_pred_rf_test = model_gbm.predict(X_test)
    r2_score_rf_test = r2_score(y_test, y_pred_rf_test)

    # Predicting RMSE the Test set results
    rmse_rf = (np.sqrt(mean_squared_error(y_test, y_pred_rf_test)))
    print('CV: ', cv_rf.mean())
    print('R2_score (train): ', r2_score_rf_train)
    print('R2_score (test): ', r2_score_rf_test)
    print("RMSE: ", rmse_rf)

    mae_rf = mean_absolute_error(y_test, y_pred_rf_test)
    med_ae_rf = median_absolute_error(y_test, y_pred_rf_test)
    print("MAE: ", mae_rf)
    print("MAE: ", med_ae_rf)
    one_results=pd.DataFrame([["GBM", test_size  , rmse_rf, r2_score_rf_train,
                               r2_score_rf_test, cv_rf.mean(),mae_rf, med_ae_rf, time_rf]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)', 'R2_Score(test)', 'Cross-Validation', 'Mean Absolute Error (Normalized)', 'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])


CV:  0.9248951440725165
R2_score (train):  0.9457984734398032
R2_score (test):  0.9398488768876022
RMSE:  0.03923179707100925
MAE:  0.025250949683376815
MAE:  0.014821005877573165
CV:  0.9249052184268972
R2_score (train):  0.9480327246530157
R2_score (test):  0.9371678428790268
RMSE:  0.04166207628677024
MAE:  0.025714070465183227
MAE:  0.014702077116104835
CV:  0.9200367745036747
R2_score (train):  0.947381503800563
R2_score (test):  0.9339276178726311
RMSE:  0.04382119238509772
MAE:  0.02599614881273323
MAE:  0.014406079153813145
CV:  0.9221939897593409
R2_score (train):  0.9511881620038639
R2_score (test):  0.9202099365097568
RMSE:  0.047759350039072584
MAE:  0.02646326813837818
MAE:  0.014754049536858174
CV:  0.9231434790795771
R2_score (train):  0.9540666551654843
R2_score (test):  0.9230215408806595
RMSE:  0.047383254588473506
MAE:  0.02719523720043386
MAE:  0.015437419681643083
CV:  0.9202996042816756
R2_score (train):  0.9583304159834737
R2_score (test):  0.9235196157125362
RMS

In [None]:
# Fitting the Random Forest Regression to the dataset

for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)
    from sklearn.ensemble import RandomForestRegressor
    regressor_rf = RandomForestRegressor(n_estimators = 200, random_state = 0)
    start = timeit.default_timer() ##
    regressor_rf.fit(X_train, y_train.ravel())
    stop = timeit.default_timer() ##
    time_rf=(stop - start)*1000 ##

    from sklearn.metrics import r2_score

    # Predicting Cross Validation Score
    cv_rf = cross_val_score(estimator = regressor_rf, X = X_train, y = y_train.ravel(), cv = 10)

    # Predicting R2 Score the Train set results
    y_pred_rf_train = regressor_rf.predict(X_train)
    r2_score_rf_train = r2_score(y_train, y_pred_rf_train)

    # Predicting R2 Score the Test set results
    y_pred_rf_test = regressor_rf.predict(X_test)
    r2_score_rf_test = r2_score(y_test, y_pred_rf_test)

    # Predicting RMSE the Test set results
    rmse_rf = (np.sqrt(mean_squared_error(y_test, y_pred_rf_test)))
    print('CV: ', cv_rf.mean())
    print('R2_score (train): ', r2_score_rf_train)
    print('R2_score (test): ', r2_score_rf_test)
    print("RMSE: ", rmse_rf)

    mae_rf = mean_absolute_error(y_test, y_pred_rf_test)
    med_ae_rf = median_absolute_error(y_test, y_pred_rf_test)
    print("MAE: ", mae_rf)
    print("MAE: ", med_ae_rf)
    one_results=pd.DataFrame([["Random Forest", test_size  , rmse_rf, r2_score_rf_train,
                               r2_score_rf_test, cv_rf.mean(),mae_rf, med_ae_rf, time_rf]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)', 'R2_Score(test)', 'Cross-Validation', 'Mean Absolute Error (Normalized)', 'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])


In [None]:
for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)

    
    
    start = timeit.default_timer() ##
    regressor_linear.fit(X_train, y_train)
    stop = timeit.default_timer() ##
    time_linear=(stop - start)*1000 ##
    
    # Predicting R2 Score the Train set results
    y_pred_linear_train = regressor_linear.predict(X_train)
    r2_score_linear_train = r2_score(y_train, y_pred_linear_train)

    # Predicting R2 Score the Test set results
    y_pred_linear_test = regressor_linear.predict(X_test)
    r2_score_linear_test = r2_score(y_test, y_pred_linear_test)

    # Predicting RMSE the Test set results
    rmse_linear =  (np.sqrt(mean_squared_error(y_test,y_pred_linear_test)))
    r2_score_linear_train = r2_score(y_train, y_pred_linear_train)
    r2_score_linear_test = r2_score(y_test, y_pred_linear_test)

    cv_linear = cross_val_score(estimator = regressor_linear, X = X_train, y = y_train.ravel(), cv = 10).mean()
    mae_linear = mean_absolute_error(y_test, y_pred_linear_test)
    med_ae_linear = median_absolute_error(y_test, y_pred_linear_test) 
    
    one_results=pd.DataFrame([["Linear Model", test_size  , rmse_linear, r2_score_linear_train, 
                               r2_score_linear_test, cv_linear.mean(),mae_linear, med_ae_linear, 
                               time_linear]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)',
                                      'R2_Score(test)', 'Cross-Validation', 
                                      'Mean Absolute Error (Normalized)', 
                                      'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])   
    
    print(one_results)

In [None]:
for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)

    regressor_svr = SVR(kernel = 'rbf',degree=3,C=1.5, epsilon=0.1, gamma=1e-7)
    start = timeit.default_timer() ##
    regressor_svr.fit(X_train, y_train.ravel())
    stop = timeit.default_timer() ##
    time_svr=(stop - start)*1000 ##    
        # Predicting Cross Validation Score
    

    # Predicting R2 Score the Train set results
    y_pred_svr_train = (regressor_svr.predict((X_train)))
    r2_score_svr_train = r2_score(y_train, y_pred_svr_train)

    # Predicting R2 Score the Test set results
    y_pred_svr_test = (regressor_svr.predict((X_test)))
    r2_score_svr_test = r2_score(y_test, y_pred_svr_test)


    # Predicting RMSE the Test set results
    rmse_svr =  (np.sqrt(mean_squared_error(y_test,y_pred_svr_test)))
    r2_score_svr_train = r2_score(y_train, y_pred_svr_train)
    r2_score_svr_test = r2_score(y_test, y_pred_svr_test)

    cv_svr = cross_val_score(estimator = regressor_svr, X = X_train, y = y_train.ravel(), cv = 10).mean()
    mae_svr = mean_absolute_error(y_test, y_pred_svr_test)
    med_ae_svr = median_absolute_error(y_test, y_pred_svr_test)

    
    one_results=pd.DataFrame([["SVR Model", test_size  , rmse_svr, r2_score_svr_train, 
                               r2_score_svr_test, cv_svr.mean(),mae_svr, med_ae_svr, time_svr]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)',
                                      'R2_Score(test)', 'Cross-Validation', 
                                      'Mean Absolute Error (Normalized)', 
                                      'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])   
    
    print(one_results)

       Model  Test Data Ratio      RMSE  R2_Score(training)  R2_Score(test)  \
0  SVR Model               10  0.079011            0.803798        0.756026   

   Cross-Validation  Mean Absolute Error (Normalized)  \
0          0.771544                          0.061979   

   Median Absolute Error (Normalized)  Time Taken (ms)  
0                            0.054895        509.37891  
       Model  Test Data Ratio      RMSE  R2_Score(training)  R2_Score(test)  \
0  SVR Model               20  0.080167            0.804955        0.767359   

   Cross-Validation  Mean Absolute Error (Normalized)  \
0          0.769713                          0.062981   

   Median Absolute Error (Normalized)  Time Taken (ms)  
0                            0.056756       389.278114  
       Model  Test Data Ratio      RMSE  R2_Score(training)  R2_Score(test)  \
0  SVR Model               30  0.081224            0.800201        0.773003   

   Cross-Validation  Mean Absolute Error (Normalized)  \
0       

In [None]:
for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)
     
    steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=3.8, fit_intercept=True))
    ]
    ridge_pipe = Pipeline(steps)
    start = timeit.default_timer() ##
    ridge_pipe.fit(X_train, y_train)
    stop = timeit.default_timer() ##
    time_ridge=(stop - start)*1000 ##

    
    # Predicting R2 Score the Test set results
    y_pred_ridge_train = ridge_pipe.predict(X_train)
    r2_score_ridge_train = r2_score(y_train, y_pred_ridge_train)

    # Predicting R2 Score the Test set results
    y_pred_ridge_test = ridge_pipe.predict(X_test)
    r2_score_ridge_test = r2_score(y_test, y_pred_ridge_test)

    # Predicting RMSE the Test set results
    rmse_ridge =  (np.sqrt(mean_squared_error(y_test,y_pred_ridge_test)))
    r2_score_ridge_train = r2_score(y_train, y_pred_ridge_train)
    r2_score_ridge_test = r2_score(y_test, y_pred_ridge_test)

    cv_ridge = cross_val_score(estimator = ridge_pipe, X = X_train, y = y_train.ravel(), cv = 10).mean()
    mae_ridge = mean_absolute_error(y_test, y_pred_ridge_test)
    med_ae_ridge = median_absolute_error(y_test, y_pred_ridge_test)    
    
    one_results=pd.DataFrame([["Ridge Model", test_size  , rmse_ridge, r2_score_ridge_train, 
                               r2_score_ridge_test, cv_ridge.mean(),mae_ridge, med_ae_ridge, time_ridge]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)',
                                      'R2_Score(test)', 'Cross-Validation', 
                                      'Mean Absolute Error (Normalized)', 
                                      'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])   
    
    print(one_results)

         Model  Test Data Ratio      RMSE  R2_Score(training)  R2_Score(test)  \
0  Ridge Model               10  0.079236            0.905914        0.754637   

   Cross-Validation  Mean Absolute Error (Normalized)  \
0          0.893072                          0.035011   

   Median Absolute Error (Normalized)  Time Taken (ms)  
0                            0.021395        46.663812  
         Model  Test Data Ratio      RMSE  R2_Score(training)  R2_Score(test)  \
0  Ridge Model               20  0.067185             0.90508        0.836601   

   Cross-Validation  Mean Absolute Error (Normalized)  \
0          0.888929                          0.034371   

   Median Absolute Error (Normalized)  Time Taken (ms)  
0                            0.021889        33.470561  
         Model  Test Data Ratio     RMSE  R2_Score(training)  R2_Score(test)  \
0  Ridge Model               30  0.06251            0.901439        0.865554   

   Cross-Validation  Mean Absolute Error (Normalized)  

In [None]:
for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)

    steps = [
        ('scalar', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('model', Lasso(alpha=0.012, fit_intercept=True, max_iter=3000))
    ]

    lasso_pipe = Pipeline(steps)
    start = timeit.default_timer() ##
    lasso_pipe.fit(X_train, y_train)
    stop = timeit.default_timer() ##
    time_lasso=(stop - start)*1000 ##
    
    

    # Predicting R2 Score the Test set results
    y_pred_lasso_train = lasso_pipe.predict(X_train)
    r2_score_lasso_train = r2_score(y_train, y_pred_lasso_train)

    # Predicting R2 Score the Test set results
    y_pred_lasso_test = lasso_pipe.predict(X_test)
    r2_score_lasso_test = r2_score(y_test, y_pred_lasso_test)

    # Predicting RMSE the Test set results
    rmse_lasso =  (np.sqrt(mean_squared_error(y_test,y_pred_lasso_test)))
    r2_score_lasso_train = r2_score(y_train, y_pred_lasso_train)
    r2_score_lasso_test = r2_score(y_test, y_pred_lasso_test)
    

    # Predicting Cross Validation Score

    cv_lasso = cross_val_score(estimator = lasso_pipe, X = X_train, y = y_train.ravel(), cv = 10).mean()
    mae_lasso = mean_absolute_error(y_test, y_pred_lasso_test)
    med_ae_lasso = median_absolute_error(y_test, y_pred_lasso_test)
    

    
    one_results=pd.DataFrame([["lasso Model", test_size  , rmse_lasso, r2_score_lasso_train, 
                               r2_score_lasso_test, cv_lasso.mean(),mae_lasso, med_ae_lasso, time_lasso]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)',
                                      'R2_Score(test)', 'Cross-Validation', 
                                      'Mean Absolute Error (Normalized)', 
                                      'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])    
    
    print(one_results)

         Model  Test Data Ratio      RMSE  R2_Score(training)  R2_Score(test)  \
0  lasso Model               10  0.082981            0.737124        0.730895   

   Cross-Validation  Mean Absolute Error (Normalized)  \
0          0.730314                          0.053927   

   Median Absolute Error (Normalized)  Time Taken (ms)  
0                            0.037008       111.329247  
         Model  Test Data Ratio      RMSE  R2_Score(training)  R2_Score(test)  \
0  lasso Model               20  0.083197            0.735487         0.74944   

   Cross-Validation  Mean Absolute Error (Normalized)  \
0          0.727675                          0.054179   

   Median Absolute Error (Normalized)  Time Taken (ms)  
0                            0.036892       106.266442  
         Model  Test Data Ratio      RMSE  R2_Score(training)  R2_Score(test)  \
0  lasso Model               30  0.085271            0.732446        0.749821   

   Cross-Validation  Mean Absolute Error (Normalized)

In [None]:
# Fitting the Neural Network to the dataset

for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    sc_X.fit(X_train)
    #sc_y.fit(y_train)
    X_scaled_train = sc_X.transform(X_train)
    X_scaled_test = sc_X.transform(X_test)
    #y_scaled = sc_y.transform(y_train).reshape(-1,1)


    # Fitting the Random Forest Regression to the dataset

    from sklearn.neural_network import MLPRegressor
    regressor_nn =MLPRegressor(alpha=0.0005, random_state=1, max_iter=2000, verbose=False, hidden_layer_sizes=(100,50,50), n_iter_no_change=20)
    start = timeit.default_timer() ##
    regressor_nn.fit(X_scaled_train, y_train.ravel())
    stop = timeit.default_timer() ##
    time_nn=(stop - start)*1000 ##

    from sklearn.metrics import r2_score


    # Predicting Cross Validation Score
    cv_nn = cross_val_score(estimator = regressor_nn, X = X_scaled_train, y = y_train.ravel(), cv = 10)

    # Predicting R2 Score the Train set results
    y_pred_nn_train = regressor_nn.predict(X_scaled_train)
    r2_score_nn_train = r2_score(y_train, y_pred_nn_train)

    # Predicting R2 Score the Test set results
    y_pred_nn_test = regressor_nn.predict(X_scaled_test)
    r2_score_nn_test = r2_score(y_test, y_pred_nn_test)

    # Predicting RMSE the Test set results
    rmse_nn = (np.sqrt(mean_squared_error(y_test, y_pred_nn_test)))
    print('CV: ', cv_nn.mean())
    print('R2_score (train): ', r2_score_nn_train)
    print('R2_score (test): ', r2_score_nn_test)
    print("RMSE: ", rmse_nn)

    mae_nn = mean_absolute_error(y_test, y_pred_nn_test)
    med_ae_nn = median_absolute_error(y_test, y_pred_nn_test)
    print("MAE: ", mae_nn)
    print("MAE: ", med_ae_nn)
    one_results=pd.DataFrame([["Artificial Neural Network", test_size  , rmse_nn, r2_score_nn_train, 
                               r2_score_nn_test, cv_nn.mean(),mae_nn, med_ae_nn, time_nn]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)', 'R2_Score(test)', 'Cross-Validation', 'Mean Absolute Error (Normalized)', 'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])

CV:  0.9325237329686106
R2_score (train):  0.953414652458275
R2_score (test):  0.954106636573459
RMSE:  0.034268198620835626
MAE:  0.022302880392100446
MAE:  0.01417830997792112
CV:  0.9291980316089645
R2_score (train):  0.9534780235463999
R2_score (test):  0.9511631966018566
RMSE:  0.036730220846391284
MAE:  0.02298284960117645
MAE:  0.014421902922843614
CV:  0.9241053491228357
R2_score (train):  0.9499444680370437
R2_score (test):  0.9487928965576988
RMSE:  0.03857796488413134
MAE:  0.024417641611474156
MAE:  0.015015980958409766
CV:  0.9224320684086889
R2_score (train):  0.953553708992961
R2_score (test):  0.9218569361127581
RMSE:  0.04726386358544949
MAE:  0.026723333186666115
MAE:  0.016169874263773733
CV:  0.9221154312416244
R2_score (train):  0.9579415685756624
R2_score (test):  0.9156998022471823
RMSE:  0.04958548637298742
MAE:  0.027839364309140687
MAE:  0.015928983764680604
CV:  0.9165462156873266
R2_score (train):  0.9456099984172435
R2_score (test):  0.9045195774381283
RMSE

In [None]:
# Fitting the Decision Tree to the dataset

for test_size in [10,20,30,40,50,60,70]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size/100.0, random_state = 25)
    from sklearn.tree import DecisionTreeRegressor
    regressor_dt = DecisionTreeRegressor(random_state = 0)
    start = timeit.default_timer() ##
    regressor_dt.fit(X_train, y_train)
    stop = timeit.default_timer() ##
    time_dt=(stop - start)*1000 ##

    from sklearn.metrics import r2_score

   
    # Predicting Cross Validation Score
    cv_dt = cross_val_score(estimator = regressor_dt, X = X_train, y = y_train, cv = 10)

    # Predicting R2 Score the Train set results
    y_pred_dt_train = regressor_dt.predict(X_train)
    r2_score_dt_train = r2_score(y_train, y_pred_dt_train)

    # Predicting R2 Score the Test set results
    y_pred_dt_test = regressor_dt.predict(X_test)
    r2_score_dt_test = r2_score(y_test, y_pred_dt_test)

    # Predicting RMSE the Test set results
    rmse_dt =  (np.sqrt(mean_squared_error(y_test,y_pred_dt_test)))

    print("CV: ", cv_dt.mean())
    print('R2_score (train): ', r2_score_dt_train)
    print('R2_score (test): ', r2_score_dt_test)
    print("RMSE: ", rmse_dt)

    mae_dt = mean_absolute_error(y_test,y_pred_dt_test)
    med_ae_dt = median_absolute_error(y_test,y_pred_dt_test)
    print("MAE: ", mae_dt)
    print("MAE: ", med_ae_dt)
    one_results=pd.DataFrame([["Decision Tree", test_size  , rmse_dt, r2_score_dt_train,
                               r2_score_dt_test, cv_dt.mean(),mae_dt, med_ae_dt, time_dt]], 
                             columns=['Model',"Test Data Ratio" ,'RMSE', 'R2_Score(training)', 'R2_Score(test)', 'Cross-Validation', 'Mean Absolute Error (Normalized)', 'Median Absolute Error (Normalized)', 'Time Taken (ms)'])
    all_results=pd.concat([all_results, one_results])

CV:  0.8836253479788964
R2_score (train):  0.9999999998123285
R2_score (test):  0.9137369401456136
RMSE:  0.046981703666816754
MAE:  0.022989648699033385
MAE:  0.008795268417770702
CV:  0.8869107975029898
R2_score (train):  0.9999999997881653
R2_score (test):  0.9087931792003162
RMSE:  0.050195361201445764
MAE:  0.024463597746394592
MAE:  0.009214146122868222
CV:  0.8662092333794241
R2_score (train):  1.0
R2_score (test):  0.8946790207128216
RMSE:  0.05532630320897124
MAE:  0.027892060358641863
MAE:  0.010945501850256192
CV:  0.8716486002612033
R2_score (train):  1.0
R2_score (test):  0.8548825938123868
RMSE:  0.06440858469187295
MAE:  0.029558832271259786
MAE:  0.010651368456456037
CV:  0.8702535917705003
R2_score (train):  1.0
R2_score (test):  0.8775336377312215
RMSE:  0.059765243004754005
MAE:  0.028351262094201805
MAE:  0.010672218255940752
CV:  0.8614887318956812
R2_score (train):  1.0
R2_score (test):  0.876205842839829
RMSE:  0.05956002590461406
MAE:  0.028819797899697377
MAE: 

In [None]:
all_results

Unnamed: 0,Model,Test Data Ratio,RMSE,R2_Score(training),R2_Score(test),Cross-Validation,Mean Absolute Error (Normalized),Median Absolute Error (Normalized),Time Taken (ms)
0,Random Forest,10,0.032073,0.991218,0.959799,0.934191,0.016435,0.005865,6651.262945
0,Random Forest,20,0.03499,0.990614,0.955682,0.93134,0.017741,0.006377,5949.061769
0,Random Forest,30,0.039468,0.990337,0.946403,0.927379,0.019078,0.00633,5238.823618
0,Random Forest,40,0.045675,0.991266,0.927024,0.930086,0.020101,0.006879,4514.427254
0,Random Forest,50,0.045521,0.991454,0.928952,0.932722,0.020853,0.007346,3780.784292
0,Random Forest,60,0.046757,0.991482,0.923707,0.928703,0.021985,0.008124,3077.887392
0,Random Forest,70,0.048867,0.990549,0.916264,0.926206,0.023765,0.0087,2357.692716
0,Linear Model,10,0.085338,0.730929,0.715388,0.725844,0.055631,0.035728,4.550769
0,Linear Model,20,0.085136,0.727406,0.737625,0.720915,0.055371,0.036617,3.277947
0,Linear Model,30,0.086928,0.724495,0.740003,0.716301,0.055415,0.035723,3.110028


In [None]:
##Visualising
all_results=all_results.reset_index()
all_results.style.format('{:.6f}')

all_results.to_csv(drivePath+"/RANDOM_diff_ratio_train_test.csv", sep=',')

