In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MaxAbsScaler
import seaborn as sns
sns.set()

seed_val = 2021
np.random.seed(seed_val)
 
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
 

In [None]:
def check_representation(train_dataset, validation_dataset, test_dataset, label_name):
  """Plot the label column of the train, validation and test splits side by side.

  Draws a 2x3 grid of scatter plots, one column of panels per split. The top
  row shows ``label_name`` with the rows ordered by DataFrame index. The bottom
  row shows the same values ordered by label, with two dashed red guide lines:
  a vertical one at half the number of samples and a horizontal one at the
  label value of the sample just below that midpoint.

  Args:
    train_dataset: pandas DataFrame with the training split.
    validation_dataset: pandas DataFrame with the validation split.
    test_dataset: pandas DataFrame with the test split.
    label_name: name of the column to plot.
  """
  panels = (
      (train_dataset, "Index Sorted Training Data", "Label Sorted Training Data"),
      (validation_dataset, "Index Sorted Validation Data", "Label Sorted Validation Data"),
      (test_dataset, "Index Sorted Testing Data", "Label Sorted Test Data"),
  )

  # plt.subplots creates the figure itself; an additional plt.figure() call
  # would only leave an empty figure behind.
  fig, ax = plt.subplots(ncols=3, nrows=2, figsize=(15,12))

  for col, (dataset, index_title, label_title) in enumerate(panels):
    # sort the data to check representation
    by_index = dataset.sort_index()
    positions = np.arange(len(by_index))

    ax[0][col].scatter(x=positions, y=by_index[label_name], marker='.', alpha=0.4)
    ax[0][col].set_title(index_title)

    by_label = by_index.sort_values(label_name)
    ax[1][col].scatter(x=positions, y=by_label[label_name], marker='.', alpha=0.4)

    # The guide lines need at least one sample to read a label value from.
    if len(by_label) > 0:
      middle = len(by_label) / 2
      # Clamp the position so a single-row split still yields a value.
      y_val = by_label[label_name].iloc[max(int(middle) - 1, 0)]
      # vertical line
      ax[1][col].plot([middle, middle], [0, by_label[label_name].max()], '--', alpha=0.4, color="red")
      # horizontal line
      ax[1][col].plot([0, len(by_label)], [y_val, y_val], '--', alpha=0.4, color="red")
    ax[1][col].set_title(label_title)

  ax[0][0].set_ylabel(label_name)
  ax[1][0].set_ylabel(label_name)

In [None]:
# Read both CSV files; the raw frames are kept as loaded.
uploaded2 = pd.read_csv("alldata.csv")
uploaded = pd.read_csv("AdditionalStuff.csv")

# Working copies without the "Unnamed: 0" column.
dataset = uploaded.drop(columns="Unnamed: 0")
dataset2 = uploaded2.drop(columns="Unnamed: 0")

print("# new samples:", len(dataset) - len(dataset2))

In [None]:
# Range of RatioTotalArea over the samples whose Nc is at least 91.3.
x = dataset.loc[dataset["Nc"] >= 91.3]
print("Max RTA: ", x["RatioTotalArea"].max())
print("Min RTA: ", x["RatioTotalArea"].min())

In [None]:
# Count the samples with Nc >= 50 in each of the two data sets.
print("New # samples with Nc>=50: ", int((dataset["Nc"] >= 50).sum()))
print("Old # samples with Nc>=50: ", int((dataset2["Nc"] >= 50).sum()))

In [None]:
# Load the three pre-split CSV files and discard their "Unnamed: 0" column.
train_dataset = pd.read_csv("Train.csv").drop(columns="Unnamed: 0")
validation_dataset = pd.read_csv("Vali.csv").drop(columns="Unnamed: 0")
test_dataset = pd.read_csv("Test.csv").drop(columns="Unnamed: 0")

label_name = 'Nc'
in_features = 5
out_nodes = 1

# Visual check of how the label is distributed in each split.
check_representation(train_dataset, validation_dataset, test_dataset, label_name)

In [None]:
def SKLearn_get_results(model, X, Y, name, saveit=False, save_img=False, label_name="Nc", show_plots=False):
  """Evaluate a fitted regressor on (X, Y) and collect the per-sample errors.

  Prints the number of samples and the number of non-NaN predictions, computes
  the absolute error and the relative percentage error of every prediction and
  optionally saves the results, the model and a three-panel diagnostic figure.

  Args:
    model: object exposing ``predict(X)``.
    X: array of input features, one row per sample.
    Y: array of target values, one per sample.
    name: file stem used for "<name>.csv", "<name>.sav" and "<name>.png".
    saveit: when True, write the history to CSV and pickle the model.
    save_img: when True, save the diagnostic figure as PNG. This works whether
      or not ``show_plots`` is set.
    label_name: y-axis label of the prediction panel.
    show_plots: when True, build the figure and leave it open for display.

  Returns:
    pandas DataFrame with the columns "b1", "a2", "b2", "RatioTotalArea",
    "frac", "Labels", "Predictions", "MAE" and "PE", one row per sample.
    The five feature columns are filled from the leading columns of X in that
    order (NOTE(review): assumes X uses this column layout — confirm);
    feature columns that X does not provide are filled with NaN.
  """
  print("Number of Samples:", len(X))
  predicted = np.asarray(model.predict(X)).reshape(-1)
  predicted2 = predicted[~np.isnan(predicted)]
  print("Number of Pred Samples:", len(predicted2))
  labels = np.asarray(Y).reshape(-1)
  _MAE = abs(labels-predicted)
  _PE = (_MAE/labels)*100
  avg_pe = np.mean(_PE)

  features = np.asarray(X)
  if features.ndim == 1:
    features = features.reshape(-1, 1)

  # Build the history column by column; dict order fixes the column order.
  columns = {}
  for position, column in enumerate(["b1", "a2", "b2", "RatioTotalArea", "frac"]):
    if position < features.shape[1]:
      columns[column] = features[:, position]
    else:
      columns[column] = np.full(len(labels), np.nan)
  columns["Labels"] = labels
  columns["Predictions"] = predicted
  columns["MAE"] = _MAE
  columns["PE"] = _PE
  history = pd.DataFrame(columns)

  if saveit:
    history.to_csv("{}.csv".format(name))
    print("CSV Saved.")

    filename = "{}.sav".format(name)
    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as model_file:
      pickle.dump(model, model_file)
    print("Model Saved.")

  # The figure is needed for display and for saving, so build it for either flag.
  if show_plots or save_img:
    fig2, ax = plt.subplots(nrows=3, figsize=(15,16))
    ax[0].scatter(np.arange(len(predicted)), predicted, label="Prediction", alpha=0.4, marker='.')
    ax[0].scatter(np.arange(len(labels)), labels, label="TestData", marker='.')
    ax[0].legend()
    ax[0].set_title("Model Prediction")
    ax[0].set_ylabel(label_name)

    # Label the series so that legend() has an entry to show.
    ax[1].scatter(np.arange(len(_MAE)), _MAE, marker='.', label="Average = {}".format(np.mean(_MAE)))
    ax[1].legend()
    ax[1].set_ylabel("MAE")
    ax[1].set_title("Mean Absolute Error")

    ax[2].scatter(np.arange(len(_PE)), _PE, marker='.', label="Max PE={x} \n Avg PE={y}".format(x=_PE.max(), y=avg_pe))
    ax[2].legend()
    ax[2].set_ylabel("PE")
    ax[2].set_xlabel("Sample")
    ax[2].set_title("Relative Percentage Error")

    if save_img:
      fig2.savefig("{}.png".format(name))
    if not show_plots:
      # Saved only: close the figure so it is not displayed.
      plt.close(fig2)

  return history

In [None]:
def TF_get_results(model, X, Y, name, saveit=False, save_img=False, label_name="Nc", show_plots=False):
  """Evaluate a fitted single-output model on (X, Y) and collect the per-sample errors.

  Prints the number of samples and the number of non-NaN predictions, computes
  the absolute error and the relative percentage error of every prediction and
  optionally saves the results, the model and a three-panel diagnostic figure.
  Predictions and labels are flattened to 1-D before they are compared, so the
  result is the same whether ``predict`` returns shape (n, 1) or (n,).

  Args:
    model: object exposing ``predict(X)``.
    X: array of input features, one row per sample.
    Y: array of target values, one per sample.
    name: file stem used for "<name>.csv", "<name>.sav" and "<name>.png".
    saveit: when True, write the history to CSV and pickle the model.
    save_img: when True, save the diagnostic figure as PNG. This works whether
      or not ``show_plots`` is set.
    label_name: y-axis label of the prediction panel.
    show_plots: when True, build the figure and leave it open for display.

  Returns:
    pandas DataFrame with the columns "b1", "a2", "b2", "RatioTotalArea",
    "frac", "Labels", "Predictions", "MAE" and "PE", one row per sample.
    The five feature columns are filled from the leading columns of X in that
    order (NOTE(review): assumes X uses this column layout — confirm);
    feature columns that X does not provide are filled with NaN.
  """
  print("Number of Samples:", len(X))
  # Flatten both sides: subtracting an (n,) array from an (n, 1) array would
  # broadcast to an (n, n) matrix instead of n per-sample errors.
  predicted = np.asarray(model.predict(X)).reshape(-1)
  predicted2 = predicted[~np.isnan(predicted)]
  print("Number of Pred Samples:", len(predicted2))
  Y = np.reshape(Y, -1)
  _MAE = abs(Y-predicted)
  _PE = (_MAE/Y)*100
  avg_pe = np.mean(_PE)

  features = np.asarray(X)
  if features.ndim == 1:
    features = features.reshape(-1, 1)

  # Build the history column by column; dict order fixes the column order.
  columns = {}
  for position, column in enumerate(["b1", "a2", "b2", "RatioTotalArea", "frac"]):
    if position < features.shape[1]:
      columns[column] = features[:, position]
    else:
      columns[column] = np.full(len(Y), np.nan)
  columns["Labels"] = Y
  columns["Predictions"] = predicted
  columns["MAE"] = _MAE
  columns["PE"] = _PE
  history = pd.DataFrame(columns)

  if saveit:
    history.to_csv("{}.csv".format(name))
    print("CSV Saved.")

    filename = "{}.sav".format(name)
    # NOTE(review): pickling may not be supported for every model type passed
    # here (e.g. Keras models are normally stored with model.save) — confirm.
    with open(filename, 'wb') as model_file:
      pickle.dump(model, model_file)
    print("Model Saved.")

  # The figure is needed for display and for saving, so build it for either flag.
  if show_plots or save_img:
    fig2, ax = plt.subplots(nrows=3, figsize=(15,16))
    ax[0].scatter(np.arange(len(predicted)), predicted, label="Prediction", alpha=0.4, marker='.')
    ax[0].scatter(np.arange(len(Y)), Y, label="TestData", marker='.')
    ax[0].legend()
    ax[0].set_title("Model Prediction")
    ax[0].set_ylabel(label_name)

    # Mean of the absolute errors, shown as the legend entry of this panel.
    label = "Average = {}".format(np.mean(_MAE))
    ax[1].scatter(np.arange(len(_MAE)), _MAE, marker='.', label=label)
    ax[1].legend()
    ax[1].set_ylabel("MAE")
    ax[1].set_title("Mean Absolute Error")

    ax[2].scatter(np.arange(len(_PE)), _PE, marker='.', label="Max PE={x} \n Avg PE={y}".format(x=_PE.max(), y=avg_pe))
    ax[2].legend()
    ax[2].set_ylabel("PE")
    ax[2].set_xlabel("Sample")
    ax[2].set_title("Relative Percentage Error")

    if save_img:
      fig2.savefig("{}.png".format(name))
    if not show_plots:
      # Saved only: close the figure so it is not displayed.
      plt.close(fig2)

  return history

In [None]:
# Order every split by ascending label value.
sorted_train, sorted_vali, sorted_test = (
    split.sort_values(label_name)
    for split in (train_dataset, validation_dataset, test_dataset)
)

In [None]:
# Generate the inputs and labels.
# In every split the first `in_features` columns are the model inputs and the
# column right after them is the label.

# TRAIN DATA
train = sorted_train
train_matrix = train.to_numpy()
train_features = train_matrix[:, 0:in_features]
train_labels = train_matrix[:, in_features]
X, Y = train_features.copy(), train_labels.copy()

# TRAINING VALIDATION DATA
vali = sorted_vali
vali_matrix = vali.to_numpy()
vali_features = vali_matrix[:, 0:in_features]
vali_labels = vali_matrix[:, in_features]
X_vali, Y_vali = vali_features.copy(), vali_labels.copy()

# TEST DATA
test = sorted_test
test_matrix = test.sort_values(label_name).to_numpy()
test_features = test_matrix[:, 0:in_features]
test_labels = test_matrix[:, in_features]
X_test, Y_test = test_features.copy(), test_labels.copy()

# (features, labels) pairs in the order test, train, validation.
eval_set = [(X_test, Y_test), (X, Y), (X_vali, Y_vali)]

In [None]:
# Preview the first rows of the validation split.
validation_dataset.head()

## Get Results

In [None]:
from tensorflow import keras

In [None]:
def _load_pickle(path):
  """Return the object unpickled from ``path``; the file is closed afterwards."""
  # NOTE(review): pickle.load can execute arbitrary code while loading, so
  # only use this on .sav files from a trusted source.
  with open(path, 'rb') as model_file:
    return pickle.load(model_file)

XGBmodel = _load_pickle("XGBModelTest.sav")
RFmodel = _load_pickle("RFModelTest.sav")
NNmodel = keras.models.load_model("NNModel4")
KNNmodel = _load_pickle("KNNModelTest.sav")
SVMmodel = _load_pickle("SVMModelTest.sav")

In [None]:
NAME = "XGB"
# One result per split, so the train and validation histories are not
# overwritten by the test history.
XGBtrain_history = SKLearn_get_results(XGBmodel, X.copy(), Y.copy(), name=NAME+"Train", saveit=False, save_img=True, show_plots=True)
XGBvali_history = SKLearn_get_results(XGBmodel, X_vali.copy(), Y_vali.copy(), name=NAME+"Vali", saveit=False, save_img=True, show_plots=True)
XGBtest_history = SKLearn_get_results(XGBmodel, X_test.copy(), Y_test.copy(), name=NAME+"Test", saveit=False, save_img=True, show_plots=True)

In [None]:
NAME = "RF"
# One result per split, so the train and validation histories are not
# overwritten by the test history.
RFtrain_history = SKLearn_get_results(RFmodel, X.copy(), Y.copy(), name=NAME+"Train", saveit=False, save_img=True, show_plots=True)
RFvali_history = SKLearn_get_results(RFmodel, X_vali.copy(), Y_vali.copy(), name=NAME+"Vali", saveit=False, save_img=True, show_plots=True)
RFtest_history = SKLearn_get_results(RFmodel, X_test.copy(), Y_test.copy(), name=NAME+"Test", saveit=False, save_img=True, show_plots=True)

In [None]:
NAME = "NN"
# One result per split, so the train and validation histories are not
# overwritten by the test history.
NNtrain_history = TF_get_results(NNmodel, X.copy(), Y.copy(), name=NAME+"Train", saveit=False, save_img=True, show_plots=True)
NNvali_history = TF_get_results(NNmodel, X_vali.copy(), Y_vali.copy(), name=NAME+"Vali", saveit=False, save_img=True, show_plots=True)
NNtest_history = TF_get_results(NNmodel, X_test.copy(), Y_test.copy(), name=NAME+"Test", saveit=False, save_img=True, show_plots=True)

In [None]:
NAME = "KNN"
# One result per split, so the train and validation histories are not
# overwritten by the test history.
KNNtrain_history = SKLearn_get_results(KNNmodel, X.copy(), Y.copy(), name=NAME+"Train", saveit=False, save_img=True, show_plots=True)
KNNvali_history = SKLearn_get_results(KNNmodel, X_vali.copy(), Y_vali.copy(), name=NAME+"Vali", saveit=False, save_img=True, show_plots=True)
KNNtest_history = SKLearn_get_results(KNNmodel, X_test.copy(), Y_test.copy(), name=NAME+"Test", saveit=False, save_img=True, show_plots=True)

In [None]:
NAME = "SVM"
# One result per split, so the train and validation histories are not
# overwritten by the test history.
SVMtrain_history = SKLearn_get_results(SVMmodel, X.copy(), Y.copy(), name=NAME+"Train", saveit=False, save_img=True, show_plots=True)
SVMvali_history = SKLearn_get_results(SVMmodel, X_vali.copy(), Y_vali.copy(), name=NAME+"Vali", saveit=False, save_img=True, show_plots=True)
SVMtest_history = SKLearn_get_results(SVMmodel, X_test.copy(), Y_test.copy(), name=NAME+"Test", saveit=False, save_img=True, show_plots=True)

# Metrics
The R² scores are probably not very informative for non-linear regression, but they are given here for completeness (they are not presented in the report).

In [None]:
from sklearn.metrics import r2_score  
from sklearn.metrics import mean_squared_error  

In [None]:
# NN predictions for the train, validation and test inputs, in that order.
nntrain_pred, nnvali_pred, nntest_pred = (
    NNmodel.predict(features) for features in (X, X_vali, X_test)
)

In [None]:
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
# For a single target column both forms give the same value.
print("Train r2 score : ", r2_score(Y, nntrain_pred))
print("Vali r2 score  : ", r2_score(Y_vali, nnvali_pred))
print("Test r2 score  : ", r2_score(Y_test, nntest_pred))
print()
print("Train RMSE score : ", np.sqrt(mean_squared_error(Y, nntrain_pred)))
print("Vali RMSE score  : ", np.sqrt(mean_squared_error(Y_vali, nnvali_pred)))
print("Test RMSE score  : ", np.sqrt(mean_squared_error(Y_test, nntest_pred)))

In [None]:
# RF predictions for the train, validation and test inputs, in that order.
rftrain_pred, rfvali_pred, rftest_pred = (
    RFmodel.predict(features) for features in (X, X_vali, X_test)
)

In [None]:
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
# For a single target column both forms give the same value.
print("Train r2 score : ", r2_score(Y, rftrain_pred))
print("Vali r2 score  : ", r2_score(Y_vali, rfvali_pred))
print("Test r2 score  : ", r2_score(Y_test, rftest_pred))
print()
print("Train RMSE score : ", np.sqrt(mean_squared_error(Y, rftrain_pred)))
print("Vali RMSE score  : ", np.sqrt(mean_squared_error(Y_vali, rfvali_pred)))
print("Test RMSE score  : ", np.sqrt(mean_squared_error(Y_test, rftest_pred)))

In [None]:
# XGB predictions for the train, validation and test inputs, in that order.
xgbtrain_pred, xgbvali_pred, xgbtest_pred = (
    XGBmodel.predict(features) for features in (X, X_vali, X_test)
)

In [None]:
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
# For a single target column both forms give the same value.
print("Train r2 score : ", r2_score(Y, xgbtrain_pred))
print("Vali r2 score  : ", r2_score(Y_vali, xgbvali_pred))
print("Test r2 score  : ", r2_score(Y_test, xgbtest_pred))
print()
print("Train RMSE score : ", np.sqrt(mean_squared_error(Y, xgbtrain_pred)))
print("Vali RMSE score  : ", np.sqrt(mean_squared_error(Y_vali, xgbvali_pred)))
print("Test RMSE score  : ", np.sqrt(mean_squared_error(Y_test, xgbtest_pred)))

In [None]:
# KNN predictions for the train, validation and test inputs, in that order.
knntrain_pred, knnvali_pred, knntest_pred = (
    KNNmodel.predict(features) for features in (X, X_vali, X_test)
)

In [None]:
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
# For a single target column both forms give the same value.
print("Train r2 score : ", r2_score(Y, knntrain_pred))
print("Vali r2 score  : ", r2_score(Y_vali, knnvali_pred))
print("Test r2 score  : ", r2_score(Y_test, knntest_pred))
print()
print("Train RMSE score : ", np.sqrt(mean_squared_error(Y, knntrain_pred)))
print("Vali RMSE score  : ", np.sqrt(mean_squared_error(Y_vali, knnvali_pred)))
print("Test RMSE score  : ", np.sqrt(mean_squared_error(Y_test, knntest_pred)))

In [None]:
# SVM predictions for the train, validation and test inputs, in that order.
svmtrain_pred, svmvali_pred, svmtest_pred = (
    SVMmodel.predict(features) for features in (X, X_vali, X_test)
)

In [None]:
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
# For a single target column both forms give the same value.
print("Train r2 score : ", r2_score(Y, svmtrain_pred))
print("Vali r2 score  : ", r2_score(Y_vali, svmvali_pred))
print("Test r2 score  : ", r2_score(Y_test, svmtest_pred))
print()
print("Train RMSE score : ", np.sqrt(mean_squared_error(Y, svmtrain_pred)))
print("Vali RMSE score  : ", np.sqrt(mean_squared_error(Y_vali, svmvali_pred)))
print("Test RMSE score  : ", np.sqrt(mean_squared_error(Y_test, svmtest_pred)))