In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MaxAbsScaler
import seaborn as sns
sns.set()
 
seed_val = 2021
np.random.seed(seed_val)
 
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [None]:
def _plot_index_sorted(axis, dataset, label_name, title):
  """Scatter the `label_name` column of `dataset`, ordered by index, on `axis`."""
  by_index = dataset.sort_index()
  axis.scatter(x=np.arange(len(by_index)), y=by_index[label_name], marker='.', alpha=0.4)
  axis.set_title(title)


def _plot_label_sorted(axis, dataset, label_name, title):
  """Scatter the `label_name` column of `dataset` in ascending order on `axis`.

  Two dashed red guide lines are added: a vertical one at the middle sample
  position and a horizontal one at the label value of the middle sample.
  """
  labels = dataset.sort_values(label_name)[label_name]
  count = len(labels)
  axis.scatter(x=np.arange(count), y=labels, marker='.', alpha=0.4)
  if count > 0:
    # Scalar label of the middle row. Clamping the position keeps a one-row
    # split from producing an empty selection.
    middle_label = labels.iloc[max(count // 2 - 1, 0)]
    # vertical line
    axis.plot([count / 2, count / 2], [0, labels.max()], '--', alpha=0.4, color="red")
    # horizontal line
    axis.plot([0, count], [middle_label, middle_label], '--', alpha=0.4, color="red")
  axis.set_title(title)


def check_representation(train_dataset, validation_dataset, test_dataset, label_name):
  """Plot the label distribution of the train / validation / test splits.

  A 2x3 grid is drawn: the top row shows each split's labels in index order,
  the bottom row shows them sorted by label value together with guide lines
  through the middle sample, so the three splits can be compared visually.

  Parameters
  ----------
  train_dataset, validation_dataset, test_dataset : pandas.DataFrame
      The three splits; each must contain the column `label_name`.
  label_name : str
      Name of the label column to plot.

  Returns
  -------
  None
      The figure is only drawn; nothing is returned.
  """
  # A single subplots() call creates the figure; a preceding plt.figure()
  # would leave an extra, empty figure behind.
  fig, ax = plt.subplots(ncols=3, nrows=2, figsize=(15,12))

  # (split, title used in the top row, title used in the bottom row)
  panels = [
    (train_dataset, "Index Sorted Training Data", "Label Sorted Training Data"),
    (validation_dataset, "Index Sorted Validation Data", "Label Sorted Validation Data"),
    (test_dataset, "Index Sorted Testing Data", "Label Sorted Test Data"),
  ]
  for column, (dataset, index_title, label_title) in enumerate(panels):
    _plot_index_sorted(ax[0][column], dataset, label_name, index_title)
    _plot_label_sorted(ax[1][column], dataset, label_name, label_title)

  # Only the left-most panel of each row carries the y-axis label.
  ax[0][0].set_ylabel(label_name)
  ax[1][0].set_ylabel(label_name)

In [None]:
# Read the three pre-split CSV files; all of them are loaded before any is modified.
train_dataset, validation_dataset, test_dataset = [
  pd.read_csv(csv_path) for csv_path in ("Train.csv", "Vali.csv", "Test.csv")
]

# Remove the "Unnamed: 0" column from every split, in place.
for split_frame in (train_dataset, validation_dataset, test_dataset):
  del split_frame["Unnamed: 0"]

# Column holding the regression target.
label_name = 'Nc'
# Number of leading columns that are later sliced off as model inputs.
in_features = 5
# Not referenced anywhere in the visible cells.
out_nodes = 1

# Visual sanity check of how the labels are spread across the three splits.
check_representation(train_dataset, validation_dataset, test_dataset, label_name)

In [None]:
def _plot_results(predicted, Y, abs_error, pct_error, avg_pe, label_name):
  """Build the three-panel diagnostic figure (predictions, absolute error, percentage error)."""
  fig2, ax = plt.subplots(nrows=3, figsize=(15,16))

  ax[0].scatter(np.arange(len(predicted)), predicted, label="Prediction", alpha=0.4, marker='.')
  ax[0].scatter(np.arange(len(Y)), Y, label="TestData", marker='.')
  ax[0].legend()
  ax[0].set_title("Model Prediction")
  ax[0].set_ylabel(label_name)

  # The scatter carries a label so that legend() has something to show.
  ax[1].scatter(np.arange(len(abs_error)), abs_error, marker='.',
                label="Average = {}".format(np.mean(abs_error)))
  ax[1].legend()
  ax[1].set_ylabel("MAE")
  ax[1].set_title("Mean Absolute Error")

  ax[2].scatter(np.arange(len(pct_error)), pct_error, marker='.',
                label="Max PE={x} \n Avg PE={y}".format(x=pct_error.max(), y=avg_pe))
  ax[2].legend()
  ax[2].set_ylabel("PE")
  ax[2].set_xlabel("Sample")
  ax[2].set_title("Relative Percentage Error")
  return fig2


def get_results(model, X, Y, name, saveit=False, save_img=False, label_name="Nc", show_plots=False):
  """Evaluate `model` on (X, Y) and collect per-sample errors.

  Parameters
  ----------
  model : object
      Fitted estimator exposing `predict(X)`.
  X : array-like with at least 5 columns
      Inputs; columns 0..4 are stored as b1, a2, b2, RatioTotalArea, frac.
  Y : array-like
      Reference labels, one per row of X.
  name : str
      Base file name used for "<name>.csv", "<name>.sav" and "<name>.png".
  saveit : bool
      When True, write the history as CSV and pickle the model.
  save_img : bool
      When True, save the diagnostic figure as PNG. The figure is built for
      this even if `show_plots` is False (it is then closed after saving).
  label_name : str
      y-axis label of the prediction panel.
  show_plots : bool
      When True, the diagnostic figure is left open so it gets displayed.

  Returns
  -------
  pandas.DataFrame
      One row per sample with the five inputs, "Labels", "Predictions",
      "MAE" (absolute error) and "PE" (absolute error as a percentage of
      the label; a zero label gives an infinite/NaN entry).
  """
  # Plain arrays: positional column access works and no index alignment
  # happens when the values are placed into the history frame.
  features = np.asarray(X)
  labels = np.asarray(Y)

  print("Number of Samples:", len(X))
  predicted = model.predict(X)
  print("Number of Pred Samples:", len(predicted[~np.isnan(predicted)]))

  _MAE = abs(labels - predicted)
  _PE = (_MAE / labels) * 100
  avg_pe = np.mean(_PE)

  history = pd.DataFrame({
    "b1": features[:,0],
    "a2": features[:,1],
    "b2": features[:,2],
    "RatioTotalArea": features[:,3],
    "frac": features[:,4],
    "Labels": labels,
    "Predictions": predicted,
    "MAE": _MAE,
    "PE": _PE,
  })

  if saveit:
    history.to_csv("{}.csv".format(name))
    print("CSV Saved.")

    # Context manager so the file handle is flushed and closed.
    with open("{}.sav".format(name), 'wb') as model_file:
      pickle.dump(model, model_file)
    print("Model Saved.")

  if show_plots or save_img:
    fig2 = _plot_results(predicted, labels, _MAE, _PE, avg_pe, label_name)
    if save_img:
      fig2.savefig("{}.png".format(name))
    if not show_plots:
      # The figure was only needed for the PNG; do not leave it open.
      plt.close(fig2)

  return history

In [None]:
# Order every split by ascending label value; the source frames are left untouched.
sorted_train, sorted_vali, sorted_test = (
  split_frame.sort_values(label_name)
  for split_frame in (train_dataset, validation_dataset, test_dataset)
)

In [None]:
# Generate the inputs and labels
#
# Each split is converted to a NumPy array once. The first `in_features`
# columns are taken as model inputs and the column right after them as the
# label (presumably the `label_name` column -- verify against the CSV layout).
# All splits are used in label-sorted order; to shuffle one instead, replace
# its `sorted_*` frame with `sorted_*.sample(frac=1, random_state=seed_val)`.

# TRAIN DATA
train = sorted_train
train_array = train.to_numpy()
train_features = train_array[:,0:in_features]
train_labels = train_array[:,in_features]
X = train_features.copy()
Y = train_labels.copy()

# TRAINING VALIDATION DATA
vali = sorted_vali
vali_array = vali.to_numpy()
vali_features = vali_array[:,0:in_features]
vali_labels = vali_array[:,in_features]
X_vali = vali_features.copy()
Y_vali = vali_labels.copy()

# TEST DATA
# `sorted_test` is already ordered by label, so it is not sorted again here.
test = sorted_test
test_array = test.to_numpy()
test_features = test_array[:,0:in_features]
test_labels = test_array[:,in_features]
X_test = test_features.copy()
Y_test = test_labels.copy()

# (features, labels) pairs for test, train and validation, in that order.
# Not referenced by the visible cells.
eval_set = [(X_test, Y_test), (X,Y), (X_vali, Y_vali)]

In [None]:
# Preview the first rows of the validation split.
validation_dataset.head()

## Use Best Model
(Grid Search)

In [None]:
# this defines 1 grid to explore
# (a single point: the search only cross-validates and refits one SVR setup)
param_grid1 = [{'tol':[1e-13]}]

# basic estimator
base_estimator = SVR(kernel='rbf',
                     gamma='auto',
                     tol=1e-13,
                     epsilon=0.01,
                     C = 0.075,
                     cache_size=2000)

# X / Y are ordered by label. An integer `cv` builds contiguous, unshuffled
# folds, so every validation fold would cover a label range that is missing
# from its training folds. Shuffled folds give representative CV scores; the
# refit best estimator is unaffected because there is only one candidate.
cv_folds = KFold(n_splits=10, shuffle=True, random_state=seed_val)

# Perform Grid Search
para_grid_search = GridSearchCV(base_estimator,
                                param_grid1,
                                n_jobs=-1,
                                cv=cv_folds,
                                verbose=2,
                                return_train_score=True)
para_grid_search = para_grid_search.fit(X, Y)

In [None]:
# Parameter combination selected by the grid search.
para_grid_search.best_params_

In [None]:
# Keep the estimator selected by the grid search; the cells below evaluate it.
model = para_grid_search.best_estimator_
# Display the estimator and its hyper-parameters.
model

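## Get Results
Run the test-set cell below (the one that calls `get_results` with `saveit=True`) to save the model as a pickle file. Not every scikit-learn estimator provides a `.save_model` method, so the `save_model` call in the last cell may not be available for this model.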

In [None]:
# Base name for the files written by the cells below.
_NAME = "SVMModel"

In [None]:
#_______________________________________________________________________________________________________________________
# Training split: evaluate the model, show the diagnostic plots and save them as a PNG.
# Nothing else is written to disk because saveit=False.
_NAME2 = "{}Train".format(_NAME)

train_history = get_results(
  model,
  X.copy(),
  Y.copy(),
  name=_NAME2,
  show_plots=True,
  save_img=True,
  saveit=False,
)

In [None]:
#_______________________________________________________________________________________________________________________
# Validation split: evaluate the model, show the diagnostic plots and save them as a PNG.
# The model itself is not saved here (saveit=False).
_NAME2 = "{}Vali".format(_NAME)

vali_history = get_results(
  model,
  X_vali.copy(),
  Y_vali.copy(),
  name=_NAME2,
  show_plots=True,
  save_img=True,
  saveit=False,
)

In [None]:
#_______________________________________________________________________________________________________________________
# Test split: evaluate the model, show and save the plots, and (saveit=True)
# write the per-sample results as CSV plus the pickled model.
_NAME2 = "{}Test".format(_NAME)

test_history = get_results(
  model,
  test_features.copy(),
  test_labels.copy(),
  name=_NAME2,
  show_plots=True,
  save_img=True,
  saveit=True,
)

In [None]:
# Persist the model under the base name.
# `save_model` is used when the estimator offers it; estimators without that
# method (scikit-learn's SVR has none) are pickled to "<_NAME>.sav" instead of
# raising AttributeError.
if hasattr(model, "save_model"):
  model.save_model("{}".format(_NAME))
else:
  with open("{}.sav".format(_NAME), "wb") as model_file:
    pickle.dump(model, model_file)