In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MaxAbsScaler
import seaborn as sns
# Apply seaborn's plot styling to every matplotlib figure drawn below.
sns.set()

# Seed NumPy's global random number generator so any NumPy-based randomness is repeatable.
seed_val = 2021
np.random.seed(seed_val)

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error

In [None]:
def check_representation(train_dataset, validation_dataset, test_dataset, label_name):
  """Plot the label distribution of each data split to compare how well they match.

  Draws a 2x3 grid of scatter plots (one column per split):
    * top row    - label values with the rows ordered by DataFrame index;
    * bottom row - label values ordered by the label itself, with red dashed
      guide lines marking the middle sample (vertical) and that sample's
      label value (horizontal).

  Parameters
  ----------
  train_dataset, validation_dataset, test_dataset : pandas.DataFrame
      The three splits; each must contain the column `label_name`.
  label_name : str
      Name of the label column to plot.

  Returns
  -------
  None
      The figure is created through pyplot and left as the current figure.
  """
  # (title word for the top row, title word for the bottom row, data)
  panels = (
      ("Training", "Training", train_dataset),
      ("Validation", "Validation", validation_dataset),
      ("Testing", "Test", test_dataset),
  )

  # A single subplots() call; a separate plt.figure() would only leave an empty figure behind.
  fig, ax = plt.subplots(ncols=3, nrows=2, figsize=(15,12))

  for col, (index_title, label_title, dataset) in enumerate(panels):
    by_index = dataset.sort_index()
    n_samples = len(by_index)
    positions = np.arange(n_samples)

    # Top row: labels in index order.
    ax[0][col].scatter(x=positions, y=by_index[label_name], marker='.', alpha=0.4)
    ax[0][col].set_title("Index Sorted {} Data".format(index_title))

    # Bottom row: labels in ascending label order.
    by_label = by_index.sort_values(label_name)
    labels = by_label[label_name]
    ax[1][col].scatter(x=positions, y=labels, marker='.', alpha=0.4)

    if n_samples > 0:
      # Label of the sample just below the midpoint, as a scalar; clamped so a
      # one-row split still yields a value instead of an empty slice.
      midpoint_label = labels.iloc[max(n_samples // 2 - 1, 0)]
      # vertical line
      ax[1][col].plot([n_samples/2, n_samples/2], [0, labels.max()], '--', alpha=0.4, color="red")
      # horizontal line
      ax[1][col].plot([0, n_samples], [midpoint_label, midpoint_label], '--', alpha=0.4, color="red")

    ax[1][col].set_title("Label Sorted {} Data".format(label_title))
    ax[1][col].set_xlabel('Sample')

  # Only the left-most panel of each row carries the y-axis label.
  ax[0][0].set_ylabel(label_name)
  ax[1][0].set_ylabel(label_name)

In [None]:
# Load the three pre-split CSV files and discard the "Unnamed: 0" column each one carries
# (presumably the index written out when the files were saved - confirm against the export step).
train_dataset, validation_dataset, test_dataset = (
    pd.read_csv(csv_path).drop(columns="Unnamed: 0")
    for csv_path in ("Train.csv", "Vali.csv", "Test.csv")
)

# Column holding the regression target.
label_name = 'Nc'
# Number of leading columns used as model inputs; the label is read from the column after them.
in_features = 5
out_nodes = 1

# Visual sanity check that the three splits cover a similar label range.
check_representation(train_dataset, validation_dataset, test_dataset, label_name)

In [None]:
def get_results(model, X, Y, name, saveit=False, save_img=False, label_name="Nc"):
  """Evaluate `model` on (X, Y), plot the errors and optionally persist the results.

  Parameters
  ----------
  model : object
      Fitted estimator exposing `predict(X)`.
  X : array-like
      Inputs passed to `model.predict`.
  Y : array-like
      True label values, aligned with the rows of `X`.
  name : str
      File stem for the outputs: "<name>.csv", "<name>.sav" and "<name>.png".
  saveit : bool, default False
      When True, write labels/predictions/errors to "<name>.csv" and pickle
      the model to "<name>.sav".
  save_img : bool, default False
      When True, save the figure to "<name>.png".
  label_name : str, default "Nc"
      Y-axis label of the prediction panel.

  Returns
  -------
  None
  """
  predicted = model.predict(X)
  # Per-sample absolute error and the same error as a percentage of the label.
  # NOTE: a label equal to 0 divides by zero here (inf/nan for that sample).
  _MAE = abs(Y-predicted)
  _PE = (_MAE/Y)*100

  if saveit:
    # Imported here because the notebook's import cell does not bring in pickle.
    import pickle

    history = pd.DataFrame({
        "Labels": Y,
        "Predictions": predicted,
        "MAE": _MAE,
        "PE": _PE,
    })
    history.to_csv("{}.csv".format(name))
    print("CSV Saved.")

    filename = "{}.sav".format(name)
    # Context manager guarantees the file is flushed and closed.
    with open(filename, 'wb') as model_file:
      pickle.dump(model, model_file)
    print("Model Saved.")

  # A single subplots() call; a separate plt.figure() would only leave an empty figure behind.
  fig2, ax = plt.subplots(nrows=3, figsize=(15,16))

  # Panel 1: predictions overlaid on the true labels.
  ax[0].scatter(np.arange(len(predicted)), predicted, label="Prediction", alpha=0.4, marker='.')
  ax[0].scatter(np.arange(len(Y)), Y, label="TestData", marker='.')
  ax[0].legend()
  ax[0].set_title("Model Prediction")
  ax[0].set_ylabel(label_name)

  # Panel 2: absolute error per sample; the legend reports the mean over all samples.
  ax[1].scatter(np.arange(len(_MAE)), _MAE, label="Average = {}".format(mean_absolute_error(Y, predicted)), marker='.')
  ax[1].legend()
  ax[1].set_ylabel("MAE")
  ax[1].set_title("Mean Absolute Error")

  # Panel 3: percentage error per sample.
  ax[2].scatter(np.arange(len(_PE)), _PE, marker='.')
  ax[2].set_ylabel("PE")
  ax[2].set_title("Relative Percentage Error")

  if save_img:
    fig2.savefig("{}.png".format(name))
  


In [None]:
def _features_and_labels(frame, n_features):
  """Split a frame's values into a (features, labels) pair of arrays.

  The first `n_features` columns are the model inputs; the column directly
  after them is taken as the label.
  """
  values = frame.to_numpy()  # convert once and slice both parts from it
  return values[:, 0:n_features], values[:, n_features]


# Order every split by ascending label value.
sorted_train = train_dataset.sort_values(label_name)
sorted_vali = validation_dataset.sort_values(label_name)
sorted_test = test_dataset.sort_values(label_name)

# Generate the inputs and labels.
# To shuffle a split instead of keeping it label-sorted, use e.g.
#   sorted_train.sample(frac=1, random_state=seed_val)

# TRAIN DATA
train = sorted_train
train_features, train_labels = _features_and_labels(train, in_features)
X = train_features.copy()
Y = train_labels.copy()

# TRAINING VALIDATION DATA
vali = sorted_vali
vali_features, vali_labels = _features_and_labels(vali, in_features)
X_vali = vali_features.copy()
Y_vali = vali_labels.copy()

# TEST DATA (already label-sorted above, so no second sort is needed)
test = sorted_test
test_features, test_labels = _features_and_labels(test, in_features)
X_test = test_features.copy()
Y_test = test_labels.copy()

# (inputs, labels) pairs in the order: test, train, validation.
eval_set = [(X_test, Y_test), (X,Y), (X_vali, Y_vali)]

For polynomial regression, run the cell below after choosing the `degree`. For plain linear regression, do NOT run it.

In [None]:
# Polynomial degree of the expanded feature set.
degree = 10

# Expand from the untouched *_features arrays rather than from X / X_vali / X_test
# themselves, so re-running this cell does not expand already-expanded features.
# One transformer is fitted on the training inputs and reused for the other splits.
poly_features = PolynomialFeatures(degree=degree, include_bias=True)
X = poly_features.fit_transform(train_features)
X_vali = poly_features.transform(vali_features)
X_test = poly_features.transform(test_features)

In [None]:
#_______________________________________________________________________________________________________________________
# Train the model

# `dataset` is not defined in the visible cells of this notebook (presumably it is the
# original data before noisy samples were added - confirm). Look it up defensively so a
# fresh kernel does not stop here with a NameError.
reference_dataset = globals().get("dataset")
if reference_dataset is None:
  print("Noise check skipped: `dataset` is not defined.")
elif len(train_dataset) > len(reference_dataset):
  print("Noisey Training")
else:
  print("Non-Noisey Training")
print()

print("Training has started on {} samples.".format(len(train_dataset)))

# Ordinary least-squares fit on the training inputs and labels.
model = LinearRegression().fit(X, Y)

# Optional cross-validation (needs KFold and cross_val_score from sklearn.model_selection):
# kfold = KFold(n_splits=10, random_state=7, shuffle=True)
# results = cross_val_score(model, X, Y, cv=kfold)
# print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
#_______________________________________________________________________________________________________________________
# The Data
print("The Data")
# A single subplots() call; a separate plt.figure() would only leave an empty figure behind.
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15,7))

# Label values of the train and test splits in their file order.
ax[0].scatter(x=np.arange(len(train_dataset)), y=train_dataset[label_name], marker='.', alpha=0.4)
ax[1].scatter(x=np.arange(len(test_dataset)), y=test_dataset[label_name], marker='.', alpha=0.4)
ax[0].set_ylabel(label_name)
ax[0].set_title("Training Data")
ax[1].set_title("Testing Data")

## Get Results
Run the test-set cell below (the one that calls `get_results` with `saveit=True`) to save the model. Not every scikit-learn model provides the `.save_model` method used in the final cell.

In [None]:
# Base name for the output files; the results cells append "Train", "Vali" or "Test" to it.
_NAME = "PolyReg"

In [None]:
#_______________________________________________________________________________________________________________________
# Evaluate on the training split: the figure is saved, no CSV or model file is written.
_NAME1 = "{}Train".format(_NAME)

get_results(
    model,
    X,
    Y,
    name=_NAME1,
    saveit=False,
    save_img=True,
)


In [None]:
#_______________________________________________________________________________________________________________________
# Evaluate on the validation split: the figure is saved, no CSV or model file is written.
_NAME2 = "{}Vali".format(_NAME)

get_results(
    model,
    X_vali,
    Y_vali,
    name=_NAME2,
    saveit=False,
    save_img=True,
)

In [None]:
#_______________________________________________________________________________________________________________________
# Save the model
_NAME3 = _NAME+"Test"

get_results(model, X_test, Y_test, name=_NAME3, saveit=True, save_img=True)

In [None]:
# Only some model classes expose `save_model`; scikit-learn's LinearRegression does not.
# Use it when available, otherwise fall back to pickling the model to "<_NAME>.sav".
if hasattr(model, "save_model"):
  model.save_model("{}".format(_NAME))
else:
  import pickle

  with open("{}.sav".format(_NAME), "wb") as model_file:
    pickle.dump(model, model_file)
  print("Model Saved.")