# Machine Learning flow (To predict RunTime)
## Import relevant libraries and dataset
 - Dataset is created by PNRdatabase jupyter notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# The path is relative, so PNRdatabase.csv must be in the notebook's working directory.
data = pd.read_csv('PNRdatabase.csv') # load dataset (csv file) into a pandas DataFrame

# ignore unimportant warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

FileNotFoundError: [Errno 2] File PNRdatabase3.csv does not exist: 'PNRdatabase3.csv'

## Define features (X) and targets (y/z)
 - Define X as feature columns
 - Define y as target column RunTimeRoute
 - Define z as target column RunTimePlace

In [None]:
# Columns that are neither features nor targets for the machine learning model
_unused_columns = ['Violations','AntennaViolations','Type40LP','MetalStack','Date',
                   'RowDirection','DoubleBack','FlipFirstRow','StartfromFirstRow']
train = data.drop(columns=_unused_columns)

# Features: every remaining column except the two target columns
X = train.drop(columns=['RunTimeRoute','RunTimePlace'])

# Targets: y is the RunTimeRoute column, z is the RunTimePlace column
y, z = train['RunTimeRoute'], train['RunTimePlace']

## Look at distribution of targets
 - Check for presence of possible outliers

In [None]:
sns.set_style('whitegrid')
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

# left panel: histogram of RunTimeRoute (20 bins between 0 and 500)
ax1.hist(y, bins=20, range=(0, 500))
ax1.set_title('Distribution of RunTimeRoute')

# right panel: histogram of RunTimePlace (20 bins between 100 and 350)
ax2.hist(z, bins=20, range=(100, 350))
ax2.set_title('Distribution of RunTimePlace')

## Find Outliers
 - seaborn boxplot function shows distribution of data
 - The box shows the quartiles of the dataset, while the whiskers extend to the rest of the distribution, excluding points classified as outliers because they lie more than 1.5 × the inter-quartile range (IQR) beyond the quartiles

In [None]:
# box plot of RunTimeRoute (y); points drawn beyond the whiskers are candidate outliers
sns.boxplot(x=y)

In [None]:
# box plot of RunTimePlace (z); points drawn beyond the whiskers are candidate outliers
sns.boxplot(x=z)

In [None]:
# First/third quartiles and inter-quartile range (IQR = Q3 - Q1) of each target
_quartile_levels = (0.25, 0.75)

# RunTimeRoute
Q1_y, Q3_y = (y.quantile(level) for level in _quartile_levels)
IQR_y = Q3_y - Q1_y

# RunTimePlace
Q1_z, Q3_z = (z.quantile(level) for level in _quartile_levels)
IQR_z = Q3_z - Q1_z

print(IQR_y,IQR_z)

In [None]:
# Flag RunTimeRoute values outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# (the same rule the box plot uses to draw points beyond the whiskers).
lower_fence_y = Q1_y - 1.5 * IQR_y
upper_fence_y = Q3_y + 1.5 * IQR_y

# Vectorized comparison; selecting by boolean mask keeps the original index labels,
# which is what X.drop()/y.drop() expect later. The previous positional loop
# (y[i] for i in range(len(y))) only worked while the index was exactly 0..n-1.
y_outlier_values = y[(y < lower_fence_y) | (y > upper_fence_y)]
y_outliers = y_outlier_values.index.tolist()

# Print each outlier as "<row label> <value>", followed by the number of outliers
for row_label, value in y_outlier_values.items():
    print(row_label, value)
print(len(y_outliers))

In [None]:
# Manual override: replaces the IQR-based list computed above with three hand-picked row labels.
# These labels are later passed to X.drop()/y.drop(), so they must exist in the index.
y_outliers = [83,67,87] # outliers do not include small cluster of data points on the right of box (see box plot of RunTimeRoute)

In [None]:
# Flag RunTimePlace values outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# (the same rule the box plot uses to draw points beyond the whiskers).
lower_fence_z = Q1_z - 1.5 * IQR_z
upper_fence_z = Q3_z + 1.5 * IQR_z

# Vectorized comparison; selecting by boolean mask keeps the original index labels,
# which is what X.drop()/z.drop() expect later. The previous positional loop
# (z[i] for i in range(len(z))) only worked while the index was exactly 0..n-1.
z_outliers = z[(z < lower_fence_z) | (z > upper_fence_z)].index.tolist()
print(z_outliers)

## Remove Outliers
 - Plot histogram of distribution excluding outliers

In [None]:
# Remove the outlier rows; each target gets its own feature matrix because
# the rows dropped for RunTimeRoute and RunTimePlace differ
X_remove_outliers_route, y_remove_outliers = X.drop(y_outliers), y.drop(y_outliers)
X_remove_outliers_place, z_remove_outliers = X.drop(z_outliers), z.drop(z_outliers)

# Re-plot both target histograms without the outliers (same bins and ranges as before)
sns.set_style('whitegrid')
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

ax1.hist(y_remove_outliers, bins=20, range=(0, 500))
ax1.set_title('Distribution of RunTimeRoute')

ax2.hist(z_remove_outliers, bins=20, range=(100, 350))
ax2.set_title('Distribution of RunTimePlace')

## Standardize the data points to have mean 0 and variance 1 (this rescales the data; it does not make it normally distributed)
 - Necessary to ensure model assigns fair weight to features, without being affected by varying ranges and means of features

In [None]:
# Convert the pandas objects to numpy arrays before scaling.
# np.asarray is used instead of .to_numpy() so this cell can be re-run safely:
# the names are reused for numpy arrays (and later DataFrames) further down,
# and ndarray has no .to_numpy() method, so a second run used to raise AttributeError.
X_remove_outliers_route = np.asarray(X_remove_outliers_route)
X_remove_outliers_place = np.asarray(X_remove_outliers_place)
y_remove_outliers = np.asarray(y_remove_outliers)
z_remove_outliers = np.asarray(z_remove_outliers)

In [None]:
from sklearn.preprocessing import StandardScaler
# One scaler per array so that each one remembers its own mean/scale
# (e.g. sc_Y / sc_Z can later map predictions back to the original units).
sc_X_place = StandardScaler()
sc_X_route = StandardScaler()
sc_Y = StandardScaler()
sc_Z = StandardScaler()

# NOTE(review): the scalers are fitted on all rows *before* the train/test split
# performed later in the notebook, so test-set statistics leak into training.
# Consider fitting on the training split only.
X_remove_outliers_place = sc_X_place.fit_transform(X_remove_outliers_place)
X_remove_outliers_route = sc_X_route.fit_transform(X_remove_outliers_route)

# StandardScaler needs 2-D input, so the targets are reshaped to a single column.
# np.asarray(...) keeps this working when the names already hold (n, 1) arrays
# or DataFrames from a previous run (DataFrame has no .reshape).
y_remove_outliers = sc_Y.fit_transform(np.asarray(y_remove_outliers).reshape(-1,1))
z_remove_outliers = sc_Z.fit_transform(np.asarray(z_remove_outliers).reshape(-1,1))

# check that scaled RunTimeRoutes have a mean of ~0 and a standard deviation of ~1
# (printing the statistics instead of dumping the whole array)
print(y_remove_outliers.mean(), y_remove_outliers.std())

In [None]:
# wrap each scaled numpy array in a pandas DataFrame again
# (rows and columns get default integer labels at this point)
X_remove_outliers_route, X_remove_outliers_place, y_remove_outliers, z_remove_outliers = (
    pd.DataFrame(scaled_array)
    for scaled_array in (X_remove_outliers_route, X_remove_outliers_place,
                         y_remove_outliers, z_remove_outliers)
)

In [None]:
# Re-attach column names (they are lost when DataFrames are converted to numpy arrays).
# Both feature matrices were derived from X by dropping rows only, so their columns
# are exactly X.columns in the same order. Taking the names from X avoids keeping
# two hand-written copies of the feature list in sync with the dataset.
feature_names = list(X.columns)
X_remove_outliers_route.columns = feature_names
X_remove_outliers_place.columns = list(feature_names)
y_remove_outliers.columns = ['RunTimeRoute']
z_remove_outliers.columns = ['RunTimePlace']

## Split dataset into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# split dataset into train set (80%) and test set (20%); fixed random_state for reproducibility
X_train_route, X_test_route, y_train, y_test = train_test_split(X_remove_outliers_route, y_remove_outliers, test_size=0.2,random_state=41)
X_train_place, X_test_place, z_train, z_test = train_test_split(X_remove_outliers_place, z_remove_outliers, test_size=0.2,random_state=41)

# check shape of pandas Dataframes (that train set is ~80% and test set is 20%)
# (the route/place labels were previously swapped relative to the printed variables)
print('X route shape:',X_remove_outliers_route.shape)
print('X place shape:',X_remove_outliers_place.shape)
print('y shape:',y_remove_outliers.shape)
print('z shape:',z_remove_outliers.shape)
print('X train route, X test route:',X_train_route.shape,X_test_route.shape)
print('X train place, X test place:',X_train_place.shape,X_test_place.shape)
print('y train, y test:',y_train.shape,y_test.shape)
print('z train, z test:',z_train.shape,z_test.shape)

## Select the best features with feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

def select_columns(feature_cols,target_col,no_features=5):
    """Return the names of the features most strongly related to the target.

    Uses a univariate F-test for regression (f_regression), because the targets
    are continuous run times; f_classif would treat every distinct run time as a
    separate class.

    Parameters
    ----------
    feature_cols : pandas.DataFrame
        Feature matrix; its column labels are what gets returned.
    target_col : array-like of shape (n_samples,)
        Continuous target values aligned with the rows of feature_cols.
    no_features : int, default 5
        Number of top-scoring features to keep.

    Returns
    -------
    pandas.Index
        Column labels of the selected features, in their original order.
    """
    selector = SelectKBest(f_regression, k=no_features)
    selector.fit(feature_cols, target_col)
    # get_support() is a boolean mask over the input columns; using it (and the
    # columns of the argument itself) removes the dependency on the globals
    # `train` and `X` that the previous inverse_transform/variance trick had.
    return feature_cols.columns[selector.get_support()]

# Select features on the training split only, so that neither the test rows nor
# the previously removed outliers influence which features are chosen.
selected_columns_route = select_columns(X_train_route, y_train.values.ravel(), 7)
selected_columns_place = select_columns(X_train_place, z_train.values.ravel(), 11)

# print list of features with highest correlation to target
print(selected_columns_place)
print(selected_columns_route)

## Find best regression model from scikit-learn using k-fold cross validation
- Metrics used: root mean squared error
- number of folds: 10

In [None]:
from sklearn.linear_model import LinearRegression    
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor   
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# NOTE: despite its name, k is not the number of CV folds; it is only used below
# as the random_state (seed) of the models that accept one.
k=7

# list of regression models from sklearn
# (all with default hyperparameters, apart from the seed and a raised max_iter cap)
Regressors = [
    LinearRegression(),
    Ridge(max_iter=10000,random_state=k),
    Lasso(max_iter=10000,random_state=k),
    ElasticNet(max_iter=10000,random_state=k),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=k),
    SVR(max_iter=10000),
    GradientBoostingRegressor(random_state=k),
    RandomForestRegressor(random_state=k)
]

from sklearn.model_selection import cross_val_score

# Start from +infinity so that the first model always becomes the current best.
# (The previous magic value of 100 would leave best_regressor_* undefined, and
# the prints below would raise NameError, whenever every RMSE exceeded 100.)
best_score_route = np.inf
best_score_place = np.inf
best_regressor_route = None
best_regressor_place = None

# loop through regression model list to find the lowest mean cross-validated RMSE
for regressor in Regressors:
    # scoring is the *negative* RMSE (sklearn maximizes scores), hence the sign flip
    cv_results_route = -cross_val_score(regressor, X_train_route[selected_columns_route], y_train.values.ravel(), scoring='neg_root_mean_squared_error', cv=10 )
    cv_results_place = -cross_val_score(regressor, X_train_place[selected_columns_place], z_train.values.ravel(), scoring='neg_root_mean_squared_error', cv=10 )

    mean_rmse_route = cv_results_route.mean()
    mean_rmse_place = cv_results_place.mean()

    if mean_rmse_route < best_score_route:
        best_score_route = mean_rmse_route
        best_regressor_route = regressor
    if mean_rmse_place < best_score_place:
        best_score_place = mean_rmse_place
        best_regressor_place = regressor

# print best score and regression model
print('best score (route) is: ',best_score_route)
print('best regressor (route) is: ',best_regressor_route)

print('best score (place) is: ',best_score_place)
print('best regressor (place) is: ',best_regressor_place)

## Tune model's hyperparameters with GridSearchCV/RandomizedSearchCV
 - After getting best values of hyperparameters, train and test best models with best values of hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint

# define values of hyperparameters to be used in RandomizedSearchCV
# max_features must be at least 1 and at most the number of input features;
# the previous randint(0, 6) could sample 0, which is not a valid value.
# (scipy's randint upper bound is exclusive, hence the +1.)
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, len(selected_columns_route) + 1),
              "min_samples_leaf": randint(1, 9)}

# fit model with train set and carry out RandomizedSearchCV for RunTimeRoute
# random_state on the search makes the sampled candidates reproducible across runs
XGB = GradientBoostingRegressor(random_state=11)
XGB_cv = RandomizedSearchCV(XGB, param_dist,scoring='neg_root_mean_squared_error', cv=10, random_state=11)
XGB_cv.fit(X_train_route[selected_columns_route],y_train.values.ravel())
print(XGB_cv.best_estimator_)
print(-XGB_cv.cv_results_['mean_test_score'])  # mean CV RMSE of every sampled candidate

In [None]:
from sklearn import metrics

# Use tuned (best values) hyperparameters to train model with train set.
# The hyperparameters are taken from the search result instead of being copied by
# hand, and the model is trained/tested on the same selected feature subset that
# was used during tuning (previously all features were used here).
XGB_test = GradientBoostingRegressor(**XGB_cv.best_params_, random_state=7)
XGB_test.fit(X_train_route[selected_columns_route],y_train.values.ravel())
y_pred = XGB_test.predict(X_test_route[selected_columns_route]) # test model with test set
print(y_pred)
print(y_test)
# RMSE on the test set, in standardized target units (the targets were scaled with sc_Y)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
# plot graph of predicted and actual values of RunTimeRoute
# x positions are derived from the test-set size instead of a hard-coded 21,
# so the plot keeps working when the dataset or the split changes
testlist = list(range(1, len(y_test) + 1))
plt.plot(testlist, y_pred)
plt.plot(testlist, y_test)
plt.xlabel('index')
plt.ylabel('y_pred/ y_test')
plt.legend(['y_pred','y_test'])

In [None]:
# define values of hyperparameters to be used in GridSearchCV
C_range = [1,10,100,1000,10000]
param_grid = dict(C= C_range)
svr = SVR(max_iter=10000)

# fit model with train set and carry out GridSearchCV for RunTimePlace
# The place model must use the features selected for RunTimePlace
# (selected_columns_route was used here by mistake).
search = GridSearchCV(estimator = svr, param_grid = param_grid, scoring='neg_root_mean_squared_error', cv=10)
search.fit(X_train_place[selected_columns_place],z_train.values.ravel())
print(search.best_estimator_)
print(-search.cv_results_['mean_test_score'])  # mean CV RMSE for each value of C

In [None]:
# Use tuned (best values) hyperparameters to train model with train set.
# C comes from the grid search result instead of a hand-copied constant, and the
# model is trained/tested on the features selected for RunTimePlace
# (previously all features were used here).
svr_test = SVR(**search.best_params_, max_iter=10000)
svr_test.fit(X_train_place[selected_columns_place],z_train.values.ravel())
z_pred = svr_test.predict(X_test_place[selected_columns_place]) # test model with test set
print(z_pred)
print(z_test)
# RMSE on the test set, in standardized target units (the targets were scaled with sc_Z)
print(np.sqrt(metrics.mean_squared_error(z_test, z_pred)))

In [None]:
# plot graph of predicted and actual values of RunTimePlace
# x positions are derived from the test-set size instead of a hard-coded 21,
# so the plot keeps working when the dataset or the split changes
testlist = list(range(1, len(z_test) + 1))
plt.plot(testlist, z_pred)
plt.plot(testlist, z_test)
plt.xlabel('index')
plt.ylabel('z_pred/z_test')
plt.legend(['z_pred','z_test'])