# Extracting Model Details and Reverse Engineering of Predict function for Deployment

This Project is submitted for fulfillment of Post Graduate Program in Data Science and Engineering course offered by Great Lakes Institute of Management through Great Learning platform.

Submitted on : Feb 2022

Project Web Page :
    
https://sites.google.com/view/taxi-out

In [1]:
# type your code here

# import 'Pandas' 
import pandas as pd 

# import 'Numpy' 
import numpy as np

# import subpackage of Matplotlib
import matplotlib.pyplot as plt

# import 'Seaborn' 
import seaborn as sns

# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

# import train-test split 
from sklearn.model_selection import train_test_split

# import function to perform linear regression using OLS
import statsmodels.api as sm

# 'metrics' from sklearn is used for evaluating the model performance
from sklearn.metrics import mean_squared_error

# import function to perform linear regression
from sklearn.linear_model import LinearRegression

# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox

In [2]:
# import SGDRegressor from sklearn to perform linear regression with stochastic gradient descent
from sklearn.linear_model import SGDRegressor

# import function for lasso regression
from sklearn.linear_model import Lasso

# import function for elastic net regression
from sklearn.linear_model import ElasticNet

# import function to perform GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

#importing several other functions
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedKFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import BayesianRidge, Lasso, LinearRegression, Ridge, RidgeCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR

In [3]:
# set the plot size using 'rcParams'
# once the plot size is set using 'rcParams', it sets the size of all the forthcoming plots in the file
# pass width and height in inches to 'figure.figsize' 
plt.rcParams['figure.figsize'] = [15,8]

#Display all columns while printing the dataframe
pd.set_option('display.width', 1200)
pd.set_option('display.max_columns', 25)


import warnings
warnings.filterwarnings('ignore')

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV

In [5]:
df = pd.read_csv('M1_final.csv')

column_names = {"OP_UNIQUE_CARRIER":"CARRIER_CODE",
                "TAIL_NUM":"FLIGHT_NO",
                "DEST":"DESTINATION",
                "CRS_ELAPSED_TIME":"SCHEDULED_DURATION",
                "CRS_DEP_M":"SCHEDULED_DEPARTURE_TIME",
                "DEP_TIME_M":"ACTUAL_DEP_TIME",
                "CRS_ARR_M":"SCHEDULED_ARRIVAL_TIME",
                "sch_dep":"FLT_SCH_ARRIVAL",
                "sch_arr":"FLT_SCH_DEPARTURE"
               }

df = df.rename(column_names, axis=1)

df["Dew Point"] = df["Dew Point"].astype("int64")

df['Wind'].replace(np.nan,'W',inplace=True)

df.drop(["CARRIER_CODE"], axis=1, inplace=True)

df.drop(["FLIGHT_NO"], axis=1, inplace=True)

df["TOTAL_SCHEDULED"] = df["FLT_SCH_ARRIVAL"] + df["FLT_SCH_DEPARTURE"]
df.drop(["FLT_SCH_ARRIVAL"], axis=1, inplace=True)
df.drop(["FLT_SCH_DEPARTURE"], axis=1, inplace=True)

df.drop(["ACTUAL_DEP_TIME"], axis=1, inplace=True)

speed = df.DISTANCE / df.SCHEDULED_DURATION
df['AVG_SPEED'] = round(speed, 2)
df.drop('DISTANCE', axis=1, inplace=True)

df.MONTH.replace({11:0, 12:0.5,1:1}, inplace=True)

minmax = MinMaxScaler()
df.DAY_OF_MONTH = minmax.fit_transform(np.array(df['DAY_OF_MONTH']).reshape(-1,1))
df.DAY_OF_WEEK = minmax.fit_transform(np.array(df.DAY_OF_WEEK).reshape(-1,1))

df.DEP_DELAY = np.cbrt(df.DEP_DELAY)
df.SCHEDULED_DURATION = np.log(df.SCHEDULED_DURATION)

for x in df.index:
    if df['Wind Gust'][x]>0 :
        df['Wind Gust'][x]='YES'
    else: 
        df['Wind Gust'][x]='NO'

df.TOTAL_SCHEDULED = np.power(df.TOTAL_SCHEDULED,3)

In [6]:
df_Target_2class = pd.DataFrame(index=df.index, columns=['TAXI_OUT'])
for x in df.index:
    if df.TAXI_OUT[x] < 20:
        df_Target_2class['TAXI_OUT'][x] = '0'
    else:
        df_Target_2class['TAXI_OUT'][x] = '1'
    
df_Target_2class.value_counts()

TAXI_OUT
0           14446
1           14374
dtype: int64

In [7]:
df.TAXI_OUT = df_Target_2class

In [8]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

print('The updated data:',df.shape)

The updated data: (27903, 19)


In [9]:
X_scaler = StandardScaler()

for x in df.select_dtypes(include=np.number).drop(['MONTH','DAY_OF_MONTH','DAY_OF_WEEK'],axis=1).columns:
    df[x] = X_scaler.fit_transform(np.array(df[x]).reshape(-1,1))

df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DESTINATION,DEP_DELAY,SCHEDULED_DURATION,SCHEDULED_DEPARTURE_TIME,SCHEDULED_ARRIVAL_TIME,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Condition,TAXI_OUT,TOTAL_SCHEDULED,AVG_SPEED
0,0.0,0.0,0.666667,CHS,-0.278335,-0.790591,-1.689047,-1.336559,0.831059,0.278731,-0.008036,W,2.180466,YES,-0.796221,Fair / Windy,0,-1.866982,0.09622
1,0.0,0.0,0.666667,LAX,-0.739114,1.146958,-1.63569,-1.096574,0.831059,0.278731,-0.008036,W,2.180466,YES,-0.796221,Fair / Windy,0,-1.866982,1.179003
2,0.0,0.0,0.666667,FLL,1.952524,-0.12192,-1.765748,-1.238252,0.831059,0.278731,-0.008036,W,2.180466,YES,-0.796221,Fair / Windy,1,-1.866982,0.644643
3,0.0,0.0,0.666667,MCO,-0.409523,-0.253691,-1.619016,-1.148619,0.831059,0.278731,-0.008036,W,2.180466,YES,-0.796221,Fair / Windy,0,-1.866982,0.440742
4,0.0,0.0,0.666667,ATL,-0.57481,-0.588703,-1.568993,-1.189098,0.573386,0.110955,-0.008036,W,2.01208,YES,-0.624496,Fair / Windy,0,-1.866982,0.335276


In [10]:
for x in df.select_dtypes(include='object').drop(['Wind Gust','TAXI_OUT'],axis=1).columns:
    df[x] = df[x].replace(df[x].value_counts(normalize=True))
    
df['Wind Gust'].replace({'YES':1, 'NO':0}, inplace=True)
df.TAXI_OUT.replace({'1':1, '0':0}, inplace=True)
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DESTINATION,DEP_DELAY,SCHEDULED_DURATION,SCHEDULED_DEPARTURE_TIME,SCHEDULED_ARRIVAL_TIME,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Condition,TAXI_OUT,TOTAL_SCHEDULED,AVG_SPEED
0,0.0,0.0,0.666667,0.012974,-0.278335,-0.790591,-1.689047,-1.336559,0.831059,0.278731,-0.008036,0.11307,2.180466,1,-0.796221,0.027166,0,-1.866982,0.09622
1,0.0,0.0,0.666667,0.103824,-0.739114,1.146958,-1.63569,-1.096574,0.831059,0.278731,-0.008036,0.11307,2.180466,1,-0.796221,0.027166,0,-1.866982,1.179003
2,0.0,0.0,0.666667,0.034799,1.952524,-0.12192,-1.765748,-1.238252,0.831059,0.278731,-0.008036,0.11307,2.180466,1,-0.796221,0.027166,1,-1.866982,0.644643
3,0.0,0.0,0.666667,0.033007,-0.409523,-0.253691,-1.619016,-1.148619,0.831059,0.278731,-0.008036,0.11307,2.180466,1,-0.796221,0.027166,0,-1.866982,0.440742
4,0.0,0.0,0.666667,0.027667,-0.57481,-0.588703,-1.568993,-1.189098,0.573386,0.110955,-0.008036,0.11307,2.01208,1,-0.624496,0.027166,0,-1.866982,0.335276


In [11]:
y = df['TAXI_OUT']
x = df.drop(columns='TAXI_OUT')

print(y.shape)
print(x.shape)

(27903,)
(27903, 18)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size = 0.2)

print('X_train', X_train.shape)
print('y_train', y_train.shape)

print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (22322, 18)
y_train (22322,)
X_test (5581, 18)
y_test (5581,)


In [13]:
def classification_performance(model):
    print('Train - Classification Report : \n', classification_report(y_train, model.predict(X_train)))
    print()
    print('Test - Classification Report : \n', classification_report(y_test, model.predict(X_test)))

    scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5, scoring='f1_weighted', verbose=0)
    print('Cross Validation Scores : ', scores)

    bias = 1 - np.mean(scores)
    variance = np.std(scores)/np.mean(scores)
    print('Bias : ', bias)
    print('Variance : ', variance)

### Decision Tree Algorithm
For Learning Purpose only

In [14]:
Dtree_best = DecisionTreeClassifier(max_depth=12, random_state=1)
Dtree_best.fit(X_train, y_train)

classification_performance(Dtree_best)

Train - Classification Report : 
               precision    recall  f1-score   support

           0       0.72      0.79      0.75     11236
           1       0.76      0.69      0.73     11086

    accuracy                           0.74     22322
   macro avg       0.74      0.74      0.74     22322
weighted avg       0.74      0.74      0.74     22322


Test - Classification Report : 
               precision    recall  f1-score   support

           0       0.61      0.68      0.64      2786
           1       0.64      0.56      0.60      2795

    accuracy                           0.62      5581
   macro avg       0.62      0.62      0.62      5581
weighted avg       0.62      0.62      0.62      5581

Cross Validation Scores :  [0.61314744 0.61631832 0.59535216 0.60543054 0.61127384]
Bias :  0.39169554133517326
Variance :  0.012136382168151867


In [17]:
tree_structure = pd.DataFrame(columns=['Left_child', 'Right_child', 'Feature', 'Threshold', 'Impurity', 'n_class_0', 'n_class_1'])

tree_structure['Left_child'] = Dtree_best.tree_.children_left
tree_structure['Right_child'] = Dtree_best.tree_.children_right
tree_structure['Feature'] = Dtree_best.tree_.feature
tree_structure['Threshold'] = Dtree_best.tree_.threshold
tree_structure['Impurity'] = Dtree_best.tree_.impurity
tree_structure['n_class_0'] = [ x[0][0] for x in Dtree_best.tree_.value]
tree_structure['n_class_1'] = [ x[0][1] for x in Dtree_best.tree_.value]

tree_structure

Unnamed: 0,Left_child,Right_child,Feature,Threshold,Impurity,n_class_0,n_class_1
0,1,668,16,-1.166350,0.499977,11236.0,11086.0
1,2,539,5,1.251052,0.451067,2455.0,1285.0
2,3,252,6,-1.380576,0.439137,2312.0,1116.0
3,4,127,10,0.697722,0.379657,872.0,298.0
4,5,70,15,0.089130,0.331400,573.0,152.0
...,...,...,...,...,...,...,...
2252,2253,2256,9,1.872603,0.426903,21.0,47.0
2253,2254,2255,6,0.913780,0.400473,18.0,47.0
2254,-1,-1,-2,-2.000000,0.493827,12.0,15.0
2255,-1,-1,-2,-2.000000,0.265928,6.0,32.0


In [18]:
# Predict function of Decision tree

def DTree_Single_Predict(x):
    current_node = 0
    
    while True:
        if tree_structure.Left_child[current_node] == tree_structure.Right_child[current_node]:
            if tree_structure.n_class_0[current_node] >= tree_structure.n_class_1[current_node]:
                return 0
            else:
                return 1
        else:
            feature = x[tree_structure.Feature[current_node]]
            thres = tree_structure.Threshold[current_node]
            current_node = tree_structure.Left_child[current_node] if feature <= thres else tree_structure.Right_child[current_node]

In [19]:
def DTree_Predict(x):
    return [ DTree_Single_Predict(x.iloc[y].values) for y in range(x.shape[0]) ]

In [20]:
Function_predicted_class = np.array(DTree_Predict(X_test))
print(Function_predicted_class)

[0 0 0 ... 0 1 1]


In [21]:
Sklearn_predicted_class = Dtree_best.predict(X_test)
print(Sklearn_predicted_class)

[0 0 0 ... 0 1 1]


In [22]:
count = 0
for x in range(Sklearn_predicted_class.shape[0]):
    if Sklearn_predicted_class[x] == Function_predicted_class[x]:
        count+=1
print(count)

5581


## Gradient boosting
Optimized model - For Deployment

In [23]:
tuned_gb  = GradientBoostingClassifier(n_estimators=250,learning_rate=0.2,max_depth=4,random_state=10)  

tuned_gb.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.2, max_depth=4, n_estimators=250,
                           random_state=10)

In [24]:
classification_performance(tuned_gb)

Train - Classification Report : 
               precision    recall  f1-score   support

           0       0.77      0.81      0.79     11236
           1       0.79      0.75      0.77     11086

    accuracy                           0.78     22322
   macro avg       0.78      0.78      0.78     22322
weighted avg       0.78      0.78      0.78     22322


Test - Classification Report : 
               precision    recall  f1-score   support

           0       0.67      0.70      0.68      2786
           1       0.68      0.66      0.67      2795

    accuracy                           0.68      5581
   macro avg       0.68      0.68      0.68      5581
weighted avg       0.68      0.68      0.68      5581

Cross Validation Scores :  [0.66515714 0.6674565  0.66454657 0.67477546 0.66068589]
Bias :  0.3334756874144331
Variance :  0.006999222664049511


In [25]:
tuned_gb.estimators_[0][0].tree_.value.shape

(31, 1, 1)

In [26]:
# Gradient Boosted Decision Tree Structures as Dataframes

gb_structure = pd.DataFrame(columns=['estimator_id', 'node_id', 'Left_child', 'Right_child', 'Feature', 
                                     'Threshold', 'value'])

for x in range(tuned_gb.n_estimators_):
    
    estimator = tuned_gb.estimators_[x][0]
    tree_structure = pd.DataFrame(columns=['estimator_id', 'node_id', 'Left_child', 'Right_child', 'Feature', 
                                             'Threshold', 'value'])
    tree_structure['estimator_id'] = np.ones(shape=(estimator.tree_.node_count, )) * x
    tree_structure['node_id'] = np.arange(0, estimator.tree_.node_count)
    tree_structure['Left_child'] = estimator.tree_.children_left
    tree_structure['Right_child'] = estimator.tree_.children_right
    tree_structure['Feature'] = estimator.tree_.feature
    tree_structure['Threshold'] = np.round(estimator.tree_.threshold,8)
    tree_structure['value'] = [ np.round(x[0][0],8) for x in estimator.tree_.value]
    
    gb_structure = pd.concat(objs=(gb_structure, tree_structure), axis=0, ignore_index=True)
    
gb_structure

Unnamed: 0,estimator_id,node_id,Left_child,Right_child,Feature,Threshold,value
0,0.0,0,1,16,16,-1.166350,-0.000000
1,0.0,1,2,9,5,1.251052,-0.153057
2,0.0,2,3,6,6,-1.380576,-0.171086
3,0.0,3,4,5,10,0.697722,-0.241939
4,0.0,4,-1,-1,-2,-2.000000,-1.147991
...,...,...,...,...,...,...,...
7571,249.0,26,-1,-1,-2,-2.000000,0.066016
7572,249.0,27,-1,-1,-2,-2.000000,1.406803
7573,249.0,28,29,30,8,-1.681254,-0.000124
7574,249.0,29,-1,-1,-2,-2.000000,-0.092064


In [27]:
def GB_Single_Tree_Predict(x, est_id):
    current_node = 0
    gb_struct = gb_structure[gb_structure.estimator_id == est_id]
    gb_struct.set_index('node_id', inplace=True)
    
    while True:
        if gb_struct.Left_child[current_node] == gb_struct.Right_child[current_node]:
            return gb_struct.value[current_node]
        
        else:
            feature = x[gb_struct.Feature[current_node]]
            thres = gb_struct.Threshold[current_node]
            current_node = gb_struct.Left_child[current_node] if feature <= thres else gb_struct.Right_child[current_node]

In [28]:
def GB_Single_Row_Predict(x):
    pred_value = - 0.01343986
    lr = tuned_gb.learning_rate
    n = tuned_gb.n_estimators_
    
    for est in range(n):
        
        pred_value = pred_value + (lr * GB_Single_Tree_Predict(x, est))
        
    proba = np.exp(pred_value)/(1 + np.exp(pred_value))
        
    return 0 if proba < 0.5 else 1

In [29]:
def GB_Predict(x):
    return np.array([ GB_Single_Row_Predict(x.iloc[y].values) for y in range(x.shape[0]) ])

In [30]:
Sklearn_Predicted = tuned_gb.predict(X_test)[:10]
Sklearn_Predicted

array([1, 1, 0, 0, 0, 1, 0, 0, 0, 1], dtype=int64)

In [31]:
%%time
Function_Predicted = GB_Predict(X_test.iloc[:10])
Function_Predicted

Wall time: 2.06 s


array([1, 1, 0, 0, 0, 1, 0, 0, 0, 1])

In [33]:
Diff = Function_Predicted - Sklearn_Predicted
Diff

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [39]:
# Initial Prediction value
(tuned_gb.init_.class_prior_[1]-tuned_gb.init_.class_prior_[0])*2

-0.013439655944807782

In [40]:
def GB_structure_print(est_id):
    gb_struct = gb_structure[gb_structure.estimator_id == est_id]
    gb_struct.set_index('node_id', inplace=True)
    gb_struct.drop(columns='estimator_id', inplace=True)
    return gb_struct.values

In [41]:
def GB_Stuct_full_print():
    return [ GB_structure_print(x) for x in range(250) ]

In [42]:
# GB Structure as 3 Dimensional Array
# Copy and Paste the output array into your deployment environment

GB_Stuct_full_print()

[array([[1, 16, 16, -1.16635048, -0.0],
        [2, 9, 5, 1.25105232, -0.1530572],
        [3, 6, 6, -1.38057607, -0.17108583],
        [4, 5, 10, 0.69772205, -0.24193923],
        [-1, -1, -2, -2.0, -1.14799149],
        [-1, -1, -2, -2.0, -0.67423124],
        [7, 8, 6, -1.26052254, -0.13437259],
        [-1, -1, -2, -2.0, -0.05493889],
        [-1, -1, -2, -2.0, -0.66368529],
        [10, 13, 5, 1.31286019, 0.04502658],
        [11, 12, 17, 1.01728868, 0.12441255],
        [-1, -1, -2, -2.0, 1.26102135],
        [-1, -1, -2, -2.0, -0.36859946],
        [14, 15, 9, 0.06901078, -0.0786073],
        [-1, -1, -2, -2.0, 0.15043275],
        [-1, -1, -2, -2.0, -1.00701398],
        [17, 24, 15, 0.0891302, 0.03080583],
        [18, 21, 1, 0.94999999, 0.12750048],
        [19, 20, 15, 0.0060567, 0.11417504],
        [-1, -1, -2, -2.0, 1.2436845],
        [-1, -1, -2, -2.0, 0.39694855],
        [22, 23, 6, -1.2538529, 0.41665471],
        [-1, -1, -2, -2.0, 0.47499957],
        [-1, -1, -2, 

In [43]:
# Sample X_train features
X_test.iloc[:10].values

array([[ 0.5       ,  0.66666667,  0.83333333,  0.02996094, -0.57481045,
        -1.2066184 , -1.36890421, -1.13416176, -2.26101846, -1.1473657 ,
         0.71910862,  0.04547898, -1.18727156,  0.        ,  1.88267978,
         0.32340609, -1.43357953, -1.35217777],
       [ 0.        ,  0.86666667,  0.33333333,  0.01458625, -0.8234796 ,
         1.11813155,  0.531943  ,  0.76548376,  1.47524155,  1.6209394 ,
         1.31793327,  0.06565602,  1.67530571,  0.        , -1.65484264,
         0.01093072,  0.30276575,  1.20009621],
       [ 0.        ,  0.83333333,  0.16666667,  0.02734473, -0.63667702,
         0.28621258,  1.89921906, -1.914838  ,  0.95989534,  1.03372316,
         0.93297457,  0.02985342, -2.02920605,  0.        , -0.10932313,
         0.17750779, -1.68357632,  1.418059  ],
       [ 0.        ,  0.6       ,  0.16666667,  0.01075153, -0.8234796 ,
        -0.86335737,  0.328519  ,  0.39827678,  0.83105879,  0.53039496,
         0.33414992,  0.10608178, -0.51372397,  0.   

In [44]:
# Learning Rate
tuned_gb.learning_rate

0.2