### Binary Classification Models

In [172]:
# Import Statements

import pandas as pd
import numpy as np
import sys
import math

from sklearn import metrics, svm
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

import statsmodels.api as sm

In [47]:
# Box Folder Location containing data files

apple_folder_loc = "~/Library/CloudStorage/Box-Box/Capstone/Capstone/Data Science Capstone/Data/FeatureSelection"
windows_folder_loc = "~/Box/Capstone/Capstone/Data Science Capstone/Data/FeatureSelection"
linux_folder_loc = ""

folder_loc = apple_folder_loc if sys.platform.startswith("darwin") else (windows_folder_loc if sys.platform.startswith("win") else linux_folder_loc)

In [50]:
# Fetch Data

# file_name = "InputDataImputed_MICE_WithRace.xlsx"
file_name = "chi10.xlsx"

data = pd.read_excel("{}/{}".format(folder_loc, file_name))
data.head(10)

Unnamed: 0,STAB,O2,BVM,ETT,SUCK,NEEDLE,ChestTube,EmergentIntubation,ICP,Craniotomy,ActivationLevel
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0,1
8,0,0,1,0,1,0,0,1,0,0,1
9,0,0,0,0,0,0,0,0,0,0,0


In [52]:
# Initializations

random_state = 265

In [53]:
# Train-Test Split

data_features, data_labels = data.iloc[:,:-1], data.ActivationLevel
X_train, X_test, y_train, y_test = train_test_split(data_features, data_labels, test_size=0.30, random_state=random_state)

### Logistic Regression

In [58]:
# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=random_state)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

In [59]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.80


In [57]:
# To get the summary report

logit_model = sm.Logit(data_labels, data_features)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.636941
         Iterations 7
                          Results: Logit
Model:               Logit             Pseudo R-squared:  -0.219   
Dependent Variable:  ActivationLevel   AIC:               6565.2032
Date:                2023-02-27 07:38  BIC:               6630.6474
No. Observations:    5138              Log-Likelihood:    -3272.6  
Df Model:            9                 LL-Null:           -2683.8  
Df Residuals:        5128              LLR p-value:       1.0000   
Converged:           1.0000            Scale:             1.0000   
No. Iterations:      7.0000                                        
-------------------------------------------------------------------
                    Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-------------------------------------------------------------------
STAB               -2.6611   0.1812 -14.6832 0.0000 -3.0164 -2.3059
O2                 -0.6587   0.0709  -9.2

### SVM

In [93]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

print('\nw = ',clf.coef_)
print('\nb = ',clf.intercept_)
print('\nIndices of support vectors = ', clf.support_)
print('\nSupport vectors = ', clf.support_vectors_)
print('\nNumber of support vectors for each class = ', clf.n_support_)
print('\nCoefficients of the support vector in the decision function = ', np.abs(clf.dual_coef_))

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

# Model Accuracy: how often is the classifier correct?
print("\n-------\n\nAccuracy:{}\n\n-------".format(metrics.accuracy_score(y_test, y_pred)))


w =  [[-2.48573266e-03  6.22745394e-04 -8.38742238e-06  2.58835919e-04
   5.99268611e-01  1.39923689e+00  1.81929097e-03  6.00131979e-01
   1.40027433e+00  1.39988937e+00]]

b =  [-1.00022917]

Indices of support vectors =  [   1   10   17 ... 3586 3589 3590]

Support vectors =  [[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 1. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 1. 0. 0.]]

Number of support vectors for each class =  [753 753]

Coefficients of the support vector in the decision function =  [[1. 1. 1. ... 1. 1. 1.]]

R-square:  -5.102598566308242

Adjusted R-square:  -0.22327905731224895

-------

Accuracy:0.7937743190661478

-------


In [168]:
#Create a svm Classifier
clf = svm.SVC(kernel='poly', degree=2) # Polynomial Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8073929961089494


In [71]:
#Create a svm Classifier
clf = svm.SVC(kernel='rbf') # RBF Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8022049286640727


In [169]:
# Grid search

parameters= {'kernel': ('linear', 'rbf', 'poly'),
             'C': [1, 10, 100]}
gridsearch = GridSearchCV(svm.SVC(), parameters)
gridsearch.fit(X_train, y_train)

print(gridsearch.best_params_)

y_pred = gridsearch.predict(X_test)
y_pred = np.round(y_pred).astype(int)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

{'C': 1, 'kernel': 'poly'}
Accuracy: 0.8035019455252919

R-square:  -1.941469771658451

Adjusted R-square:  -0.16557721498619982


### KNN

#### KNN Regressor

In [144]:
# K Neighbors Regressor

knn_model = KNeighborsRegressor(n_neighbors=45)
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)
y_pred = np.round(y_pred).astype(int)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

Accuracy: 0.7996108949416343

R-square:  -2.4661768450150934

Adjusted R-square:  -0.18865795191661938


In [145]:
# Grid search for best number of neighbors

parameters = {"n_neighbors": range(1, 50)}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)

print(gridsearch.best_params_)

y_pred = gridsearch.predict(X_test)
y_pred = np.round(y_pred).astype(int)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

{'n_neighbors': 45}
Accuracy: 0.7996108949416343

R-square:  -2.4661768450150934

Adjusted R-square:  -0.18865795191661938


In [146]:
# Grid Search for best weight parameter

parameters = {
    "n_neighbors": range(1, 50),
    "weights": ["uniform", "distance"],
}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)

print(gridsearch.best_params_)

y_pred = gridsearch.predict(X_test)
y_pred = np.round(y_pred).astype(int)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

{'n_neighbors': 45, 'weights': 'uniform'}
Accuracy: 0.7996108949416343

R-square:  -2.4661768450150934

Adjusted R-square:  -0.18865795191661938


In [147]:
# Bagging Regressor

best_k = gridsearch.best_params_["n_neighbors"]
best_weights = gridsearch.best_params_["weights"]
bagged_knn = KNeighborsRegressor(
    n_neighbors=best_k, weights=best_weights
)

bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)
bagging_model.fit(X_train, y_train)

test_preds_grid = bagging_model.predict(X_test)
y_pred = np.round(test_preds_grid).astype(int)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

Accuracy: 0.8041504539559015

R-square:  -3.3998866213151935

Adjusted R-square:  -0.16173042549779626


#### KNN Classifier

In [153]:
# K Neighbors Classifier

classifier = KNeighborsClassifier(n_neighbors = 2)
classifier.fit(X_train, y_train)

print("{} Features used during classification: {}".format(classifier.n_features_in_, classifier.feature_names_in_))

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Model Accuracy: how often is the classifier correct?
print("\nAccuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

10 Features used during classification: ['STAB' 'O2' 'BVM' 'ETT' 'SUCK' 'NEEDLE' 'ChestTube' 'EmergentIntubation'
 'ICP' 'Craniotomy']

Accuracy: 0.7931258106355382

R-square:  -4.2791783380018655

Adjusted R-square:  -0.2271258468006525


In [154]:
# Grid search for best number of neighbors and weights

parameters = {
    "n_neighbors": range(1, 50),
    "weights": ["uniform", "distance"],
}
gridsearch = GridSearchCV(KNeighborsClassifier(), parameters)
gridsearch.fit(X_train, y_train)

print("{} Features used during classification: {}".format(gridsearch.n_features_in_, gridsearch.feature_names_in_))

print(gridsearch.best_params_)

y_pred = gridsearch.predict(X_test)
y_pred = np.round(y_pred).astype(int)

# Model Accuracy: how often is the classifier correct?
print("\nAccuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

10 Features used during classification: ['STAB' 'O2' 'BVM' 'ETT' 'SUCK' 'NEEDLE' 'ChestTube' 'EmergentIntubation'
 'ICP' 'Craniotomy']
{'n_neighbors': 3, 'weights': 'uniform'}
Accuracy: 0.7937743190661478

R-square:  -3.1925102599179205

Adjusted R-square:  -0.22327905731224895


In [155]:
# Bagging Classifier

best_k = gridsearch.best_params_["n_neighbors"]
best_weights = gridsearch.best_params_["weights"]
bagged_knn = KNeighborsClassifier(
    n_neighbors=best_k, weights=best_weights
)

bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)
bagging_model.fit(X_train, y_train)

print("{} Features used during classification: {}".format(bagging_model.n_features_in_, bagging_model.feature_names_in_))

test_preds_grid = bagging_model.predict(X_test)
y_pred = np.round(test_preds_grid).astype(int)

# Model Accuracy: how often is the classifier correct?
print("\nAccuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))

10 Features used during classification: ['STAB' 'O2' 'BVM' 'ETT' 'SUCK' 'NEEDLE' 'ChestTube' 'EmergentIntubation'
 'ICP' 'Craniotomy']

Accuracy: 0.8015564202334631

R-square:  -2.0998843748357587

Adjusted R-square:  -0.17711758345140938


### RandomForest

In [160]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = random_state)

# Train the model on training data
rf.fit(X_train, y_train)

# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
y_pred = np.round(predictions).astype(int)

# Model Accuracy: how often is the classifier correct?
print("\nAccuracy:",metrics.accuracy_score(y_test, y_pred))

print('\nR-square: ', metrics.r2_score(y_pred, y_test))
print('\nAdjusted R-square: ', 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1)))


Accuracy: 0.8002594033722439

R-square:  -1.9653846153846155

Adjusted R-square:  -0.18481116242821627


In [162]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(data_features.columns, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: ChestTube            Importance: 0.29
Variable: EmergentIntubation   Importance: 0.21
Variable: O2                   Importance: 0.11
Variable: STAB                 Importance: 0.1
Variable: BVM                  Importance: 0.07
Variable: SUCK                 Importance: 0.07
Variable: ETT                  Importance: 0.05
Variable: ICP                  Importance: 0.05
Variable: NEEDLE               Importance: 0.03
Variable: Craniotomy           Importance: 0.03


In [173]:
# Define the model
model = RandomForestClassifier()

# Evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=random_state)
n_scores = cross_val_score(model, data_features, data_labels, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

# Report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Accuracy: 0.803 (0.015)


In [176]:
# Exploring number of Samples

# get a list of models to evaluate
def get_models():
    models = dict()
    # explore ratios from 10% to 100% in 10% increments
    for i in np.arange(0.1, 1.1, 0.1):
        key = '%.1f' % i
        # set max_samples=None to use 100%
        if i == 1.0:
            i = None
        models[key] = RandomForestClassifier(max_samples=i)
    return models
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    # define the evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate the model and collect the results
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores
 
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    # evaluate the model
    scores = evaluate_model(model, data_features, data_labels)
    # store the results
    results.append(scores)
    names.append(name)
    # summarize the performance along the way
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

>0.1 0.804 (0.012)
>0.2 0.804 (0.013)
>0.3 0.803 (0.013)
>0.4 0.803 (0.013)
>0.5 0.803 (0.013)
>0.6 0.804 (0.013)
>0.7 0.803 (0.013)
>0.8 0.803 (0.013)
>0.9 0.802 (0.013)
>1.0 0.804 (0.013)


### Naive-Bayes