### Binary Classification Models

In [1]:
# Import Statements

import pandas as pd
import numpy as np
import sys
import math
import warnings
import joblib
import os
import glob

from sklearn import metrics, svm
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import statsmodels.api as sm

from imblearn.over_sampling import RandomOverSampler 

import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

### Initializations

In [2]:
# Box Folder Location containing data files

home_dir = os.path.expanduser('~')

apple_folder_loc = home_dir + "/Library/CloudStorage/Box-Box/Capstone/Capstone/Data Science Capstone/Data"
windows_folder_loc = home_dir + "~/Box/Capstone/Capstone/Data Science Capstone/Data"
linux_folder_loc = ""

data_folder_loc = apple_folder_loc if sys.platform.startswith("darwin") else (windows_folder_loc if sys.platform.startswith("win") else linux_folder_loc)

In [3]:
folder_to_read_for_input_files = "FeatureSelection"
folder_to_save_model = "Models"
random_state = 265
reporting_df = pd.DataFrame(columns=['File Name', 'Model', 'Accuracy', 'Precision_Recall', 'R-Square', 'Adjusted-R-Square'])

### Functions to automate

In [4]:
def load_dataset(file_path):
    
    print("\nLoading File {}\n".format(file_path.split("/")[-1]))
    
    data = pd.read_excel(file_path)
    return data

In [5]:
def perform_train_test_split(X, Y, test_size=0.30, random_state=1):
    
    print("Performing {}/{} Train-Test Split".format((1-test_size) * 100, test_size * 100))
    return train_test_split(X, Y, test_size=test_size, random_state=random_state)

In [6]:
def get_classification_report(X_test, y_test, y_pred):
    
    print("\nFinding Classification Report\n")
    
    r_sq = metrics.r2_score(y_pred, y_test)
    adj_r_sq = 1 - (1 - metrics.r2_score(y_test, y_pred)) * ((len(X_test) - 1) / (len(X_test) - X_test.shape[1] - 1))
    
    report = metrics.classification_report(y_test, y_pred, output_dict=True)
    report['R-Square'] = r_sq
    report['Adjusted-R-Square'] = adj_r_sq
    
    return report

In [7]:
def save_model(model, file_name, model_name):
    
    print("\nSaving model {} \n".format(model_name))
    
    joblib.dump(model, "{}/{}/{}_{}".format(data_folder_loc, folder_to_save_model, file_name.split(".xlsx")[0], model_name))

In [8]:
def load_model(model, model_name):
    
    print("\nLoading Model {}\n", model_name)
    
    return joblib.load("{}/{}/{}".format(data_folder_loc, folder_to_save_model, model_name))

In [9]:
def make_report(X_test, y_test, y_pred, model, model_name, file_name, reporting_df):
    classification_report = get_classification_report(X_test, y_test, y_pred)
    reporting_df = reporting_df.append(
        {
            'File Name': file_name,
            'Model': model_name,
            'Accuracy': classification_report['accuracy'],
            'Precision_Recall': {'0': classification_report['0'], '1': classification_report['1']},
            'R-Square': classification_report['R-Square'],
            'Adjusted-R-Square': classification_report['Adjusted-R-Square']
        },
        ignore_index=True
    )
    save_model(model, file_name, model_name)
    return reporting_df

### OverSampling

In [10]:
def perform_oversampling():
    
    print("\nPerforming Oversampling since data labels are distributed as follows:\n")
    data_labels.value_counts()
    
    return RandomOverSampler().fit_resample(data_features, data_labels)

### Stratified Sampling with Cross Validation

In [11]:
def perform_stratified_sampling(X, y, model, n_splits=10, n_repeats=3, n_jobs=-1):
    
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=n_jobs, error_score='raise')

    print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

### Bagging Regressor

In [12]:
def perform_bagging(X_train, y_train, X_test, y_test, model, n_estimators, file_name, reporting_df):
    
    print("\nPerforming Bagging Regression\n")
    
    bagging_model = BaggingRegressor(model, n_estimators=n_estimators)
    bagging_model.fit(X_train, y_train)

    test_preds_grid = bagging_model.predict(X_test)
    y_pred = np.round(test_preds_grid).astype(int)
    
    return make_report(X_test, y_test, y_pred, bagging_model, "Bagging Regressor", file_name, reporting_df)

### Logistic Regression

In [13]:
def perform_logistic_regression(X_train, y_train, X_test, y_test, file_name, reporting_df):
    
    print("\nPerforming Logistic Regression\n")
    
    logreg = LogisticRegression(random_state=random_state)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    logit_model = sm.Logit(data_labels, data_features)
    result = logit_model.fit()
    print("\nSummary for Logit Model:\n\n", result.summary2())
    
    return make_report(X_test, y_test, y_pred, logreg, "Logistic Regression", file_name, reporting_df)

### SVM

In [14]:
def perform_svm(X_train, y_train, X_test, y_test, file_name, reporting_df):
    
    print("\nPerforming SVM Classification\n")
    
    parameters= {'kernel': ('linear', 'rbf', 'poly'), 'C': [1, 10, 100], 'degree': [2]}
    
    gridsearch = GridSearchCV(svm.SVC(), parameters)
    gridsearch.fit(X_train, y_train)
    
    best_kernel, best_c = gridsearch.best_params_['kernel'], gridsearch.best_params_['C']
    print(gridsearch.best_params_)
    best_model = svm.SVC(kernel=best_kernel, C=best_c)
    
    if best_kernel == 'linear':
        
        print('\nw = ',gridsearch.coef_)
        print('\nb = ',gridsearch.intercept_)
        print('\nIndices of support vectors = ', gridsearch.support_)
        print('\nSupport vectors = ', gridsearch.support_vectors_)
        print('\nNumber of support vectors for each class = ', gridsearch.n_support_)
        print('\nCoefficients of the support vector in the decision function = ', np.abs(gridsearch.dual_coef_))        
    
    elif best_kernel == 'poly':
        best_degree = gridsearch.best_params_['degree']
        best_model = svm.SVC(kernel=best_kernel, C=best_c, degree=best_degree)
    
    best_model.fit(X_train, y_train)
    
    y_pred = best_model.predict(X_test)
    y_pred = np.round(y_pred).astype(int)
    
    return make_report(X_test, y_test, y_pred, best_model, "SVM", file_name, reporting_df)

### KNN

#### KNN Regressor

In [15]:
def perform_knn_regressor(X_train, y_train, X_test, y_test, file_name, reporting_df):
    
    print("\nPerforming KNN Regression\n")
    
    parameters = {"n_neighbors": range(1, 50), "weights": ["uniform", "distance"]}
    gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
    gridsearch.fit(X_train, y_train)
    
    best_n_neighbors, best_weight_technique = gridsearch.best_params_['n_neighbors'], gridsearch.best_params_['weights']
    print(gridsearch.best_params_)
    best_model = KNeighborsRegressor(n_neighbors=best_n_neighbors, weights=best_weight_technique)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)
    y_pred = np.round(y_pred).astype(int)
    
    return make_report(X_test, y_test, y_pred, best_model, "KNN Regressor", file_name, reporting_df)

#### KNN Classifier

In [16]:
def perform_knn_classifier(X_train, y_train, X_test, y_test, file_name, reporting_df):
    
    print("\nPerforming KNN Classification\n")
    
    parameters = {"n_neighbors": range(1, 50), "weights": ["uniform", "distance"]}
    
    gridsearch = GridSearchCV(KNeighborsClassifier(), parameters)
    gridsearch.fit(X_train, y_train)

    print("{} Features used during classification: {}".format(gridsearch.n_features_in_, gridsearch.feature_names_in_))

    best_n_neighbors, best_weight_technique = gridsearch.best_params_['n_neighbors'], gridsearch.best_params_['weights']
    print(gridsearch.best_params_)
    best_model = KNeighborsClassifier(n_neighbors=best_n_neighbors, weights=best_weight_technique)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)
    y_pred = np.round(y_pred).astype(int)
    
    return make_report(X_test, y_test, y_pred, best_model, "KNN Classifier", file_name, reporting_df)

### RandomForest

#### Random Forest Regressor

In [17]:
def perform_random_forest_regressor(X_train, y_train, X_test, y_test, file_name, reporting_df, n_estimators=1000):
    
    print("\nPerforming Random Forest Regression\n")
    
    rf = RandomForestRegressor(n_estimators = n_estimators, random_state = random_state)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    y_pred = np.round(predictions).astype(int)
    
    # Get numerical feature importances
    importances = list(rf.feature_importances_)

    # List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train.columns, importances)]

    # Sort the feature importances by most important first
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

    # Print out the feature and importances 
    [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

    return make_report(X_test, y_test, y_pred, rf, "RandomForest Regressor", file_name, reporting_df)

#### Random Forest Classifier

In [18]:
def perform_random_forest_classifier(data_features, data_labels):
    
    print("\nPerforming Random Forest Classification\n")

    # get a list of models to evaluate
    def get_models():
        models = dict()
        # explore ratios from 10% to 100% in 10% increments
        for i in np.arange(0.1, 1.1, 0.1):
            key = '%.1f' % i
            # set max_samples=None to use 100%
            if i == 1.0:
                i = None
            models[key] = RandomForestClassifier(max_samples=i)
        return models
    
    models = get_models()
    for name, model in models.items():
        print('\n>{}'.format(name))
        perform_stratified_sampling(data_features, data_labels, model)

### Gaussian Naive-Bayes

In [19]:
def perform_gaussian_naive_bayes(X_train, y_train, X_test, y_test, file_name, reporting_df):
    
    print("\nPerforming Gaussian Naive Bayes\n")
    
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)

    y_pred = gnb.predict(X_test)
    
    return make_report(X_test, y_test, y_pred, gnb, "Gaussian Naive Bayes", file_name, reporting_df)

### Read Data Files (xlsx) and run models

In [20]:
files = glob.glob("{}/{}/*.xlsx".format(data_folder_loc, folder_to_read_for_input_files))

for file_path in files:
    
    file_name = file_path.split("/")[-1]
    data = load_dataset(file_path)
    data_features, data_labels = data.drop('ActivationLevel', axis=1), data.ActivationLevel
    X_train, X_test, y_train, y_test = perform_train_test_split(data_features, data_labels)
    print("Shape of data: X_train: {}, y_train: {}, X_test: {}, y_test: {}".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    
    reporting_df = perform_logistic_regression(X_train, y_train, X_test, y_test, file_name, reporting_df)
    reporting_df = perform_svm(X_train, y_train, X_test, y_test, file_name, reporting_df)
    reporting_df = perform_knn_regressor(X_train, y_train, X_test, y_test, file_name, reporting_df)
    reporting_df = perform_knn_classifier(X_train, y_train, X_test, y_test, file_name, reporting_df)
    reporting_df = perform_random_forest_regressor(X_train, y_train, X_test, y_test, file_name, reporting_df)
    perform_random_forest_classifier(data_features, data_labels)
    reporting_df = perform_gaussian_naive_bayes(X_train, y_train, X_test, y_test, file_name, reporting_df)
            
reporting_df


Loading File RFE15.xlsx

Performing 70.0/30.0 Train-Test Split
Shape of data: X_train: (3596, 20), y_train: (3596,), X_test: (1542, 20), y_test: (1542,)

Performing Logistic Regression

         Current function value: 0.445712
         Iterations: 35

Summary for Logit Model:

                                   Results: Logit
Model:                     Logit                 Pseudo R-squared:      0.147      
Dependent Variable:        ActivationLevel       AIC:                   4620.1414  
Date:                      2023-03-01 03:43      BIC:                   4751.0298  
No. Observations:          5138                  Log-Likelihood:        -2290.1    
Df Model:                  19                    LL-Null:               -2683.8    
Df Residuals:              5118                  LLR p-value:           9.7172e-155
Converged:                 0.0000                Scale:                 1.0000     
No. Iterations:            35.0000                                                

Variable: ChestTube            Importance: 0.22
Variable: EmergentIntubation   Importance: 0.14
Variable: O2                   Importance: 0.09
Variable: SPINE                Importance: 0.06
Variable: BVM                  Importance: 0.06
Variable: BvsPIdx_Penetrating  Importance: 0.06
Variable: STAB                 Importance: 0.05
Variable: IO                   Importance: 0.05
Variable: ETT                  Importance: 0.05
Variable: SUCK                 Importance: 0.05
Variable: ICP                  Importance: 0.04
Variable: Craniotomy           Importance: 0.04
Variable: LMA                  Importance: 0.03
Variable: NEEDLE               Importance: 0.03
Variable: ORAL                 Importance: 0.02

Finding Classification Report


Saving model RandomForest Regressor 


Performing Random Forest Classification


>0.1
Accuracy: 0.806 (0.012)

>0.2
Accuracy: 0.807 (0.012)

>0.3
Accuracy: 0.807 (0.014)

>0.4
Accuracy: 0.806 (0.015)

>0.5
Accuracy: 0.806 (0.013)

>0.6
Accuracy: 0

Accuracy: 0.803 (0.014)

>1.0
Accuracy: 0.802 (0.014)

Performing Gaussian Naive Bayes


Finding Classification Report


Saving model Gaussian Naive Bayes 



Unnamed: 0,File Name,Model,Accuracy,Precision_Recall,R-Square,Adjusted-R-Square
0,RFE15.xlsx,Logistic Regression,0.820363,"{'0': {'precision': 0.8346344925479063, 'recal...",-1.2793,-0.101595
1,RFE15.xlsx,SVM,0.815824,"{'0': {'precision': 0.8192360163710778, 'recal...",-2.930567,-0.129433
2,RFE15.xlsx,KNN Regressor,0.813878,"{'0': {'precision': 0.8223911541119557, 'recal...",-2.219394,-0.141364
3,RFE15.xlsx,KNN Classifier,0.815824,"{'0': {'precision': 0.8277310924369747, 'recal...",-1.690108,-0.129433
4,RFE15.xlsx,RandomForest Regressor,0.819715,"{'0': {'precision': 0.8478581979320532, 'recal...",-0.684041,-0.105572
5,RFE15.xlsx,Gaussian Naive Bayes,0.789883,"{'0': {'precision': 0.845679012345679, 'recall...",-0.567073,-0.288508
6,chi15.xlsx,Logistic Regression,0.817121,"{'0': {'precision': 0.8307475317348378, 'recal...",-1.473065,-0.117805
7,chi15.xlsx,SVM,0.820363,"{'0': {'precision': 0.8304134548002803, 'recal...",-1.602809,-0.097985
8,chi15.xlsx,KNN Regressor,0.813878,"{'0': {'precision': 0.8188653451811346, 'recal...",-2.829084,-0.137624
9,chi15.xlsx,KNN Classifier,0.815824,"{'0': {'precision': 0.8205479452054795, 'recal...",-2.657935,-0.125732


In [21]:
reporting_df.to_excel("{}/{}/{}".format(data_folder_loc, folder_to_save_model, 'ModelOutput.xlsx'), index=False)

### T-SNE

In [22]:
# # That's an impressive list of imports.
# import numpy as np
# from numpy import linalg
# from numpy.linalg import norm
# from scipy.spatial.distance import squareform, pdist

# # We import sklearn.
# import sklearn
# from sklearn.manifold import TSNE
# from sklearn.datasets import load_digits
# from sklearn.preprocessing import scale

# # We'll hack a bit with the t-SNE code in sklearn 0.15.2.
# from sklearn.metrics.pairwise import pairwise_distances
# from sklearn.manifold._t_sne import (_joint_probabilities,
#                                     _kl_divergence)
# # from sklearn.utils.extmath import _ravel
# # Random state.
# RS = 20150101

# # We'll use matplotlib for graphics.
# import matplotlib.pyplot as plt
# import matplotlib.patheffects as PathEffects
# import matplotlib
# %matplotlib inline

# # We import seaborn to make nice plots.
# import seaborn as sns
# sns.set_style('darkgrid')
# sns.set_palette('muted')
# sns.set_context("notebook", font_scale=1.5,
#                 rc={"lines.linewidth": 2.5})

# # We first reorder the data points according to the handwritten numbers.
# X = np.vstack([all_vars_data_dupl[all_vars_data_dupl.FinalNFTI==i].drop(['FinalNFTI', 'MRN', 'LastName', 'EDArrival', 'ISS', 'PHHoTN', 'FinalIntwithin3', 'EDHoTN', 'TFIR', 'ICP', 'Craniotomy'], axis=1)
#                for i in range(2)])
# y = np.hstack([all_vars_data_dupl[all_vars_data_dupl.FinalNFTI==i].FinalNFTI
#                for i in range(2)])

# digits_proj = TSNE(random_state=RS).fit_transform(X)

# def scatter(x, colors):
#     # We choose a color palette with seaborn.
#     palette = np.array(sns.color_palette("hls", 2))

#     # We create a scatter plot.
#     f = plt.figure(figsize=(8, 8))
#     ax = plt.subplot(aspect='equal')
#     sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40,
#                     c=palette[colors.astype(np.int)])
#     plt.xlim(-25, 25)
#     plt.ylim(-25, 25)
#     ax.axis('off')
#     ax.axis('tight')

#     # We add the labels for each digit.
#     txts = []
#     for i in range(2):
#         # Position of each label.
#         xtext, ytext = np.median(x[colors == i, :], axis=0)
#         txt = ax.text(xtext, ytext, str(i), fontsize=24)
#         txt.set_path_effects([
#             PathEffects.Stroke(linewidth=5, foreground="w"),
#             PathEffects.Normal()])
#         txts.append(txt)

#     return f, ax, sc, txts

# scatter(digits_proj, y)
# # plt.savefig('images/digits_tsne-generated.png', dpi=120)