In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

https://www.childhealthdata.org/docs/default-source/nsch-docs/2020-2021-nsch-content-map_child-and-family-health-measures_cahmi_8-1-22.pdf?sfvrsn=87175c17_2

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Include target in column list???  "HEIGHT", "WEIGHT",
# This is based on a subsection of the Survey - C. Health Care Services: Health Care Visits,Height and Weight,....
# Developmental Concerns & Screening, Usual Source of Care, Vision Testing, Dental Health Care, Mental Health Care and Other Types of Care, ....
# Forgone Health Care, ER Use, Education & Developmetnal Services, Care Coordination.

columns = [
"K2Q35A",
"S4Q01",
"DOCPRIVATE",
"K4Q20R",
"DOCROOM",
"WGTCONC",
"OVERWEIGHT",
"K6Q10",
"K6Q12",
"K6Q13A",
"K6Q13B",
"K6Q14A",
"K6Q14B",
"K4Q01",
"K4Q02_R",
"USUALGO",
"USUALSICK",
"K4Q31_R",
"K4Q32X01",
"K4Q32X02",
"K4Q32X03",
"K4Q32X04",
"K4Q32X05",
"K4Q30_R",
"DENTISTVISIT",
"DENTALSERV1",
"DENTALSERV2",
"DENTALSERV3",
"DENTALSERV4",
"DENTALSERV5",
"DENTALSERV6",
"DENTALSERV7",
"K4Q22_R",
"TREATNEED",
"K4Q23",
"K4Q24_R",
"K4Q26",
"ALTHEALTH",
"K4Q27",
"K4Q28X01",
"K4Q28X02",
"K4Q28X03",
"K4Q28X04",
"K4Q28X05",
"K4Q28X_EAR",
"NOTELIG",
"AVAILABLE",
"APPOINTMENT",
"TRANSPORTCC",
"NOTOPEN",
"ISSUECOST",
"C4Q04",
"HOSPITALER",
"HOSPITALSTAY",
"K6Q15",
"SESPLANYR",
"SESPLANMO",
"SESCURRSVC",
"K4Q36",
"K4Q37",
"K4Q38",
]

target = ["K2Q35A"]

In [5]:
# Load the data 'C:/Users/tligh/Desktop/Sensory_Needs_Occupational_Therapy/Data/2019-2020 NSCH_Topical_CAHMI_DRC_Original.csv'
file_path = Path('../../Data/2019-2020 NSCH_Topical_CAHMI_DRC_Original.csv')
#file_path = Path('2019-2020 NSCH_Topical_CAHMI_DRC_Original.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['K2Q35A'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
#df['int_rate'] = df['int_rate'].str.replace('%', '')
#df['int_rate'] = df['int_rate'].astype('float') / 100

######
# Convert the target column values to mild_severity, moderate_severity, severe_severity
# mild_severity, moderate_severity and severe_severity are based on their values
# Code = 1 Mild, 2 Moderate, 3 Severe

# mild_severity
#x = {'K2Q35C': '1'}   
#df = df.replace(x)

# moderate_severity is
#x = {'K2Q35C': '2'}   
#df = df.replace(x)

# severe_severity
#x = {'K2Q35C': '3'}   
#df = df.replace(x)

#df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,K2Q35A,S4Q01,DOCPRIVATE,K4Q20R,DOCROOM,WGTCONC,OVERWEIGHT,K6Q10,K6Q12,K6Q13A,...,C4Q04,HOSPITALER,HOSPITALSTAY,K6Q15,SESPLANYR,SESPLANMO,SESCURRSVC,K4Q36,K4Q37,K4Q38
0,2,2,95,95,95,3,2,90,90,90,...,1,1,2,2,95,95,95,2,95,95
1,2,1,90,1,2,3,2,2,2,95,...,1,1,2,2,95,95,95,2,95,95
2,2,1,1,2,1,3,2,90,90,90,...,1,1,2,2,95,95,95,2,95,95
3,2,1,1,2,2,3,2,90,90,90,...,2,1,2,2,95,95,95,2,95,95
4,2,1,90,2,2,1,2,1,2,95,...,1,1,2,2,95,95,95,2,95,95


# Split the Data into Training and Testing

In [6]:
# Define the Features Set
X = df.copy()
X = df.drop(columns="K2Q35A")
X = pd.get_dummies(X)

In [7]:
X.describe()

Unnamed: 0,S4Q01,DOCPRIVATE,K4Q20R,DOCROOM,WGTCONC,OVERWEIGHT,K6Q10,K6Q12,K6Q13A,K6Q13B,...,C4Q04,HOSPITALER,HOSPITALSTAY,K6Q15,SESPLANYR,SESPLANMO,SESCURRSVC,K4Q36,K4Q37,K4Q38
count,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,...,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0,72210.0
mean,1.27438,60.435854,15.1846,15.150796,3.20727,2.311522,65.206218,67.705526,89.564354,89.564437,...,1.732225,1.563232,2.297189,2.291691,81.837723,93.469686,81.033375,2.929179,79.864866,79.190832
std,3.709259,42.204461,32.141317,32.447697,6.117791,6.070185,39.699576,38.575679,12.717787,12.717211,...,7.184525,6.096515,5.645145,6.552453,31.771912,13.272445,33.394697,10.290459,33.898251,35.100251
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,1.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,90.0,90.0,...,1.0,1.0,2.0,2.0,95.0,95.0,95.0,2.0,95.0,95.0
50%,1.0,90.0,2.0,2.0,3.0,2.0,90.0,90.0,90.0,90.0,...,1.0,1.0,2.0,2.0,95.0,95.0,95.0,2.0,95.0,95.0
75%,1.0,90.0,3.0,3.0,3.0,2.0,90.0,90.0,95.0,95.0,...,1.0,1.0,2.0,2.0,95.0,95.0,95.0,2.0,95.0,95.0
max,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0


In [8]:
# Create our target
y = df[target]

In [9]:
# Check the balance of our target values
y.value_counts()

K2Q35A
2         69784
1          2136
99          290
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

The next three set of cells came from 17.7.2 Predict Loan Application Approval

In [11]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(54157, 60)
(18053, 60)
(54157, 1)
(18053, 1)


In [12]:
# Splitting into Train and Test sets into an 80/20 split.- Can choose this option
# X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [13]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

17.7.3 Make Predictions and Evaluate Results

In [14]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [15]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [16]:
# Evaluate the Model
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
#cm_df = pd.DataFrame(
    #cm, index=(["Actual 0", "Actual 1"], columns=(["Predicted 0", "Predicted 1"]))

cm

array([[  165,   352,     2],
       [  487, 16871,   104],
       [    6,    65,     1]], dtype=int64)

In [17]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.9437212651636847

In [18]:
#Calculating the balanced accuracy scored
bal_acc_score = balanced_accuracy_score(y_test, predictions)
bal_acc_score 

0.43265434787827256

In [19]:
# Displaying results
print("Confusion Matrix")
display(cm)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


array([[  165,   352,     2],
       [  487, 16871,   104],
       [    6,    65,     1]], dtype=int64)

Accuracy Score : 0.9437212651636847
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.25      0.32      0.97      0.28      0.56      0.29       519
          2       0.98      0.97      0.29      0.97      0.53      0.30     17462
         99       0.01      0.01      0.99      0.01      0.12      0.01        72

avg / total       0.95      0.94      0.32      0.95      0.53      0.30     18053



# Ensemble Learners 17.8.1

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [20]:
pip install -U imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
Note: you may need to restart the kernel to use updated packages.


In [21]:
# Resample the training data with the BalancedRandomForestClassifier
# YOUR CODE HERE
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [22]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6199755932084695

In [23]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 462,   23,   34],
       [2326, 9658, 5478],
       [  16,   26,   30]], dtype=int64)

In [24]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.16      0.89      0.87      0.28      0.88      0.77       519
          2       0.99      0.55      0.92      0.71      0.71      0.49     17462
         99       0.01      0.42      0.69      0.01      0.54      0.28        72

avg / total       0.97      0.56      0.91      0.70      0.72      0.50     18053



In [25]:
# List the features sorted in descending order by feature importance
# YOUR CODE HERE
importances = brf.feature_importances_
columns = X.columns

df = pd.DataFrame({"feature":columns, "importance":importances}).sort_values("importance", ascending = False).reset_index(drop=True)
df.head()

Unnamed: 0,feature,importance
0,K4Q37,0.060135
1,K4Q36,0.057587
2,K4Q38,0.049408
3,SESCURRSVC,0.046499
4,SESPLANYR,0.044732


### Easy Ensemble AdaBoost Classifier

In [26]:
# Train the EasyEnsembleClassifier
# YOUR CODE HERE
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [27]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6512012621540793

In [28]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 434,   29,   56],
       [2036, 9083, 6343],
       [  12,   17,   43]], dtype=int64)

In [29]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.17      0.84      0.88      0.29      0.86      0.74       519
          2       0.99      0.52      0.92      0.68      0.69      0.46     17462
         99       0.01      0.60      0.64      0.01      0.62      0.38        72

avg / total       0.97      0.53      0.92      0.67      0.70      0.47     18053

