In [1]:
# Import dependencies.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load the csv into a pandas dataframe, then discard every row whose
# AutismInd_1920 value is one of the codes that cannot be used as a label:
#   95 - the child is under the age of 3.
#   99 - the data is missing.
#   2  - the child was diagnosed, but no longer has the condition.

df = pd.read_csv('../Data/Main_Sean_File_Numeric_2019-2020 NSCH_Topical_CAHMI_DRC_1920.csv')

# A single membership mask replaces three successive inequality filters.
unusable_label = df['AutismInd_1920'].isin([95, 99, 2])
df = df[~unusable_label]

In [3]:
# Features (X): everything except the columns that directly mention an autism
# diagnosis, severity, treatment, or medication.
# Target (y): the AutismInd_1920 column, so the model predicts a diagnosis.

autism_columns = [
    'autism_1920',
    'AutismSev_1920',
    'AutismInd_1920',
    'ASDSevInd_1920',
    'ASDMed_1920',
    'ASDBehTreat_1920',
    'ASDAge_1920',
    'ASDDrType_1920',
    'nom17_3ASD_1920',
]

# `columns=` already names the axis, so no separate axis argument is needed.
X = df.drop(columns=autism_columns)
y = df['AutismInd_1920']

In [4]:
# Partition X and y into training and testing sets.
# The split is stratified on y and seeded with random_state=1 so it is repeatable.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Show the class counts of the training labels, then of the testing labels.

for labels in (y_train, y_test):
    print(Counter(labels))

Counter({1: 45764, 3: 1474})
Counter({1: 15254, 3: 492})


In [5]:
# Fit a BalancedRandomForestClassifier on the training data.
# n_estimators=100 sets the number of trees; random_state=1 seeds the fit.

from imblearn.ensemble import BalancedRandomForestClassifier

forest_params = {'n_estimators': 100, 'random_state': 1}
brfc = BalancedRandomForestClassifier(**forest_params)
brfc.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [6]:
# Predict labels for the test set, then calculate the balanced accuracy score
# of those predictions against the true test labels.

y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9226404429705763

In [7]:
# Build the confusion matrix of the true test labels against the predicted
# labels, and display it as the cell output.

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[13545,  1709],
       [   21,   471]], dtype=int64)

In [8]:
# Generate the imbalanced classification report for the test predictions and print it.

imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      0.89      0.96      0.94      0.92      0.84     15254
          3       0.22      0.96      0.89      0.35      0.92      0.86       492

avg / total       0.97      0.89      0.96      0.92      0.92      0.84     15746



In [9]:
# Pair every feature importance with its column name and order the pairs from
# the highest importance to the lowest.
# These rankings will be used to determine what features to focus on for the final machine learning model.

features_rank = list(zip(brfc.feature_importances_, X.columns))
features_rank.sort(reverse=True)

# Each entry is an (importance, column name) pair.
for importance, column_name in features_rank:
    print(f"{column_name}: ({importance})")

MEDB10ScrQ5_1920: (0.06506409220589073)
CondCnt_1920: (0.060036165293049684)
Cond2more_1920: (0.03401314849138369)
CSHCN_1920: (0.03363869870607571)
qualnum_1920: (0.0325121412027496)
SpSerAge_1920: (0.03105146327365146)
CSHCNComplex_1920: (0.029083486663111825)
qualnum4_1920: (0.028753779974281215)
mhneeds_1920: (0.028127465571766694)
therapy_1920: (0.025692620350377594)
DevDelay_1920: (0.025502427220044227)
SpEdAge_1920: (0.025302930086014575)
CSHCNtype_1920: (0.024116117384432572)
nom17_1CSHCN_1920: (0.02170320942788651)
DevDelSev_1920: (0.0208895801326926)
MakeFriend_1920: (0.018243803683273625)
DiffCare_1920: (0.01769034390502826)
npm11MHCSHCN_1920: (0.016853043692685907)
nom17_2sysCSHCN_1920: (0.01480451907813766)
npm11MHnonCSHCN_1920: (0.013357886695118419)
serve_1920: (0.013277798030574307)
mhealth_1920: (0.013018511763182217)
SpEducPln_1920: (0.012844689282955464)
behavior_1920: (0.010455473166133164)
func_1920: (0.010369523779779082)
DiffMem_1920: (0.009510194819560058)
Diff2