In [1]:
# Import dependencies.

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by the libraries
# imported below; consider narrowing it to specific categories/modules.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from config import db_password
from sqlalchemy import create_engine
import psycopg2

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Connect to PostgreSQL and load the machine-learning subset into a dataframe.

# Database connection string; the password comes from the local config module.
db_string = f'postgresql://postgres:{db_password}@127.0.0.1:5432/Sensory_Needs_Occupational_Therapy'

# Create the database engine.
engine = create_engine(db_string)

# Read the table inside a context manager so the connection is always released,
# even if the query raises. (`connection` stays bound afterwards, but is closed.)
with engine.connect() as connection:
    ML_DF = pd.read_sql('SELECT * FROM "NSCH_Data_ML_Subset"', connection)

# Remove the stored 'index' column so it is not carried into the feature set.
ML_DF = ML_DF.drop(columns='index')

# AutismInd_1920 codes that are excluded from modelling:
#   95 corresponds to a child under the age of 3.
#   99 corresponds to missing data.
#   2 corresponds to a child who was diagnosed, but no longer has the condition.
EXCLUDED_AUTISM_CODES = [95, 99, 2]

# One boolean mask instead of three successive filters; rows whose indicator is
# not one of the excluded codes are kept.
ML_DF = ML_DF[~ML_DF['AutismInd_1920'].isin(EXCLUDED_AUTISM_CODES)]

In [3]:
# Target: the AutismInd_1920 indicator column.
# Features: every remaining column except the two autism columns, so the
# label is not available to the model as an input.

y = ML_DF.loc[:, 'AutismInd_1920']
X = ML_DF.drop(columns=['autism_1920', 'AutismInd_1920'])

In [4]:
# Partition features and target into training and testing subsets.
# The split is stratified on y and seeded so it is repeatable.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Report the class counts of the training subset, then the testing subset.

for split_labels in (y_train, y_test):
    print(Counter(split_labels))

Counter({1: 45764, 3: 1474})
Counter({1: 15254, 3: 492})


In [5]:
# Fit a BalancedRandomForestClassifier (100 estimators, fixed seed) on the training split.

from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(
    random_state=1,
    n_estimators=100,
)
brfc.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [6]:
# Predict the test split with the balanced random forest and
# calculate the balanced accuracy score of those predictions.

y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8621928834340133

In [7]:
# Build and show the confusion matrix for the current predictions
# (true labels from y_test against y_pred).

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[12724,  2530],
       [   54,   438]], dtype=int64)

In [8]:
# Build the imbalanced classification report for the current predictions and print it.

brfc_report = classification_report_imbalanced(y_test, y_pred)
print(brfc_report)

                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      0.83      0.89      0.91      0.86      0.74     15254
          3       0.15      0.89      0.83      0.25      0.86      0.75       492

avg / total       0.97      0.84      0.89      0.89      0.86      0.74     15746



In [13]:
# Rank the features of the balanced random forest from most to least important.
# These rankings will be used to decide which features to focus on for the final machine learning model.

features_rank = sorted(
    zip(brfc.feature_importances_, X.columns),
    reverse=True,
)
# Each entry is an (importance, column name) pair; print it as "name: (importance)".
for importance, column_name in features_rank:
    print(f"{column_name}: ({importance})")

DevDelay_1920: (0.33475791382591297)
DevDelSev_1920: (0.192645756540032)
anxiety_1920: (0.16342976570039766)
LearnSev_1920: (0.12785378683084578)
speech_1920: (0.11686831332967591)
PlayWell_1920: (0.016815996828472367)
CalmDown_1920: (0.015724617977007825)
LearnThings_1920: (0.012090693310198701)
WorkToFin_1920: (0.01201921337963551)
UsePencil_1920: (0.00779394227782124)


In [9]:
# Fit an EasyEnsembleClassifier (100 estimators, fixed seed) on the training split.
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(
    random_state=1,
    n_estimators=100,
)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [10]:
# Predict the test split with the easy ensemble (this replaces the earlier y_pred)
# and calculate the balanced accuracy score of those predictions.
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8673385416166998

In [11]:
# Build and show the confusion matrix for the current predictions
# (true labels from y_test against y_pred).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[13005,  2249],
       [   58,   434]], dtype=int64)

In [12]:
# Build the imbalanced classification report for the current predictions and print it.
eec_report = classification_report_imbalanced(y_test, y_pred)
print(eec_report)

                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      0.85      0.88      0.92      0.87      0.75     15254
          3       0.16      0.88      0.85      0.27      0.87      0.75       492

avg / total       0.97      0.85      0.88      0.90      0.87      0.75     15746

