In [1]:
# import dependencies
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
import warnings
# NOTE(review): this installs a blanket 'ignore' filter for every warning
# category, so any solver-convergence or deprecation warnings emitted by the
# model cells further down will not be shown. Consider narrowing the filter
# to specific categories if those warnings matter.
warnings.filterwarnings('ignore')

In [2]:
# Read the source CSV (located relative to the notebook's working directory)
# into a DataFrame, then preview the first ten rows as the cell output.
file_path = os.path.relpath('./mhcld-puf-2020-csv.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=10)

Unnamed: 0,YEAR,AGE,EDUC,ETHNIC,RACE,GENDER,SPHSERVICE,CMPSERVICE,OPISERVICE,RTCSERVICE,...,ODDFLG,PDDFLG,PERSONFLG,SCHIZOFLG,ALCSUBFLG,OTHERDISFLG,STATEFIP,DIVISION,REGION,CASEID
0,2020,14,-9,4,3,1,1,1,1,2,...,0,0,0,1,0,0,1,6,3,20200000001
1,2020,1,2,4,1,1,2,1,2,2,...,0,0,0,0,0,1,1,6,3,20200000002
2,2020,6,-9,4,4,2,2,1,1,2,...,0,0,0,0,0,0,1,6,3,20200000003
3,2020,14,-9,4,3,2,1,1,2,2,...,0,0,0,1,0,0,1,6,3,20200000004
4,2020,13,-9,4,3,2,1,1,1,2,...,0,0,1,1,0,1,1,6,3,20200000005
5,2020,12,2,4,2,1,2,1,2,2,...,0,0,0,0,0,0,1,6,3,20200000006
6,2020,9,4,4,6,1,1,2,2,2,...,0,0,0,1,0,0,1,6,3,20200000007
7,2020,12,4,3,1,1,2,1,1,2,...,0,0,0,0,0,0,1,6,3,20200000008
8,2020,10,5,4,3,2,2,1,1,2,...,0,0,1,0,0,0,1,6,3,20200000009
9,2020,8,5,3,5,1,2,1,1,2,...,0,0,1,0,1,0,1,6,3,20200000010


In [3]:
# Build the modelling frame: remove every column that is not used as a model
# input, then index the rows by CASEID.
# ANXIETYFLG is deliberately kept here; it is set aside as the target later.
# The reason for each drop is noted next to the column where one was recorded.
ml_df = (
    df.drop(columns=[
        'YEAR',         # all the same
        'EDUC',         # excessive NaNs
        'ETHNIC',       # mostly just "not hispanic"
        'SPHSERVICE',   # location of service unlikely to be a driver
        'CMPSERVICE',   # (listed with SPHSERVICE; no separate reason recorded)
        'OPISERVICE',   # (listed with SPHSERVICE; no separate reason recorded)
        'RTCSERVICE',   # (listed with SPHSERVICE; no separate reason recorded)
        'IJSSERVICE',   # (listed with SPHSERVICE; no separate reason recorded)
        'MH1',          # MH1, MH2 and MH3 are redundant with the target
        'MH2',          #   if the diagnosis is anxiety
        'MH3',
        'SUB',          # substance abuse is covered by SAP
        'MARSTAT',      # proportion of married people seems incorrect
        'EMPLOY',       # over 63% missing
        'DETNLF',       # no reason recorded
        'VETERAN',      # 62% missing
        'LIVARAG',      # 52% "private residence" not very descriptive; 40% missing
        'TRAUSTREFLG',  # dropped, but no reason recorded
        'CONDUCTFLG',   # no reason recorded
        'DELIRDEMFLG',  # no reason recorded
        'ODDFLG',       # no reason recorded
        'PDDFLG',       # no reason recorded
        'PERSONFLG',    # no reason recorded
        'ALCSUBFLG',    # no reason recorded
        'STATEFIP',     # geographic data was used for graphing instead of ML
        'DIVISION',     # (listed with STATEFIP)
        'REGION',       # (listed with STATEFIP)
        'OTHERDISFLG',  # no reason recorded
    ])
    .set_index('CASEID')
)
ml_df.head(10)

Unnamed: 0_level_0,AGE,RACE,GENDER,SMISED,SAP,NUMMHS,ANXIETYFLG,ADHDFLG,BIPOLARFLG,DEPRESSFLG,SCHIZOFLG
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20200000001,14,3,1,1,1,2,0,0,0,0,1
20200000002,1,1,1,2,2,3,1,1,0,0,0
20200000003,6,4,2,1,1,1,0,0,1,0,0
20200000004,14,3,2,1,2,1,0,0,0,0,1
20200000005,13,3,2,1,1,3,0,0,0,0,1
20200000006,12,2,1,1,2,2,1,0,0,1,0
20200000007,9,6,1,1,1,2,1,0,0,0,1
20200000008,12,1,1,1,1,1,0,0,1,0,0
20200000009,10,3,2,1,2,2,1,0,0,0,0
20200000010,8,5,1,1,1,3,0,0,1,0,0


In [4]:
# Split the modelling frame into the target (ANXIETYFLG) and the inputs
# (every remaining column). drop() returns a new frame, so ml_df is untouched.
y = ml_df['ANXIETYFLG']
X = ml_df.drop(columns='ANXIETYFLG')

# Class balance of the target, shown as the cell output
y.value_counts()

0    5540524
1    1404997
Name: ANXIETYFLG, dtype: int64

In [5]:
# Split into training and testing datasets.
# No size is passed, so scikit-learn's default train/test proportion applies;
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [6]:
# Create Model Instance: logistic regression
# No arguments are passed, so every hyperparameter is left at its library default.
classifier = LogisticRegression()


In [7]:
# Fit and train the model
# Trains the logistic regression on the training split. Because this is the
# last expression in the cell, the fitted estimator is echoed as the output.
classifier.fit(X_train, y_train)

LogisticRegression()

In [8]:
# Check accuracy scores: mean accuracy of the logistic regression on the
# data it was trained on and on the held-out data.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8136141474408444
Testing Data Score: 0.8134689333734935


In [9]:
# Generate confusion matrix for the logistic regression on the test split.
# In scikit-learn's layout, rows are the true classes and columns are the
# predicted classes.
y_pred = classifier.predict(X_test)
y_true = y_test
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
cm

array([[1308184,   77078],
       [ 246811,  104308]], dtype=int64)

In [10]:
# See if a random forest can do better.
# Uses imbalanced-learn's balanced random forest with 100 trees and a fixed
# random_state so the fitted forest is repeatable.
clf = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
clf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [11]:
# Accuracy of the random forest on the test split.
# Note: this rebinds y_pred, replacing the logistic-regression predictions.
y_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7521033690186658

In [12]:
# Confusion matrix for whatever y_pred currently holds against the test
# labels; rebinds cm and shows it as the cell output.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[984214, 401048],
       [ 29395, 321724]], dtype=int64)

In [13]:
# Or a neural network?
# Define the deep learning model: three 10-unit ReLU hidden layers feeding a
# single sigmoid unit for the binary target.

# Pin TensorFlow's global seed before any layer is created, matching the
# random_state=1 used by the other models in this notebook, so that weight
# initialisation is repeatable between runs.
tf.random.set_seed(1)

# Read the input width from the training features instead of hard-coding it,
# so the network stays in sync if the set of dropped columns ever changes.
n_features = X_train.shape[1]

nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=10, activation="relu", input_dim=n_features))
nn_model.add(tf.keras.layers.Dense(units=10, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=10, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                110       
                                                                 
 dense_1 (Dense)             (None, 10)                110       
                                                                 
 dense_2 (Dense)             (None, 10)                110       
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 341
Trainable params: 341
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Compile the Sequential model: Adam optimizer, binary cross-entropy loss,
# and accuracy reported as the tracked metric.
nn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train the model and keep the returned fit result in fit_model.
# This was tried with 25 epochs initially. Accuracy scores seemed to stop
# improving after approximately 10 of them.
fit_model = nn_model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Evaluate the model using the test data.
# evaluate() yields the loss followed by the compiled metric (accuracy);
# verbose=2 prints a single summary line.
model_loss, model_accuracy = nn_model.evaluate(x=X_test, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

54262/54262 - 67s - loss: 0.3149 - accuracy: 0.8339 - 67s/epoch - 1ms/step
Loss: 0.3149416446685791, Accuracy: 0.8339206576347351
