### MODEL DEVELOPMENT

In [9]:
import pandas as pd 
import seaborn as sns 
import statistics
import numpy as np
import matplotlib.pyplot as plt

In [10]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [11]:
# Load the cleaned survey extracts (numerical outliers already removed):
# the 2019 file becomes model_train, the 2021 file becomes model_valid.
model_train, model_valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        'mydata/MMSA2019_Cleaned_2.csv',
        'mydata/MMSA2021_Cleaned_2.csv',
    )
)

In [12]:
# Summarise the 2019 training frame: column names, non-null counts and dtypes.
model_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83666 entries, 0 to 83665
Data columns (total 97 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DISPCODE  83666 non-null  int64  
 1   HHADULT   83666 non-null  float64
 2   SEXVAR    83666 non-null  object 
 3   GENHLTH   83666 non-null  object 
 4   PHYSHLTH  83666 non-null  float64
 5   MENTHLTH  83666 non-null  float64
 6   _HLTHPLN  83666 non-null  object 
 7   MEDCOST1  83666 non-null  object 
 8   CHECKUP1  83666 non-null  object 
 9   CVDINFR4  83666 non-null  object 
 10  CVDCRHD4  83666 non-null  object 
 11  CVDSTRK3  83666 non-null  object 
 12  ASTHMA3   83666 non-null  object 
 13  CHCSCNCR  83666 non-null  object 
 14  CHCOCNCR  83666 non-null  object 
 15  ADDEPEV3  83666 non-null  object 
 16  CHCKDNY2  83666 non-null  object 
 17  DIABETE4  83666 non-null  object 
 18  MARITAL   83666 non-null  object 
 19  EDUCA     83666 non-null  object 
 20  RENTHOM1  83666 non-null  ob

In [13]:
# Summarise the 2021 validation frame: column names, non-null counts and dtypes.
model_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104619 entries, 0 to 104618
Data columns (total 97 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   DISPCODE  104619 non-null  int64  
 1   HHADULT   104619 non-null  float64
 2   SEXVAR    104619 non-null  object 
 3   GENHLTH   104619 non-null  object 
 4   PHYSHLTH  104619 non-null  float64
 5   MENTHLTH  104619 non-null  float64
 6   _HLTHPLN  104619 non-null  object 
 7   MEDCOST1  104619 non-null  object 
 8   CHECKUP1  104619 non-null  object 
 9   CVDINFR4  104619 non-null  object 
 10  CVDCRHD4  104619 non-null  object 
 11  CVDSTRK3  104619 non-null  object 
 12  ASTHMA3   104619 non-null  object 
 13  CHCSCNCR  104619 non-null  object 
 14  CHCOCNCR  104619 non-null  object 
 15  ADDEPEV3  104619 non-null  object 
 16  CHCKDNY2  104619 non-null  object 
 17  DIABETE4  104619 non-null  object 
 18  MARITAL   104619 non-null  object 
 19  EDUCA     104619 non-null  object 
 20  RENT

**BASE MODELS**

-
-
-
-

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report,confusion_matrix

In [18]:
# Build the modelling feature frame: numeric columns unchanged, plus
# dummy (one-hot) encoded categorical columns.

# _RFHLTH is the prediction target and GENHLTH is kept out alongside it
# (presumably because _RFHLTH is derived from GENHLTH - verify against the
# survey codebook). Dropping both up front guarantees neither can end up in
# the features, whatever their dtype; drop() raises KeyError if one is missing.
TARGET_RELATED_COLS = ['GENHLTH', '_RFHLTH']
predictor_df = model_train.drop(columns=TARGET_RELATED_COLS)

num_cols = list(predictor_df.select_dtypes(exclude='object').columns)
cat_cols = list(predictor_df.select_dtypes(include='object').columns)

# Encode every categorical column in a single get_dummies call and concatenate
# once. Concatenating inside a per-column loop re-copies the growing frame on
# every iteration. Each dummy column is still named "<column>_<level>" and the
# first level of every column is dropped to avoid redundant indicators.
dummies_df = pd.concat(
    [
        predictor_df[num_cols],
        pd.get_dummies(predictor_df[cat_cols], drop_first=True),
    ],
    axis=1,
)

dummies_df.head()

Unnamed: 0,DISPCODE,HHADULT,PHYSHLTH,MENTHLTH,CPDEMO1B,CHILDREN,WEIGHT2,HEIGHT3,_STSTR,_PHYS14D,...,STATE_South Carolina,STATE_South Dakota,STATE_Tennessee,STATE_Texas,STATE_Utah,STATE_Vermont,STATE_Virginia,STATE_Washington,STATE_West Virginia,STATE_Wisconsin
0,1200,2.0,0.0,0.0,2.0,3.0,180.0,1.778,16049,1,...,0,1,0,0,0,0,0,0,0,0
1,1200,3.0,20.0,0.0,1.0,0.0,265.0,1.8034,16049,3,...,0,1,0,0,0,0,0,0,0,0
2,1200,1.0,1.0,0.0,1.0,0.0,170.0,1.7272,16049,2,...,0,1,0,0,0,0,0,0,0,0
3,1100,2.0,0.0,0.0,1.0,4.0,280.0,1.8288,16039,1,...,0,1,0,0,0,0,0,0,0,0
4,1100,2.0,0.0,0.0,1.0,0.0,270.0,1.9304,16049,1,...,0,1,0,0,0,0,0,0,0,0


In [20]:
# Cast the column labels of both frames to plain strings
# (presumably so scikit-learn sees uniform string feature names - TODO confirm).
for frame in (dummies_df, model_train):
    frame.columns = frame.columns.astype(str)

In [21]:
# Feature matrix and target labels.
X, y = dummies_df, model_train['_RFHLTH']

# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [22]:
# Decision tree baseline: fit on the training split, then evaluate on the
# held-out test split.
# random_state is fixed (same seed as the train/test split) because the tree
# randomly permutes features while searching for splits, so an unseeded tree
# can report different metrics on every run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

predictions = dtree.predict(X_test)

# Per-class precision/recall/F1, followed by the raw confusion matrix.
print(classification_report(y_test, predictions))
print('---------------------------------------')
print(confusion_matrix(y_test, predictions))

                       precision    recall  f1-score   support

Good or Better Health       0.90      0.90      0.90     21078
  fair or poor health       0.48      0.49      0.49      4022

             accuracy                           0.83     25100
            macro avg       0.69      0.70      0.69     25100
         weighted avg       0.84      0.83      0.83     25100

---------------------------------------
[[18960  2118]
 [ 2045  1977]]


In [None]:
# TODO: Naive Bayes baseline (GaussianNB is already imported) - not yet implemented

In [None]:
# TODO: RandomForestClassifier baseline (already imported) - not yet implemented

### Can perceived health be accurately predicted using dietary habits, socioeconomic indicators, lifestyle choices, and individual metrics?