In [None]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import Dependencies 
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Preprocess

In [3]:
# Read in Health Ranking Data
df = pd.read_csv("Data/County_Health_All_Factors.csv")
print(df.shape)
df.head()

(3142, 10)


Unnamed: 0.1,Unnamed: 0,FIPS,State,County,Lifespan_Rank,Life_Quality_Rank,Health_Behaviors_Rank,Clinical_Care_Rank,Social_Economic_Factors_Rank,Physical_Environment_Rank
0,1,1001,Alabama,Autauga,8,5,15,14,5,50
1,2,1003,Alabama,Baldwin,3,4,3,4,3,62
2,3,1005,Alabama,Barbour,14,48,53,30,61,32
3,4,1007,Alabama,Bibb,47,24,38,16,38,31
4,5,1009,Alabama,Blount,36,14,10,41,15,53


In [4]:
# Read in the Merged Data
df2 = pd.read_csv("Data/Merged_Demographic_Data.csv")
print(df2.shape)
df2.head()

(3041, 25)


Unnamed: 0,County,State,FIPS,Lyme Disease Incidence Reported,Population,% < 18 Yrs Old,% 65 Yrs Old and over,Income ($),African American (Count),African American (%),...,Native Hawaiian/Other Pacific Islander (%),Hispanic (Count),Hispanic (%),Non-Hispanic White (Count),Non-Hispanic White (%),Female (%),Rural (Count),Rural (%),Life Expectancy,Deaths (Count)
0,Autauga,Alabama,1001,1.0,55504,23.94,15.12,$58343.00,10687,19.25,...,0.1,1586,2.86,41336,74.47,51.34,22921.0,42.0,76.33,815.0
1,Baldwin,Alabama,1003,0.0,212628,21.85,19.95,$56607.00,19037,8.95,...,0.07,9675,4.55,176582,83.05,51.45,77060.0,42.28,78.6,2827.0
2,Barbour,Alabama,1005,1.0,25270,20.76,18.82,$32490.00,12115,47.94,...,0.19,1063,4.21,11613,45.96,47.23,18613.0,67.79,75.78,451.0
3,Bibb,Alabama,1007,0.0,22668,20.61,16.02,$45795.00,4864,21.46,...,0.11,598,2.64,16842,74.3,46.45,15663.0,68.35,73.93,445.0
4,Blount,Alabama,1009,0.0,58013,23.35,17.84,$48253.00,847,1.46,...,0.12,5549,9.57,50439,86.94,50.69,51562.0,89.95,74.6,1050.0


In [5]:
df_merged = df.merge(df2, how='left', on='FIPS')
print(df_merged.shape)
df_merged.head(10)

(3150, 34)


Unnamed: 0.1,Unnamed: 0,FIPS,State_x,County_x,Lifespan_Rank,Life_Quality_Rank,Health_Behaviors_Rank,Clinical_Care_Rank,Social_Economic_Factors_Rank,Physical_Environment_Rank,...,Native Hawaiian/Other Pacific Islander (%),Hispanic (Count),Hispanic (%),Non-Hispanic White (Count),Non-Hispanic White (%),Female (%),Rural (Count),Rural (%),Life Expectancy,Deaths (Count)
0,1,1001,Alabama,Autauga,8,5,15,14,5,50,...,0.1,1586.0,2.86,41336.0,74.47,51.34,22921.0,42.0,76.33,815.0
1,2,1003,Alabama,Baldwin,3,4,3,4,3,62,...,0.07,9675.0,4.55,176582.0,83.05,51.45,77060.0,42.28,78.6,2827.0
2,3,1005,Alabama,Barbour,14,48,53,30,61,32,...,0.19,1063.0,4.21,11613.0,45.96,47.23,18613.0,67.79,75.78,451.0
3,4,1007,Alabama,Bibb,47,24,38,16,38,31,...,0.11,598.0,2.64,16842.0,74.3,46.45,15663.0,68.35,73.93,445.0
4,5,1009,Alabama,Blount,36,14,10,41,15,53,...,0.12,5549.0,9.57,50439.0,86.94,50.69,51562.0,89.95,74.6,1050.0
5,6,1011,Alabama,Bullock,53,58,65,62,64,24,...,0.78,850.0,8.25,2196.0,21.3,45.53,5607.0,51.37,73.12,205.0
6,7,1013,Alabama,Butler,63,56,56,52,55,8,...,0.05,285.0,1.44,10229.0,51.6,53.43,14921.0,71.23,73.51,393.0
7,8,1015,Alabama,Calhoun,55,10,28,34,27,41,...,0.11,4302.0,3.75,82961.0,72.31,51.93,39955.0,33.7,73.1,2333.0
8,9,1017,Alabama,Chambers,42,53,57,25,37,46,...,0.04,822.0,2.44,18710.0,55.5,52.13,16816.0,49.15,74.12,691.0
9,10,1019,Alabama,Cherokee,43,8,12,54,19,43,...,0.03,425.0,1.64,23707.0,91.69,50.32,22282.0,85.74,74.35,575.0


In [6]:
df_merged.set_index('FIPS', inplace =True)

In [7]:
# Check for null values
for col in df_merged:
    print(f'{col} has {df_merged[col].isnull().sum()} nulls')

Unnamed: 0 has 0 nulls
State_x has 0 nulls
County_x has 0 nulls
Lifespan_Rank has 0 nulls
Life_Quality_Rank has 0 nulls
Health_Behaviors_Rank has 0 nulls
Clinical_Care_Rank has 0 nulls
Social_Economic_Factors_Rank has 0 nulls
Physical_Environment_Rank has 0 nulls
County_y has 109 nulls
State_y has 109 nulls
Lyme Disease Incidence Reported has 109 nulls
Population has 109 nulls
% < 18 Yrs Old has 109 nulls
% 65 Yrs Old and over has 109 nulls
Income ($) has 109 nulls
African American (Count) has 109 nulls
African American (%) has 109 nulls
American Indian/Alaskan Native (Count) has 109 nulls
American Indian/Alaskan Native(%) has 109 nulls
Asian (Count) has 109 nulls
Asian (%) has 109 nulls
Native Hawaiian/Other Pacific Islander (Count) has 109 nulls
Native Hawaiian/Other Pacific Islander (%) has 109 nulls
Hispanic (Count) has 109 nulls
Hispanic (%) has 109 nulls
Non-Hispanic White (Count) has 109 nulls
Non-Hispanic White (%) has 109 nulls
Female (%) has 109 nulls
Rural (Count) has 109 nu

In [8]:
df_merged = df_merged.dropna()

In [9]:
# Check for null values
for col in df_merged:
    print(f'{col} has {df_merged[col].isnull().sum()} nulls')

Unnamed: 0 has 0 nulls
State_x has 0 nulls
County_x has 0 nulls
Lifespan_Rank has 0 nulls
Life_Quality_Rank has 0 nulls
Health_Behaviors_Rank has 0 nulls
Clinical_Care_Rank has 0 nulls
Social_Economic_Factors_Rank has 0 nulls
Physical_Environment_Rank has 0 nulls
County_y has 0 nulls
State_y has 0 nulls
Lyme Disease Incidence Reported has 0 nulls
Population has 0 nulls
% < 18 Yrs Old has 0 nulls
% 65 Yrs Old and over has 0 nulls
Income ($) has 0 nulls
African American (Count) has 0 nulls
African American (%) has 0 nulls
American Indian/Alaskan Native (Count) has 0 nulls
American Indian/Alaskan Native(%) has 0 nulls
Asian (Count) has 0 nulls
Asian (%) has 0 nulls
Native Hawaiian/Other Pacific Islander (Count) has 0 nulls
Native Hawaiian/Other Pacific Islander (%) has 0 nulls
Hispanic (Count) has 0 nulls
Hispanic (%) has 0 nulls
Non-Hispanic White (Count) has 0 nulls
Non-Hispanic White (%) has 0 nulls
Female (%) has 0 nulls
Rural (Count) has 0 nulls
Rural (%) has 0 nulls
Life Expectancy 

In [10]:
df_merged = df_merged.drop(columns=['Unnamed: 0','County_y','State_y','State_x','County_x'])

In [11]:
df_merged['Income ($)'] = df_merged['Income ($)'].str.replace('$', '')
df_merged['Income ($)'] = df_merged['Income ($)'].astype('float') / 100

  """Entry point for launching an IPython kernel.


In [12]:
df_merged = df_merged.astype({'Lifespan_Rank': 'int32', 'Life_Quality_Rank': 'int32','Health_Behaviors_Rank':'int32',
'Clinical_Care_Rank':'int32','Social_Economic_Factors_Rank':'int32','Physical_Environment_Rank':'int32'})

In [13]:
df_merged.dtypes

Lifespan_Rank                                       int32
Life_Quality_Rank                                   int32
Health_Behaviors_Rank                               int32
Clinical_Care_Rank                                  int32
Social_Economic_Factors_Rank                        int32
Physical_Environment_Rank                           int32
Lyme Disease Incidence Reported                   float64
Population                                        float64
% < 18 Yrs Old                                    float64
% 65 Yrs Old and over                             float64
Income ($)                                        float64
African American (Count)                          float64
African American (%)                              float64
American Indian/Alaskan Native (Count)            float64
American Indian/Alaskan Native(%)                 float64
Asian (Count)                                     float64
Asian (%)                                         float64
Native Hawaiia

## Split the Data into Training and Testing

In [20]:
# Create our features
X = df_merged.copy()
X = X.drop(columns='Lyme Disease Incidence Reported', axis=1)
           
# Create our target
y = df_merged[['Lyme Disease Incidence Reported']]
y.value_counts()

Lyme Disease Incidence Reported
0.0                                1886
1.0                                 314
2.0                                 147
3.0                                  81
4.0                                  57
                                   ... 
133.0                                 1
130.0                                 1
123.0                                 1
121.0                                 1
470.0                                 1
Length: 169, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(2265, 27)