In [None]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

## Reading in all datasets

In [None]:
# Load the five source tables; the unpacking order matches the path order.
df_sh, df_wp, df_un, df_fins, df_ed = map(
    pd.read_csv,
    [
        'data/ed_socio_health.csv',
        'data/wage_poverty.csv',
        'data/unemployment_clean.csv',
        'data/food_ins_18.csv',
        'data/education_stats_dsi.csv',
    ],
)

In [None]:
# Series.astype returns a new Series, so the converted key has to be assigned
# back; otherwise the frames keep whatever dtype read_csv inferred for `fips`.
df_sh['fips'] = df_sh['fips'].astype(int)
df_wp['fips'] = df_wp['fips'].astype(int)
df_un['fips'] = df_un['fips'].astype(int)
df_fins['fips'] = df_fins['fips'].astype(int)
df_ed['fips'] = df_ed['fips'].astype(int)

# Show the converted key of the last frame as a quick check
df_ed['fips'].head()

## Merge Dataframes
Merge each dataset into one main df

In [None]:
# Join all five tables on the shared `fips` key (pandas' default inner join),
# in the same left-to-right order as before so suffixed column names match.
df_m = (
    df_sh
    .merge(df_wp, on = 'fips')
    .merge(df_un, on = 'fips')
    .merge(df_fins, on = 'fips')
    .merge(df_ed, on = 'fips')
)
df_m.shape

In [None]:
df_m.info()

In [None]:
df_m.isnull().sum().sum()

In [None]:
# Create a csv of the merged frame. This runs before the renaming, column
# drops and type conversions further down, so the file holds the raw merge.
df_m.to_csv('data/df_full.csv', index = False)

## Check for Nulls

In [None]:
# One-column frame holding the number of missing values per column of df_m
nulls = df_m.isnull().sum().to_frame()

In [None]:
nulls = nulls.reset_index()

In [None]:
#nulls.to_csv('./data/nulls.csv', index = False)

## Cleaning

#### Renaming state_x as state_name and state_y as state_abr

In [None]:
# The merge suffixed the two clashing `state` columns; give them readable names.
state_column_names = {
    "state_x": "state_name",
    "state_y": "state_abr",
}
df_m = df_m.rename(columns = state_column_names)

In [None]:
df_m.head()

## Dropping unnecessary columns

Dropping columns that are unlikely to add explanatory power beyond the other variables, as well as columns that duplicate information already present in the 2016 data (e.g., the 2019 unemployment data).

Dropping the number and percent food insecure from the 2016 dataset, as the 2018 data has both child and total figures.

In [None]:
df_m.shape

In [None]:
df_m.head()

In [None]:
# Columns removed from the merged frame (see the rationale in the text above).
drop_list = [
    'teen_birth_rate',
    'age_adjusted_death_rate',
    'child_mortality_rate',
    'infant_mortality_rate',
    'num_limited_access_to_healthy_foods',
    'segregation_index',
    'segregation_index_2',
    'homicide_rate',
    'suicide_rate_age_adjusted',
    'juvenile_arrest_rate',
    'area_name',
    'num_below_poverty',
    'percent_some_college',
    'labor_force',
    'percent_unemployed_CHR',
    'med_inc_19',
    'unemployment_rate_2019',
    'med_household_inc_19',
    'med_hh_income_percent_of_state_total_2019',
    'num_food_insecure',
    'percent_food_insecure',
    'less_than_high_school_diploma',
    'bachelor_degree_or_higher',
    'percent_less_than_18_years_of_age',
    'percent_65_and_over',
    'mental_health_provider_rate',
]

# Raises KeyError if any listed column is absent (e.g. when this cell is re-run).
df_m = df_m.drop(drop_list, axis = 'columns')
df_m.shape

In [None]:
df_m.head()

## Drop % and $ symbols and convert to float

In [None]:
# Both food-insecurity rates carry a '%' sign; remove it and parse as float.
for pct_col in ('fi_rate_18', 'ch_fi_rate_18'):
    df_m[pct_col] = df_m[pct_col].str.replace('%', '').astype(float)

# str.strip treats 'US$' as a set of characters, trimming any leading or
# trailing 'U', 'S' or '$' before the value is parsed as float.
cpm_without_currency = df_m['cpm_18'].str.strip('US$')
df_m['cpm_18'] = cpm_without_currency.astype(float)

df_m.head()

In [None]:
# Check types: one row per column of df_m with its dtype
types = df_m.dtypes.to_frame()
types

## Impute

In [None]:
df_m.isnull().sum()

In [None]:
# Total number of missing values
df_m.isnull().sum().sum()

In [None]:
# Names of the columns in df_m that contain at least one missing value
null_columns = df_m.columns[df_m.isna().any()].tolist()

# Independent working copies for the simple per-column fills
df_m_mean, df_m_median, df_m_mode = (df_m.copy() for _ in range(3))

# One-hot encoded frames for the KNN / iterative / regression-based imputers.
# get_dummies builds a new frame on every call, so df_m itself is untouched.
df_m_knn = pd.get_dummies(df_m)
df_m_iterative = pd.get_dummies(df_m)
df_m_lr = pd.get_dummies(df_m)
df_m_rf = pd.get_dummies(df_m)

In [None]:
# Imputer with mean, median, mode: fill each column that has gaps with a
# single statistic computed from its observed values.
for col in null_columns:
    observed = df_m[col].dropna()
    df_m_mean[col] = df_m[col].fillna(observed.mean())
    df_m_median[col] = df_m[col].fillna(observed.median())
    df_m_mode[col] = df_m[col].fillna(observed.mode()[0])

print('Mean imputation nulls: ', df_m_mean.isnull().sum().sum())
print('Median imputation nulls: ', df_m_median.isnull().sum().sum())
print('Mode imputation nulls: ', df_m_mode.isnull().sum().sum())

# KNN imputation on the one-hot encoded frame
imp_knn = KNNImputer(n_neighbors = 2)
knn_values = imp_knn.fit_transform(df_m_knn)
# fit_transform returns a bare array, so re-attach the row index and column
# labels. KNNImputer leaves out columns that have no observed value at all,
# hence only columns with at least one non-null entry are matched up.
knn_columns = df_m_knn.columns[df_m_knn.notna().any()]
df_m_knn = pd.DataFrame(knn_values, columns = knn_columns, index = df_m_knn.index)
print('Knn imputation nulls: ', df_m_knn.isnull().sum().sum())

# imp_iterative = IterativeImputer(random_state = 0)
# df_m_iterative = imp_iterative.fit_transform(df_m_iterative)
# df_m_iterative = pd.DataFrame(df_m_iterative)
# print('Iterative imputation nulls: ', df_m_iterative.isnull().sum().sum())

# Models handed to impute_missing_data below. The forest is seeded so that
# re-running the notebook reproduces the same imputed values.
lr = LinearRegression()
rf = RandomForestRegressor(random_state = 0)

def impute_missing_data(df, model, columns = None):
    """Fill missing values column by column with predictions from ``model``.

    For every target column the model is fitted on the rows that currently
    have no missing value in any column (complete cases), using all other
    columns as predictors. The rows where the target is missing are then
    predicted from the other columns, whose own gaps are temporarily filled
    with the column median. Columns imputed earlier in the loop are used,
    in their imputed form, as predictors for later columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Medians are taken over every predictor column, so
        the columns are expected to be numeric (e.g. ``pd.get_dummies``
        output). The frame passed in is not modified.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    columns : iterable of str, optional
        Columns to impute, in the given order. Defaults to every column of
        ``df`` that contains at least one missing value.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the missing entries of ``columns`` have
        been replaced by model predictions.

    Raises
    ------
    ValueError
        If a column needs imputing but no row of the frame is complete, so
        there is nothing to fit the model on.
    """
    imputed = df.copy()

    if columns is None:
        columns = [name for name in imputed.columns if imputed[name].isna().any()]

    for col in columns:
        missing = imputed[col].isna()
        if not missing.any():
            continue

        # Complete cases are recomputed on each pass because earlier passes
        # may have completed additional rows.
        complete_cases = imputed.dropna()
        if complete_cases.empty:
            raise ValueError(
                "Cannot impute '{}': no row is free of missing values.".format(col)
            )

        model.fit(complete_cases.drop(columns = col), complete_cases[col])

        # Predictors must be free of nulls at prediction time, so their gaps
        # are filled with the column median (the target itself is excluded).
        predictors = imputed.drop(columns = col)
        predictors = predictors.fillna(predictors.median())

        # A single label-based assignment through a boolean mask: works for
        # any index and avoids chained indexing such as df[col][index] = ...
        imputed.loc[missing, col] = model.predict(predictors.loc[missing])

    return imputed
                
# NOTE(review): both assignments rely on impute_missing_data returning the
# imputed frame; if it returns None the null counts below cannot be computed.
df_m_lr = impute_missing_data(df_m_lr, lr)
lr_null_total = df_m_lr.isnull().sum().sum()
print('Lr imputation nulls: ', lr_null_total)

df_m_rf = impute_missing_data(df_m_rf, rf)
rf_null_total = df_m_rf.isnull().sum().sum()
print('Rf imputation nulls: ', rf_null_total)

In [None]:
# List of Features needing median to be added in place of null values.
# NOTE(review): 'mental_health_provider_rate' is also in drop_list, and
# 'life_expectancy' is median-filled below without being listed here —
# confirm which of the two this list is meant to reflect.
features = [
    'percent_low_birthweight',
    'primary_care_physicians_rate',
    'mental_health_provider_rate',
    'high_school_graduation_rate',
    'food_environment_index',
    'percent_with_access_to_exercise_opportunities',
    'num_households_CHR',
    'percent_single_parent_households_CHR',
    'violent_crime_rate',
    'percent_limited_access_to_healthy_foods',
    'percent_enrolled_in_free_or_reduced_lunch',
    'percent_severe_housing_cost_burden',
    'percent_rural',
]

In [None]:
# NOTE(review): `df_full` is not assigned anywhere earlier in this notebook
# (only the file data/df_full.csv is written), so on a fresh kernel the fills
# below raised NameError. It is assumed here to be the cleaned df_m frame —
# confirm this is the intended input before relying on df_final.csv.
df_full = df_m.copy()

# Columns whose missing values are replaced by the column median
median_fill_columns = [
    'percent_low_birthweight',
    'primary_care_physicians_rate',
    'high_school_graduation_rate',
    'percent_with_access_to_exercise_opportunities',
    'num_households_CHR',
    'percent_single_parent_households_CHR',
    'violent_crime_rate',
    'percent_limited_access_to_healthy_foods',
    'percent_enrolled_in_free_or_reduced_lunch',
    'percent_severe_housing_cost_burden',
    'percent_rural',
    'food_environment_index',
    'life_expectancy',
]

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment and does not update the
# parent frame when pandas Copy-on-Write is active.
column_medians = df_full[median_fill_columns].median()
df_full[median_fill_columns] = df_full[median_fill_columns].fillna(column_medians)

In [None]:
df_full.isnull().sum()

In [None]:
df_full.shape

In [None]:
# Write df_full (after the median fills above) to disk without the row index
df_full.to_csv('./data/df_final.csv', index = False)