In [0]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import requests
from sklearn.neighbors import NearestNeighbors

In [5]:
# Load the raw county file and normalise the column names: everything is
# lower-cased first, then 'year4' is renamed to the shorter 'year'.
data_county = pd.read_csv('CountyData.csv')

change_keys = {key: key.lower() for key in data_county.keys()}
data_county = (
    data_county
    .rename(columns=change_keys)
    .rename(columns={'year4': 'year'})
)
data_county.head()

Unnamed: 0,surveyyr,year,yearofdata,id,idchanged,state_code,type_code,county,name,fips_code_state,...,emp_retire_sec_mortgages,emp_retire_sec_misc_inv,emp_retire_sec_oth_nong,unemp_comp_cash___sec,unemp_comp_bal_in_us_trs,unemp_comp_other_balance,nonin_trust_cash___sec,sinking_fd_cash___sec,bond_fd_cash___sec,oth_nonin_fd_cash___sec
0,16,2016,,11002002,,1,1,2,BALDWIN COUNTY,1,...,0,0,0,0,0,0,95519,14789,0,80730
1,16,2016,,11005005,,1,1,5,BLOUNT COUNTY,1,...,0,0,0,0,0,0,11092,0,0,11092
2,16,2016,,11008008,,1,1,8,CALHOUN COUNTY,1,...,0,0,0,0,0,0,27871,0,0,27871
3,16,2016,,11010010,,1,1,10,CHEROKEE COUNTY,1,...,0,0,0,0,0,0,4074,340,0,3734
4,16,2016,,11015015,,1,1,15,CLEBURNE COUNTY,1,...,0,0,0,0,0,0,4259,72,0,4187


In [0]:
def get_missing(df):
    """Return, per column, the fraction of rows treated as missing.

    A cell counts as missing when it is NaN or otherwise falsy after NaNs
    are replaced by 0 -- so literal zeros count as missing as well, not
    only NaNs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the missing count divided by
        the number of rows in ``df``.
    """
    n_rows = df.shape[0]
    # NaN -> 0 first so that the boolean cast marks NaN and 0 alike as False.
    has_value = df.fillna(0).astype(bool)
    n_missing = (~has_value).sum()
    return n_missing/n_rows

def get_keys(df, match):
    """Return the keys of ``df`` that contain ``match``, ignoring case.

    Parameters
    ----------
    df : object exposing ``.keys()`` (e.g. a DataFrame)
        Source of the candidate keys.
    match : str
        Substring to look for.

    Returns
    -------
    list
        Matching keys, in their original order and original casing.
    """
    found = []
    for column in df.keys():
        if match.lower() in column.lower():
            found.append(column)
    return found
  
def drop_keys(df, keys):
    """Return a new frame equal to ``df`` without the columns in ``keys``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    keys : list
        Column labels to remove.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the listed columns removed.
    """
    trimmed = df.drop(labels=keys, axis='columns')
    return trimmed

In [8]:
# List the columns whose names contain each identifier-like substring, one
# printed list per substring, to help decide which columns to drop.
for pattern in ('year', 'yr', 'id', 'code', 'fips'):
    print(get_keys(data_county, pattern))

['year', 'yearofdata', 'yearpop', 'st_debt_end_of_year']
['surveyyr', 'unemp_payroll_tax']
['id', 'idchanged', 'individual_income_tax', 'local_igr_interschool_aid', 'chg_solid_waste_mgmt', 'tot_assist___subsidies']
['state_code', 'type_code', 'fips_code_state', 'schlevcode', 'functioncode']
['fips_code_state', 'fips_county', 'fips_place']


In [10]:
# Columns to discard, grouped by the get_keys substring that surfaced them.
# 'fips_code_state' matches both 'code' and 'fips'; it is listed only once
# here (the earlier version repeated it).
keys_to_drop = [
    'yearofdata', 'yearpop',                                                     # 'year'
    'surveyyr',                                                                  # 'yr'
    'idchanged',                                                                 # 'id'
    'state_code', 'type_code', 'fips_code_state', 'schlevcode', 'functioncode',  # 'code'
    'fips_county', 'fips_place',                                                 # 'fips'
]
data_county = drop_keys(data_county, keys_to_drop)
data_county.head()

Unnamed: 0,year,id,county,name,fyenddate,population,enrollment,total_revenue,total_rev_own_sources,general_revenue,...,emp_retire_sec_mortgages,emp_retire_sec_misc_inv,emp_retire_sec_oth_nong,unemp_comp_cash___sec,unemp_comp_bal_in_us_trs,unemp_comp_other_balance,nonin_trust_cash___sec,sinking_fd_cash___sec,bond_fd_cash___sec,oth_nonin_fd_cash___sec
0,2016,11002002,2,BALDWIN COUNTY,930.0,203709,,108855,90526,108855,...,0,0,0,0,0,0,95519,14789,0,80730
1,2016,11005005,5,BLOUNT COUNTY,930.0,57673,,13019,8801,13019,...,0,0,0,0,0,0,11092,0,0,11092
2,2016,11008008,8,CALHOUN COUNTY,930.0,115620,,44992,29859,44992,...,0,0,0,0,0,0,27871,0,0,27871
3,2016,11010010,10,CHEROKEE COUNTY,930.0,25859,,30577,24749,30577,...,0,0,0,0,0,0,4074,340,0,3734
4,2016,11015015,15,CLEBURNE COUNTY,930.0,15018,,7925,4507,7925,...,0,0,0,0,0,0,4259,72,0,4187


In [12]:
# Second pass: also remove the 'county' and 'fyenddate' columns.
data_county = drop_keys(data_county, keys=['county', 'fyenddate'])
data_county.head()

Unnamed: 0,year,id,name,population,enrollment,total_revenue,total_rev_own_sources,general_revenue,gen_rev_own_sources,total_taxes,...,emp_retire_sec_mortgages,emp_retire_sec_misc_inv,emp_retire_sec_oth_nong,unemp_comp_cash___sec,unemp_comp_bal_in_us_trs,unemp_comp_other_balance,nonin_trust_cash___sec,sinking_fd_cash___sec,bond_fd_cash___sec,oth_nonin_fd_cash___sec
0,2016,11002002,BALDWIN COUNTY,203709,,108855,90526,108855,90526,58300,...,0,0,0,0,0,0,95519,14789,0,80730
1,2016,11005005,BLOUNT COUNTY,57673,,13019,8801,13019,8801,8589,...,0,0,0,0,0,0,11092,0,0,11092
2,2016,11008008,CALHOUN COUNTY,115620,,44992,29859,44992,29859,17424,...,0,0,0,0,0,0,27871,0,0,27871
3,2016,11010010,CHEROKEE COUNTY,25859,,30577,24749,30577,24749,9248,...,0,0,0,0,0,0,4074,340,0,3734
4,2016,11015015,CLEBURNE COUNTY,15018,,7925,4507,7925,4507,2371,...,0,0,0,0,0,0,4259,72,0,4187


In [28]:
# Rows a complete (id x year) panel would have, next to the actual row count;
# a gap between the two means not every id is present in every year.
n_ids = data_county['id'].nunique()
n_years = data_county['year'].nunique()
n_ids*n_years,  data_county.shape[0]

(146928, 104151)

In [29]:
# Number of distinct ids versus distinct names. Fewer names than ids means
# some names are shared by several ids, so 'name' alone is not a unique key.
data_county[['id', 'name']].nunique()

id      3061
name    1814
dtype: int64

In [30]:
# (rows, columns) of the frame after the column drops.
data_county.shape

(104151, 581)

In [32]:
# Preview the first rows of the trimmed frame.
data_county.head()

Unnamed: 0,year,id,name,population,enrollment,total_revenue,total_rev_own_sources,general_revenue,gen_rev_own_sources,total_taxes,...,emp_retire_sec_mortgages,emp_retire_sec_misc_inv,emp_retire_sec_oth_nong,unemp_comp_cash___sec,unemp_comp_bal_in_us_trs,unemp_comp_other_balance,nonin_trust_cash___sec,sinking_fd_cash___sec,bond_fd_cash___sec,oth_nonin_fd_cash___sec
0,2016,11002002,BALDWIN COUNTY,203709,,108855,90526,108855,90526,58300,...,0,0,0,0,0,0,95519,14789,0,80730
1,2016,11005005,BLOUNT COUNTY,57673,,13019,8801,13019,8801,8589,...,0,0,0,0,0,0,11092,0,0,11092
2,2016,11008008,CALHOUN COUNTY,115620,,44992,29859,44992,29859,17424,...,0,0,0,0,0,0,27871,0,0,27871
3,2016,11010010,CHEROKEE COUNTY,25859,,30577,24749,30577,24749,9248,...,0,0,0,0,0,0,4074,340,0,3734
4,2016,11015015,CLEBURNE COUNTY,15018,,7925,4507,7925,4507,2371,...,0,0,0,0,0,0,4259,72,0,4187


In [33]:
# Per-column share of rows that get_missing counts as missing (NaN or a
# falsy value such as 0), followed by summary statistics of those shares.
missing = get_missing(data_county)
missing.describe()

count    581.000000
mean       0.753202
std        0.325927
min        0.000000
25%        0.617872
50%        0.929977
75%        0.992722
max        1.000000
dtype: float64

# Hand the data to Lohith at this point and ask him to fill in whatever he can. Then take `good_attributes`.

In [0]:
# Write the trimmed frame to disk; index=False keeps the row index out of the CSV.
data_county.to_csv('RECleaned_CountyData1.csv', index=False)

In [47]:
# Read the exported file back and preview it to check the write.
pd.read_csv('RECleaned_CountyData1.csv').head()

Unnamed: 0,year,id,name,population,enrollment,total_revenue,total_rev_own_sources,general_revenue,gen_rev_own_sources,total_taxes,...,emp_retire_sec_mortgages,emp_retire_sec_misc_inv,emp_retire_sec_oth_nong,unemp_comp_cash___sec,unemp_comp_bal_in_us_trs,unemp_comp_other_balance,nonin_trust_cash___sec,sinking_fd_cash___sec,bond_fd_cash___sec,oth_nonin_fd_cash___sec
0,2016,11002002,BALDWIN COUNTY,203709,,108855,90526,108855,90526,58300,...,0,0,0,0,0,0,95519,14789,0,80730
1,2016,11005005,BLOUNT COUNTY,57673,,13019,8801,13019,8801,8589,...,0,0,0,0,0,0,11092,0,0,11092
2,2016,11008008,CALHOUN COUNTY,115620,,44992,29859,44992,29859,17424,...,0,0,0,0,0,0,27871,0,0,27871
3,2016,11010010,CHEROKEE COUNTY,25859,,30577,24749,30577,24749,9248,...,0,0,0,0,0,0,4074,340,0,3734
4,2016,11015015,CLEBURNE COUNTY,15018,,7925,4507,7925,4507,2371,...,0,0,0,0,0,0,4259,72,0,4187


In [44]:
# Keep the attributes whose missing share is below 10%, report how many
# there are, and show the first few.
mostly_present = missing.loc[missing < .1]
good_attributes = mostly_present.index.tolist()
print(len(good_attributes))
good_attributes[:5]

55


['year', 'id', 'name', 'population', 'total_revenue']

In [35]:
# Attribute names in their original mixed-case spelling. They are
# lower-cased right after the literal so they can be compared with the
# lower-cased column names of data_county.
supported_attributes = [
    "Total_Revenue",
    "Total_Rev_Own_Sources",
    "Total_Taxes",
    "Total_Gen_Sales_Tax",
    "Total_Select_Sales_Tax",
    "Total_License_Taxes",
    "Motor_Vehicle_License_Total",
    "Total_Income_Taxes",
    "Total_IG_Revenue",
    "Total_Fed_IG_Revenue",
    "Total_State_IG_Revenue",
    "Total_General_Charges",
    "Chg_Total_Education",
    "Chg_Total_Elem_Education",
    "Chg_Total_High_Ed",
    "Chg_Total_Nat_Res",
    "Prop_Sale_Total",
    "Total_Utility_Revenue",
    "Total_Insur_Trust_Rev",
    "Total_Insur_Trust_Ctrb",
    "Total_Emp_Ret_Rev",
    "Emp_Ret_Total_Ctrib",
    "Total_Unemp_Rev",
    "Total_Expenditure",
    "Total_IG_Expenditure",
    "Total_Current_Expend",
    "Total_Current_Oper",
    "Total_Capital_Outlays",
    "Total_Construction",
    "Total_Other_Capital_Outlays",
    "Total_Interest_on_Debt",
    "Total_Insur_Trust_Ben",
    "Total_Salaries___Wages",
    "Air_Trans_Total_Expend",
    "Correct_Total_Exp",
    "Total_Educ_Total_Exp",
    "Total_Educ_Direct_Exp",
    "Total_Educ_Assist___Sub",
    "Total_Educ_Cap_Outlay",
    "Total_Educ_Current_Exp",
    "Total_Educ_Construct",
    "Elem_Educ_Total_Exp",
    "Higher_Ed_Total_Exp",
    "Educ_NEC_Total_Expend",
    "Fin_Admin_Total_Exp",
    "Fire_Prot_Total_Expend",
    "Judicial_Total_Expend",
    "Cen_Staff_Total_Expend",
    "Gen_Pub_Bldg_Total_Exp",
    "Health_Total_Expend",
    "Total_Hospital_Total_Exp",
    "Total_Hospital_Dir_Exp",
    "Total_Hospital_Cap_Out",
    "Total_Hospital_Current_Exp",
    "Total_Hospital_Construct",
    "Total_Hospital_IG_Loc_Govts",
    "Own_Hospital_Total_Exp",
    "Hosp_Other_Total_Exp",
    "Total_Highways_Tot_Exp",
    "Total_Highways_Dir_Exp",
    "Total_Highways_Cap_Out",
    "Total_Highways_Current_Exp",
    "Total_Highways_Construct",
    "Regular_Hwy_Total_Exp",
    "Toll_Hwy_Total_Expend",
    "Transit_Sub_Total_Exp",
    "Hous___Com_Total_Exp",
    "Libraries_Total_Expend",
    "Natural_Res_Total_Exp",
    "Parking_Total_Expend",
    "Parks___Rec_Total_Exp",
    "Police_Prot_Total_Exp",
    "Prot_Insp_Total_Exp",
    "Public_Welf_Total_Exp",
    "Welf_Categ_Total_Exp",
    "Welf_Cash_Total_Exp",
    "Welf_Ins_Total_Exp",
    "Welf_NEC_Total_Expend",
    "Sewerage_Total_Expend",
    "SW_Mgmt_Total_Expend",
    "Water_Trans_Total_Exp",
    "General_NEC_Total_Exp",
    "Total_Util_Total_Exp",
    "Total_Util_Inter_Exp",
    "Total_Util_Cap_Outlay",
    "Total_Util_Current_Exp",
    "Total_Util_Construct",
    "Water_Util_Total_Exp",
    "Elec_Util_Total_Exp",
    "Gas_Util_Total_Exp",
    "Trans_Util_Total_Exp",
    "Emp_Ret_Total_Expend",
    "Unemp_Comp_Total_Exp",
    "Total_Debt_Outstanding",
    "Total_Long_Term_Debt_Out",
    "Total_Beg_LTD_Out",
    "Total_LTD_Issued",
    "Total_LTD_Iss_FFC",
    "Total_LTD_Iss_NG",
    "Total_LTD_Retired",
    "Total_LTD_Ret_FFC",
    "Total_LTD_Ret_NG",
    "Total_LTD_Out",
    "Total_LTD_Out_Utility",
    "Total_LTD_Out_FFC",
    "Total_Cash___Securities",
    "Emp_Retire_Total_Sec",
]
supported_attributes = list(map(str.lower, supported_attributes))
len(supported_attributes)

107

In [42]:
# Supported attributes that are not in good_attributes, kept in the order
# of supported_attributes.
well_populated = set(good_attributes)
ask_lohith = [name for name in supported_attributes if name not in well_populated]
len(ask_lohith)

80

In [0]:
# Missing share of each attribute in ask_lohith, sorted from smallest to largest.
missing[ask_lohith].sort_values()

total_hospital_construct       0.111058
air_trans_total_expend         0.137274
total_beg_ltd_out              0.205434
welf_ins_total_exp             0.273117
own_hospital_total_exp         0.292183
total_ltd_ret_ng               0.296473
welf_cash_total_exp            0.312202
total_ltd_iss_ng               0.314109
sewerage_total_expend          0.374643
sw_mgmt_total_expend           0.407531
total_ltd_out_ffc              0.408484
total_ltd_ret_ffc              0.408961
toll_hwy_total_expend          0.475214
total_util_total_exp           0.479504
total_util_current_exp         0.487131
total_ltd_iss_ffc              0.514299
water_trans_total_exp          0.525739
total_util_cap_outlay          0.594376
transit_sub_total_exp          0.606768
total_hospital_ig_loc_govts    0.648713
total_utility_revenue          0.648713
hosp_other_total_exp           0.661582
trans_util_total_exp           0.681602
total_util_construct           0.683031
water_util_total_exp           0.769781
