# 1. Data Cleaning

In [1]:
import pandas as pd 

# 2. elsect_summary

In [None]:
financeSum = './data/data_raw/elsect_summary.csv'
df = pd.read_csv(financeSum)

## 2.1 NaNs

In [None]:
# drop 1992 data, all NaNs
df = df.dropna(how = 'any').reset_index(drop = True)

## 2.2 Missleading column TOTAL_EXPENDITURE

In [None]:
# EXPENDITURE subcategories do not sum up to total, creat new columns
## Summed up expenditures from subcategories 
df['TOTAL_KNOWN_EXPENDITURE'] = df[[ 'INSTRUCTION_EXPENDITURE', 'SUPPORT_SERVICES_EXPENDITURE',
       'OTHER_EXPENDITURE', 'CAPITAL_OUTLAY_EXPENDITURE']].sum(axis = 1)
## The difference btw total expenditures and summed up expenditures
df['UNKNOWN_EXPENDITURE'] = df.TOTAL_EXPENDITURE - df.TOTAL_KNOWN_EXPENDITURE

## 2.3 New feature: revenue per enrollment

In [5]:
# New feature: REV_PER_ENROLL revenue per enrollment
df['REV_PER_ENROLL'] = df['TOTAL_REVENUE'] / df['ENROLL']

In [6]:
df.head()

Unnamed: 0,STATE,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,SUPPORT_SERVICES_EXPENDITURE,OTHER_EXPENDITURE,CAPITAL_OUTLAY_EXPENDITURE,TOTAL_KNOWN_EXPENDITURE,UNKNOWN_EXPENDITURE,REV_PER_ENROLL
0,Alabama,1993,727716.0,2827391,331409,1729295,766687,2833433,1564558,794146,237222.0,204207,2800133.0,33300.0,3.885295
1,Alaska,1993,121156.0,1191398,176150,775829,239419,1126398,494917,433788,36291.0,135791,1100787.0,25611.0,9.833586
2,Arizona,1993,676297.0,3427976,318465,1415407,1694104,3623946,1578889,1000914,164083.0,680139,3424025.0,199921.0,5.068743
3,Arkansas,1993,311432.0,1346909,128196,771079,447634,1376067,782791,386526,68617.0,97824,1335758.0,40309.0,4.32489
4,California,1993,5129788.0,28043338,2151157,17064146,8828035,28110986,15281147,8914559,1608514.0,1944760,27748980.0,362006.0,5.466764


In [13]:
df.to_csv('./data/data_cleaned/edu_finance_cleaned.csv', index = False)

# 3. US Demographics

In [19]:
file1 = './data/data_raw/POP1990.csv'
file2 = './data/data_raw/POP2000.csv'
file3 = './data/data_raw/est_POP2010_2015.csv'
df_1990 = pd.read_csv(file1, usecols=['year', 'stname', 'tot_pop'])
df_2000 = pd.read_csv(file2, usecols=['year', 'stname', 'tot_pop'])
df_2010_est = pd.read_csv(file3, encoding = "ISO-8859-1", usecols=['STNAME','CTYNAME', 'POPESTIMATE2011', 
                                                                  'POPESTIMATE2012', 'POPESTIMATE2013',
                                                                  'POPESTIMATE2014', 'POPESTIMATE2015' ])
colnames = ['stname', 'ctyname', '2011', '2012', '2013', '2014', '2015']

## 3.1 Group county level data to states and years

In [20]:
df_1990 = df_1990.groupby(['stname', 'year']).sum().reset_index()
df_2000 = df_2000.groupby(['stname', 'year']).sum().reset_index()

## 3.2 Rename columns

In [21]:
df_2010_est.columns = colnames

## 3.3 Extract state level data only

In [22]:
df_2010_est = df_2010_est[df_2010_est.stname == df_2010_est.ctyname].drop('ctyname', axis = 1).reindex()

In [23]:
df_2010_est.head()

Unnamed: 0,stname,2011,2012,2013,2014,2015
0,Alabama,4801108,4816089,4830533,4846411,4858979
68,Alaska,722720,731228,737442,737046,738432
98,Arizona,6468732,6553262,6630799,6728783,6828065
114,Arkansas,2938538,2949499,2957957,2966835,2978204
190,California,37700034,38056055,38414128,38792291,39144818


## 3.4 Melt year columns to single column

In [24]:
df_2010_est = df_2010_est.melt(id_vars='stname', value_vars=['2011', '2012', '2013', '2014', '2015'],
                               var_name='year', value_name='tot_pop')
df_2010_est.head()

Unnamed: 0,stname,year,tot_pop
0,Alabama,2011,4801108
1,Alaska,2011,722720
2,Arizona,2011,6468732
3,Arkansas,2011,2938538
4,California,2011,37700034


## 3.5 Combine three data sets into a single dataframe

In [25]:
df_pop = pd.concat([df_1990, df_2000, df_2010_est], axis = 0).sort_values(['stname', 'year'])
df_pop.year = df_pop.year.astype(int)

In [27]:
df_pop.head(30)

Unnamed: 0,stname,year,tot_pop
0,Alabama,1990,4152384
1,Alabama,1991,4877290
2,Alabama,1992,4517525
3,Alabama,1993,4449453
4,Alabama,1994,4665300
5,Alabama,1995,4986229
6,Alabama,1996,5101961
7,Alabama,1997,4955683
8,Alabama,1998,4490646
9,Alabama,1999,4685143


In [28]:
df_pop.to_csv('./data/data_cleaned/us_demographics_cleaned.csv', index = False)