<a href="https://colab.research.google.com/github/ReidelVichot/LC_identification/blob/main/census_cleaning_11_18_24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Problem Definition

Background

There seems to be a relationship between the agglomeration of logistical activiey and air pollution. To assess this relationship, the author will use a difference in difference natural experiment design with a synthetic control group. The level of analysis is US contigous counties from 1998 to 2022.

Problem

To conduct a synthetic control group analysis, we need to have a set of covariates and controls to construct this synthetic control group. Using information from the US census, we can construct a set of variables that includes covariates and control variables for the analysis.
I will get variables for each county and each year, including total population, share of white, share of male, age groups, industries, and time of commuting.

#Data Collection

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# -- import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
# -- set directory path
dpath = "/content/drive/MyDrive/Disertation/Census/"
fname = "DECENNIALDPSF42000.DP1-Data.csv"
# -- create dataframes
decennial_00 = pd.read_csv(dpath + fname, skiprows=1)
# -- add year variable
decennial_00["year"] = 2000

#Data Cleaning

##Decennial 2000

In [26]:
# -- create GEOID
decennial_00["GEOID"] = decennial_00.Geography.str[-5:]

# -- select necesary columns
cols = ['GEOID','Geography', 'Geographic Area Name', 'Race/Ethnic Group',
       'Population Groups', 'Number!!Total population',
       'Number!!Total population!!SEX AND AGE!!Male',
       'Number!!Total population!!SEX AND AGE!!Female',
       'Number!!Total population!!SEX AND AGE!!Under 5 years',
       'Number!!Total population!!SEX AND AGE!!5 to 9 years',
       'Number!!Total population!!SEX AND AGE!!10 to 14 years',
       'Number!!Total population!!SEX AND AGE!!15 to 19 years',
       'Number!!Total population!!SEX AND AGE!!20 to 24 years',
       'Number!!Total population!!SEX AND AGE!!25 to 34 years',
       'Number!!Total population!!SEX AND AGE!!35 to 44 years',
       'Number!!Total population!!SEX AND AGE!!45 to 54 years',
       'Number!!Total population!!SEX AND AGE!!55 to 59 years',
       'Number!!Total population!!SEX AND AGE!!60 to 64 years',
       'Number!!Total population!!SEX AND AGE!!65 to 74 years',
       'Number!!Total population!!SEX AND AGE!!75 to 84 years',
       'Number!!Total population!!SEX AND AGE!!85 years and over',
       'Number!!Total population!!SEX AND AGE!!Median age (years)',
       'Number!!HOUSEHOLDS BY TYPE!!Households',
       'Number!!HOUSEHOLDS BY TYPE!!Households!!Average household size',
       'year']

decennial_00 = decennial_00[cols]

In [27]:
# -- rename columns for simplicity
renamed_columns = {'Number!!Total population': "total_population",
                   'Number!!Total population!!SEX AND AGE!!Male': "male",
                   'Number!!Total population!!SEX AND AGE!!Female': "female",
                   'Number!!Total population!!SEX AND AGE!!Under 5 years': "under_5_years",
                   'Number!!Total population!!SEX AND AGE!!5 to 9 years': '5_to_9_years',
                   'Number!!Total population!!SEX AND AGE!!10 to 14 years': '10_to_14_years',
                   'Number!!Total population!!SEX AND AGE!!15 to 19 years': '15_to_19_years',
                   'Number!!Total population!!SEX AND AGE!!20 to 24 years': '20_to_24_years',
                   'Number!!Total population!!SEX AND AGE!!25 to 34 years': '25_to_34_years',
                   'Number!!Total population!!SEX AND AGE!!35 to 44 years': '35_to_44_years',
                   'Number!!Total population!!SEX AND AGE!!45 to 54 years': '45_to_54_years',
                   'Number!!Total population!!SEX AND AGE!!55 to 59 years': '55_to_59_years',
                   'Number!!Total population!!SEX AND AGE!!60 to 64 years': '60_to_64_years',
                   'Number!!Total population!!SEX AND AGE!!65 to 74 years': '65_to_74_years',
                   'Number!!Total population!!SEX AND AGE!!75 to 84 years': '75_to_84_years',
                   'Number!!Total population!!SEX AND AGE!!85 years and over': '85_years_and_over',
                   'Number!!Total population!!SEX AND AGE!!Median age (years)': 'median_age',
                   'Number!!HOUSEHOLDS BY TYPE!!Households': 'households',
                   'Number!!HOUSEHOLDS BY TYPE!!Households!!Average household size': 'average_household_size'}
decennial_00.rename(columns=renamed_columns, inplace=True)

# -- create age groups
decennial_00["Age < 15"] = decennial_00['under_5_years'] + decennial_00['5_to_9_years'] + decennial_00['10_to_14_years']
decennial_00["Age 15-24"] = decennial_00['15_to_19_years'] + decennial_00['20_to_24_years']
decennial_00["Age 25-44"] = decennial_00['25_to_34_years'] + decennial_00['35_to_44_years']
decennial_00["Age 45-64"] = decennial_00['45_to_54_years'] + decennial_00['55_to_59_years'] + decennial_00['60_to_64_years']
decennial_00["Age >= 65"] = decennial_00['65_to_74_years'] + decennial_00['75_to_84_years'] + decennial_00['85_years_and_over']

# -- drop unnecesary columns
decennial_00.drop(columns= ['under_5_years', '5_to_9_years', '10_to_14_years',
                            '15_to_19_years', '20_to_24_years', '25_to_34_years',
                            '35_to_44_years', '45_to_54_years', '55_to_59_years',
                            '60_to_64_years', '65_to_74_years', '75_to_84_years',
                            '85_years_and_over'], inplace=True)

# -- create a variable for white population
white = decennial_00[decennial_00["Race/Ethnic Group"] == 2][["total_population"]].values.copy()

# -- add variable to the decennial dataframe
decennial_00 = decennial_00[decennial_00['Race/Ethnic Group'] == 1].reset_index(drop=True)
decennial_00["white"] = white
del white

decennial_00.drop(columns=['Geography', 'Geographic Area Name',
                           'Race/Ethnic Group', 'Population Groups'], inplace=True)

# -- organize columns
cols = ['GEOID', 'year', 'total_population', 'white', 'male', 'female', 'median_age',
       'households', 'average_household_size', 'Age < 15', 'Age 15-24',
       'Age 25-44', 'Age 45-64', 'Age >= 65']
decennial_00 = decennial_00[cols]


Unnamed: 0,GEOID,year,total_population,white,male,female,median_age,households,average_household_size,Age < 15,Age 15-24,Age 25-44,Age 45-64,Age >= 65
0,1001,2000,43671,34960,21177,22494,35.1,15972,2.72,10468,5614,13304,9843,4442
1,1003,2000,140415,122349,68682,71733,39.1,55356,2.5,28271,16134,39485,34851,21674
2,1005,2000,29038,14909,14980,14058,36.0,10432,2.52,6036,4106,8610,6371,3915
3,1007,2000,20826,15928,10721,10105,35.1,7383,2.65,4439,2847,6347,4779,2414
4,1009,2000,51024,48098,25370,25654,36.3,19153,2.63,10927,6395,14930,12310,6462


##ACS 2010

In [20]:
fname = "ACSST1Y2010.S0101-Data.csv"
acs1_10 = pd.read_csv(dpath + fname, skiprows=1)
fname = "ACSST5Y2010.S0101-Data.csv"
acs5_10 = pd.read_csv(dpath + fname, skiprows=1)

In [21]:
# -- create GEOID
acs5_10["GEOID"] = acs5_10.Geography.str[-5:]
# -- create year
acs5_10["year"] = 2010

# -- drop columns
cols = ['GEOID', 'year',
        'Total!!Estimate!!Total population',
        'Male!!Estimate!!Total population',
        'Female!!Estimate!!Total population',
        'Total!!Estimate!!AGE!!Under 5 years',
        'Total!!Estimate!!AGE!!5 to 9 years',
        'Total!!Estimate!!AGE!!10 to 14 years',
        'Total!!Estimate!!AGE!!15 to 19 years',
        'Total!!Estimate!!AGE!!20 to 24 years',
        'Total!!Estimate!!AGE!!25 to 29 years',
        'Total!!Estimate!!AGE!!30 to 34 years',
        'Total!!Estimate!!AGE!!35 to 39 years',
        'Total!!Estimate!!AGE!!40 to 44 years',
        'Total!!Estimate!!AGE!!45 to 49 years',
        'Total!!Estimate!!AGE!!50 to 54 years',
        'Total!!Estimate!!AGE!!55 to 59 years',
        'Total!!Estimate!!AGE!!60 to 64 years',
        'Total!!Estimate!!AGE!!65 to 69 years',
        'Total!!Estimate!!AGE!!70 to 74 years',
        'Total!!Estimate!!AGE!!75 to 79 years',
        'Total!!Estimate!!AGE!!80 to 84 years',
        'Total!!Estimate!!AGE!!85 years and over',
        'Total!!Estimate!!SUMMARY INDICATORS!!Median age (years)']

acs5_10 = acs5_10[cols]
# -- rename columns for simplicity
renamed_columns = {'Total!!Estimate!!Total population': "total_population",
                   'Male!!Estimate!!Total population': "male",
                   'Female!!Estimate!!Total population': "female",
                   'Total!!Estimate!!AGE!!Under 5 years': "under_5_years",
                   'Total!!Estimate!!AGE!!5 to 9 years': '5_to_9_years',
                   'Total!!Estimate!!AGE!!10 to 14 years': '10_to_14_years',
                   'Total!!Estimate!!AGE!!15 to 19 years': '15_to_19_years',
                   'Total!!Estimate!!AGE!!20 to 24 years': '20_to_24_years',
                   'Total!!Estimate!!AGE!!25 to 29 years': '25_to_29_years',
                   'Total!!Estimate!!AGE!!30 to 34 years': '30_to_34_years',
                   'Total!!Estimate!!AGE!!35 to 39 years': '35_to_39_years',
                   'Total!!Estimate!!AGE!!40 to 44 years': '40_to_44_years',
                   'Total!!Estimate!!AGE!!45 to 49 years': '45_to_49_years',
                   'Total!!Estimate!!AGE!!50 to 54 years': '50_to_54_years',
                   'Total!!Estimate!!AGE!!55 to 59 years': '55_to_59_years',
                   'Total!!Estimate!!AGE!!60 to 64 years': '60_to_64_years',
                   'Total!!Estimate!!AGE!!65 to 69 years': '65_to_69_years',
                   'Total!!Estimate!!AGE!!70 to 74 years': '70_to_74_years',
                   'Total!!Estimate!!AGE!!75 to 79 years': '75_to_79_years',
                   'Total!!Estimate!!AGE!!80 to 84 years': '80_to_84_years',
                   'Total!!Estimate!!AGE!!85 years and over': '85_years_and_over',
                   'Total!!Estimate!!SUMMARY INDICATORS!!Median age (years)': 'median_age'}
acs5_10.rename(columns=renamed_columns, inplace=True)

# -- create age groups
acs5_10["Age < 15"] = acs5_10['under_5_years'] + acs5_10['5_to_9_years'] + acs5_10['10_to_14_years']
acs5_10["Age 15-24"] = acs5_10['15_to_19_years'] + acs5_10['20_to_24_years']
acs5_10["Age 25-44"] = acs5_10['25_to_29_years'] + acs5_10['30_to_34_years'] + acs5_10['35_to_39_years'] + acs5_10['40_to_44_years']
acs5_10["Age 45-64"] = acs5_10['45_to_49_years'] + acs5_10['50_to_54_years'] + acs5_10['55_to_59_years'] + acs5_10['60_to_64_years']
acs5_10["Age >= 65"] = acs5_10['65_to_69_years'] + acs5_10['70_to_74_years'] + acs5_10['75_to_79_years'] + acs5_10['80_to_84_years'] + acs5_10['85_years_and_over']


# -- drop unnecesary columns
acs5_10.drop(columns=['under_5_years', '5_to_9_years', '10_to_14_years',
                      '15_to_19_years', '20_to_24_years', '25_to_29_years',
                      '30_to_34_years', '35_to_39_years', '40_to_44_years',
                      '45_to_49_years', '50_to_54_years', '55_to_59_years',
                      '60_to_64_years', '65_to_69_years', '70_to_74_years',
                      '75_to_79_years', '80_to_84_years', '85_years_and_over'], inplace=True)

In [29]:
acs5_10_dp05 = pd.read_csv(dpath + "ACSDP5Y2010.DP05-Data.csv", skiprows=1, low_memory=False)

In [31]:
acs5_10.head()

Unnamed: 0,GEOID,year,total_population,male,female,median_age,Age < 15,Age 15-24,Age 25-44,Age 45-64,Age >= 65
0,1001,2010,53155,25780,27375,36.2,22.1,14.1,27.4,24.9,11.5
1,1003,2010,175791,85902,89889,41.0,19.3,11.6,24.6,28.1,16.5
2,1005,2010,27699,14652,13047,38.0,18.4,13.4,27.3,27.0,13.9
3,1007,2010,22610,12162,10448,38.3,19.1,14.3,28.1,26.2,12.5
4,1009,2010,56692,28080,28612,38.3,20.5,12.4,26.5,26.5,14.3


In [36]:
# -- add white variable from ACS DP05
acs5_10_dp05 = pd.read_csv(dpath + "ACSDP5Y2010.DP05-Data.csv", skiprows=1, low_memory=False)
acs5_10_dp05 = acs5_10_dp05[['Geography',"Estimate!!RACE!!White"]]
acs5_10_dp05['GEOID'] = acs5_10_dp05.Geography.str[-5:]
acs5_10_dp05.drop(columns=['Geography'], inplace=True)
acs5_10_dp05.rename(columns={'Estimate!!RACE!!White': 'white'}, inplace=True)

acs5_10 = acs5_10.merge(acs5_10_dp05, on='GEOID').copy()
del acs5_10_dp05
acs5_10.head()



Unnamed: 0,GEOID,year,total_population,male,female,median_age,Age < 15,Age 15-24,Age 25-44,Age 45-64,Age >= 65,white_x,white_y,white
0,1001,2010,53155,25780,27375,36.2,22.1,14.1,27.4,24.9,11.5,42758,42758,42758
1,1003,2010,175791,85902,89889,41.0,19.3,11.6,24.6,28.1,16.5,153434,153434,153434
2,1005,2010,27699,14652,13047,38.0,18.4,13.4,27.3,27.0,13.9,13972,13972,13972
3,1007,2010,22610,12162,10448,38.3,19.1,14.3,28.1,26.2,12.5,19054,19054,19054
4,1009,2010,56692,28080,28612,38.3,20.5,12.4,26.5,26.5,14.3,54543,54543,54543


In [41]:
acs5_11 = pd.read_csv(dpath + "ACSDP5Y2011.DP05-Data.csv", skiprows=1, low_memory=False)

In [43]:
# -- create GEOID
acs5_11["GEOID"] = acs5_11.Geography.str[-5:]
# -- create year
acs5_11["year"] = 2011

# -- drop columns
cols = ['GEOID', 'year',
         'Estimate!!RACE!!White',
         'Estimate!!SEX AND AGE!!Under 5 years',
         'Estimate!!SEX AND AGE!!5 to 9 years',
         'Estimate!!SEX AND AGE!!10 to 14 years',
         'Estimate!!SEX AND AGE!!15 to 19 years',
         'Estimate!!SEX AND AGE!!18 years and over',
         'Estimate!!SEX AND AGE!!18 years and over',
         'Estimate!!SEX AND AGE!!20 to 24 years',
         'Estimate!!SEX AND AGE!!21 years and over',
         'Estimate!!SEX AND AGE!!25 to 34 years',
         'Estimate!!SEX AND AGE!!35 to 44 years',
         'Estimate!!SEX AND AGE!!45 to 54 years',
         'Estimate!!SEX AND AGE!!55 to 59 years',
         'Estimate!!SEX AND AGE!!60 to 64 years',
         'Estimate!!SEX AND AGE!!65 to 74 years',
         'Estimate!!SEX AND AGE!!65 years and over',
         'Estimate!!SEX AND AGE!!Female',
         'Estimate!!SEX AND AGE!!Male',
         'Estimate!!SEX AND AGE!!Median age (years)',
         'Estimate!!SEX AND AGE!!Total population',
         'Estimate!!Total housing units']
acs5_11 = acs5_11[cols]
acs5_11.head()


Unnamed: 0,GEOID,year,Estimate!!RACE!!White,Estimate!!SEX AND AGE!!10 to 14 years,Estimate!!SEX AND AGE!!15 to 19 years,Estimate!!SEX AND AGE!!18 years and over,Estimate!!SEX AND AGE!!18 years and over.1,Estimate!!SEX AND AGE!!20 to 24 years,Estimate!!SEX AND AGE!!21 years and over,Estimate!!SEX AND AGE!!25 to 34 years,...,Estimate!!SEX AND AGE!!Female,Estimate!!SEX AND AGE!!Female.1,Estimate!!SEX AND AGE!!Female.2,Estimate!!SEX AND AGE!!Male,Estimate!!SEX AND AGE!!Male.1,Estimate!!SEX AND AGE!!Male.2,Estimate!!SEX AND AGE!!Median age (years),Estimate!!SEX AND AGE!!Total population,Estimate!!SEX AND AGE!!Under 5 years,Estimate!!Total housing units
0,1001,2011,43366,4209,4484,39236,39236,3007,36973,6466,...,27770,27770,27770,26174,26174,26174,36.4,53944,3568,21859
1,1003,2011,157048,11989,11343,137929,137929,9262,131818,20594,...,91970,91970,91970,87553,87553,87553,41.4,179523,11109,102978
2,1005,2011,13956,1540,1743,21413,21413,1833,20512,3836,...,12897,12897,12897,14649,14649,14649,38.3,27546,1669,11903
3,1007,2011,17458,1439,1427,17383,17383,1593,16680,2735,...,10898,10898,10898,11848,11848,11848,39.1,22746,1307,8933
4,1009,2011,55182,4251,3912,43080,43080,3182,40967,6939,...,28783,28783,28783,28357,28357,28357,38.8,57140,3601,23728


In [None]:
acs5_11.head()