# What This Notebook Does
This notebook creates the state-level analysis file for the period 1994 to 2023 by combining data from ACS and CPS IPUMS extracts for those years. State-level GDP over the corresponding period comes from the BEA's [United States Regional Economic Analysis Project](https://united-states.reaproject.org/), which I downloaded manually from their site.

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from ipumspy import readers, ddi
from Credentials import MyCredentials
from Functions import *
from pathlib import Path

# ACS

In [2]:
# Build the ACS individual-level and state-level frames, one IPUMS extract at a time.
# Per-extract results are collected in lists and concatenated once after the loop
# (growing a DataFrame with pd.concat inside the loop re-copies all earlier rows each pass).
AcsIndividualFrames = []
AcsStateFrames = []

# Create a path object to the data
DataDir = Path(Paths['acs'])

# Construct file list; sorted so the extracts are processed in a reproducible order
Files = sorted(DataDir.glob("*.xml"))

for f in Files:

    # Get the data dictionary. The local is called `codebook` so that it does not
    # overwrite the `ddi` module imported from ipumspy in the first cell.
    codebook = readers.read_ipums_ddi(f)
    df = readers.read_microdata(codebook, DataDir / codebook.file_description.filename)
    print('\n**********************************************************************\n' +
    f'Working file {f} corresponding to sample ' + str(df['YEAR'].iloc[0]) +
    '\n**********************************************************************\n')

    # Read data in and do some cleaning
    acs_df = (df
            .rename(columns=lambda x: x.lower())
            .pipe(lambda x: x[~x['uhrswork'].isin([0])])          # Dropping all the observations with no hours, or unable to report hours
            .drop(columns=['bpl'])                                # Use the detailed birthplace variable instead
            .rename(columns={'bpld':'bpl'})
            .assign(bpl = lambda x: x['bpl'].astype(str))         # Change this to string
            .assign(bpl = lambda x: x['bpl'].str.zfill(5))        # Uniform length 5
            .pipe(lambda x: x[~x['bpl'].str[0].isin(['8','9'])])  # Dropping those we can't identify a country of origin for
            .pipe(lambda x: x[x['uhrswork'] >= 35])               # Keep the full time workers
            .pipe(lambda x: x[x['age'] != 999])                   # Missing age
            .pipe(lambda x: x[x['age'] >= 16])                    # Drop if below the age of 16
            .pipe(lambda x: x[x['citizen'] != 9])                 # Drop not in universe for citizen variable
            .assign(incwage = lambda x: x['incwage'].replace([999999,999998],np.nan))  # Sentinel codes -> missing (np.NaN no longer exists in NumPy 2)
            .assign(year = lambda x: pd.to_datetime(x['year'],format='%Y'))
            .assign(ImmigrantGroup = lambda x: x['bpl'].apply(ImmigrantGroup)) # Assign immigrant groups following Peri 2012
            .assign(foreign = lambda x: (x['citizen'] != 0).astype(int))
            .pipe(lambda x: x.loc[:,['perwt','uhrswork','foreign','statefip','year','ImmigrantGroup','incwage',
                                     'occ','occsoc','educ','age']])
            )

    AcsIndividualFrames.append(acs_df) # Save the individual level data

    # Create State-ImmigrantGroup table using weighted sums.
    # Wage is the perwt-weighted mean of incwage over people with a non-missing wage:
    # both the numerator and the denominator skip missing wages, so a single missing
    # incwage no longer turns the whole cell into NaN.
    acs_collapse = (acs_df
                    .assign(_hours = lambda x: x['uhrswork'] * 52 * x['perwt'],
                            _wagebill = lambda x: x['incwage'] * x['perwt'],
                            _wagewt = lambda x: x['perwt'].where(x['incwage'].notna()))
                    .groupby(['ImmigrantGroup','foreign','statefip','year'])
                    .agg(HoursSupplied=('_hours','sum'),
                         BodiesSupplied=('perwt','sum'),
                         _wagebill=('_wagebill','sum'),
                         _wagewt=('_wagewt','sum'))
                    .assign(HoursSupplied = lambda x: x['HoursSupplied'] / 1e+6,   # Units, millions of hours
                            Wage = lambda x: x['_wagebill'] / x['_wagewt'])
                    .drop(columns=['_wagebill','_wagewt'])
                    .reset_index()
                    )

    AcsStateFrames.append(acs_collapse)

# Single concatenation; fall back to an empty frame when no extract was found
IndividualDfAcs = pd.concat(AcsIndividualFrames) if AcsIndividualFrames else pd.DataFrame()
StateDfAcs = pd.concat(AcsStateFrames) if AcsStateFrames else pd.DataFrame()

See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00178.xml corresponding to sample 2000
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00179.xml corresponding to sample 2001
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00180.xml corresponding to sample 2002
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00181.xml corresponding to sample 2003
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00182.xml corresponding to sample 2004
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00183.xml corresponding to sample 2005
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00184.xml corresponding to sample 2006
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00185.xml corresponding to sample 2007
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00186.xml corresponding to sample 2008
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00187.xml corresponding to sample 2009
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00188.xml corresponding to sample 2010
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00189.xml corresponding to sample 2011
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00190.xml corresponding to sample 2012
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00191.xml corresponding to sample 2013
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00192.xml corresponding to sample 2014
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00193.xml corresponding to sample 2015
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00194.xml corresponding to sample 2016
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00195.xml corresponding to sample 2017
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00196.xml corresponding to sample 2018
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00197.xml corresponding to sample 2019
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00198.xml corresponding to sample 2020
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00199.xml corresponding to sample 2021
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/acs/usa_00200.xml corresponding to sample 2022
**********************************************************************



  .apply(lambda x: pd.Series({


# CPS

In [3]:
# Build the CPS individual-level and state-level frames, one IPUMS extract at a time.
# Per-extract results are collected in lists and concatenated once after the loop
# (growing a DataFrame with pd.concat inside the loop re-copies all earlier rows each pass).
CpsIndividualFrames = []
CpsStateFrames = []

# Create a path object to the data folder
DataDir = Path(Paths['cps'])

# Construct a list of all codebooks in the CPS folder; sorted for a reproducible order
Files = sorted(DataDir.glob("*.xml"))

for f in Files:

    # Get the data dictionary. The local is called `codebook` so that it does not
    # overwrite the `ddi` module imported from ipumspy in the first cell.
    codebook = readers.read_ipums_ddi(f)
    df = readers.read_microdata(codebook, DataDir / codebook.file_description.filename)
    print('\n**********************************************************************\n' +
    f'Working file {f} corresponding to sample ' + str(df['YEAR'].iloc[0]) +
    '\n**********************************************************************\n')

    # Read data in and do some cleaning
    cps_df = (df
            .rename(columns=lambda x: x.lower())
            .pipe(lambda x: x[~x['uhrsworkt'].isin([0,997,999])]) # Dropping all the observations with no hours, or unable to report hours
            .assign(bpl = lambda x: x['bpl'].astype(str))         # Change this to string
            .assign(bpl = lambda x: x['bpl'].str.zfill(5))        # Uniform length 5
            # Compare the FIRST character, as the ACS cell does: after zfill(5) every code has
            # five characters, so testing the whole string against '8'/'9' never matched anything.
            .pipe(lambda x: x[~x['bpl'].str[0].isin(['8','9'])])  # Dropping those we can't identify a country of origin for
            .pipe(lambda x: x[x['uhrsworkt'] >= 35])              # Keep the full time workers
            .pipe(lambda x: x[x['age'] >= 16])                    # Drop if below the age of 16
            .pipe(lambda x: x[x['citizen'] != 9])                 # Drop not in universe for citizen variable
            # NOTE(review): the 8-digit codes are added on the assumption that IPUMS CPS uses
            # 99999999 / 99999998 for N.I.U. / missing INCWAGE -- confirm against the extract codebook.
            .assign(incwage = lambda x: x['incwage'].replace([999999,999998,99999999,99999998],np.nan))
            .assign(year = lambda x: pd.to_datetime(x['year'],format='%Y'))
            .assign(ImmigrantGroup = lambda x: x['bpl'].apply(ImmigrantGroup)) # Assign immigrant groups following Peri 2012
            .assign(foreign = lambda x: x['citizen'].isin([3,4,5]).astype(int))
            .drop(columns=['cpsid','cpsidv','cpsidp','asecwth','asecflag', 'month','serial','pernum','bpl','citizen',
                            'occ2010','occ1990','ind1990'])
            )

    CpsIndividualFrames.append(cps_df) # Save individual data

    # Create State-ImmigrantGroup table using weighted sums.
    # Wage is the asecwt-weighted mean of incwage over people with a non-missing wage:
    # both the numerator and the denominator skip missing wages, so a single missing
    # incwage no longer turns the whole cell into NaN.
    cps_collapse = (cps_df
                    .assign(_hours = lambda x: x['uhrsworkt'] * 52 * x['asecwt'],
                            _wagebill = lambda x: x['incwage'] * x['asecwt'],
                            _wagewt = lambda x: x['asecwt'].where(x['incwage'].notna()))
                    .groupby(['ImmigrantGroup','foreign','statefip','year'])
                    .agg(HoursSupplied=('_hours','sum'),
                         BodiesSupplied=('asecwt','sum'),
                         _wagebill=('_wagebill','sum'),
                         _wagewt=('_wagewt','sum'))
                    .assign(HoursSupplied = lambda x: x['HoursSupplied'] / 1e+6,   # Units, millions of hours
                            Wage = lambda x: x['_wagebill'] / x['_wagewt'])
                    .drop(columns=['_wagebill','_wagewt'])
                    .reset_index()
                    )

    CpsStateFrames.append(cps_collapse)

# Single concatenation; fall back to an empty frame when no extract was found
IndividualDfCps = pd.concat(CpsIndividualFrames) if CpsIndividualFrames else pd.DataFrame()
StateDfCps = pd.concat(CpsStateFrames) if CpsStateFrames else pd.DataFrame()

See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/cps/cps_00030.xml corresponding to sample 1994
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/cps/cps_00031.xml corresponding to sample 1995
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/cps/cps_00032.xml corresponding to sample 1996
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/cps/cps_00033.xml corresponding to sample 1997
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/cps/cps_00034.xml corresponding to sample 1998
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/cps/cps_00035.xml corresponding to sample 1999
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/cps/cps_00036.xml corresponding to sample 2023
**********************************************************************



  .apply(lambda x: pd.Series({
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.



**********************************************************************
Working file ../data/cps/cps_00037.xml corresponding to sample 2024
**********************************************************************



  .apply(lambda x: pd.Series({


# Merging ACS, CPS

In [4]:
# Preview the first rows of the CPS individual-level frame
IndividualDfCps.head()

Unnamed: 0,year,statefip,asecwt,age,occ,uhrsworkt,educ,incwage,ImmigrantGroup,foreign
6,1994-01-01,23,838.36,44,208,40,92,38000,United States,0
10,1994-01-01,23,566.21,51,243,60,73,20800,Rest of Asia,1
15,1994-01-01,23,959.87,28,23,38,111,40000,Other,1
16,1994-01-01,23,893.33,26,23,38,111,40000,United States,0
17,1994-01-01,23,838.36,44,84,99,124,24400,United States,0


In [5]:
# Preview the first rows of the ACS individual-level frame
IndividualDfAcs.head()

Unnamed: 0,perwt,uhrswork,foreign,statefip,year,ImmigrantGroup,incwage,occ,occsoc,educ,age
7,24.0,50,0,44,2000-01-01,United States,150000,306,291060,11,43
12,31.0,40,0,44,2000-01-01,United States,1500,330,292010,6,19
13,16.0,50,0,44,2000-01-01,United States,42600,570,436010,11,62
18,21.0,42,0,44,2000-01-01,United States,97000,470,411011,5,38
20,29.0,40,1,44,2000-01-01,Rest of Asia,17000,220,251000,11,27


In [7]:
# Stack the two surveys row-wise, CPS first and ACS second, for both the
# state-level cells and the person-level records. Columns present in only
# one survey are left missing for rows from the other.
StateDf = pd.concat([StateDfCps, StateDfAcs], axis=0)
IndividualDf = pd.concat([IndividualDfCps, IndividualDfAcs], axis=0)
IndividualDf.head(n=5)

Unnamed: 0,year,statefip,asecwt,age,occ,uhrsworkt,educ,incwage,ImmigrantGroup,foreign,perwt,uhrswork,occsoc
6,1994-01-01,23,838.36,44,208,40,92,38000,United States,0,,,
10,1994-01-01,23,566.21,51,243,60,73,20800,Rest of Asia,1,,,
15,1994-01-01,23,959.87,28,23,38,111,40000,Other,1,,,
16,1994-01-01,23,893.33,26,23,38,111,40000,United States,0,,,
17,1994-01-01,23,838.36,44,84,99,124,24400,United States,0,,,


In [9]:
# Write both intermediate frames to the data folder without the row index.
# Neither is the analysis file yet: more state-level variables are merged into the
# state file later, and the individual file still needs ONET incorporated.
for frame, filename in ((StateDf, '/AcsCpsStateData.csv'),
                        (IndividualDf, '/AcsCpsIndividualData.csv')):
    frame.to_csv(Paths['data'] + filename, index = False)

# State GDP Data

In [30]:
# Rows in the BEA files that describe multi-state regions rather than individual states
BEA_REGION_NAMES = ['Far West', 'Rocky Mountain', 'Southwest',
                    'Southeast','Plains', 'Great Lakes','Mideast',
                    'New England']


def _load_state_gdp(csv_path, exclude_years=()):
    """Read one BEA state GDP csv and return it in long form.

    Parameters
    ----------
    csv_path : str
        Location of the BEA csv file.
    exclude_years : iterable of str, optional
        Year column labels (e.g. '1997') to drop after reshaping.

    Returns
    -------
    pandas.DataFrame
        Columns GeoName, statefip (first two characters of the cleaned GeoFIPS),
        year (datetime) and NGdp (float), restricted to the
        'All industry total' line and excluding the '00' FIPS row and the
        BEA regional aggregates.
    """
    return (pd.read_csv(csv_path)
            .assign(statefip = lambda x: x['GeoFIPS'].str.replace('"','').str.replace(' ','').str[0:2])
            .pipe(lambda x: x[x['statefip'] != '00'])
            .pipe(lambda x: x[x['GeoName'].notna()])              # Rows without a GeoName (np.NaN no longer exists in NumPy 2)
            .pipe(lambda x: x[~x['GeoName'].isin(BEA_REGION_NAMES)])
            # Strip before comparing: one file writes the label with a trailing space
            .pipe(lambda x: x[x['Description'].str.strip() == 'All industry total'])
            .drop(columns=['GeoFIPS','Region','TableName','LineCode','IndustryClassification',
                           'Description', 'Unit'])
            .pipe(lambda x: pd.melt(x,id_vars=['GeoName','statefip']))
            .rename(columns={'variable':'year','value':'NGdp'})
            .pipe(lambda x: x[~x['year'].isin(list(exclude_years))])
            .assign(year = lambda x: pd.to_datetime(x['year'], format='%Y'))
            .assign(NGdp = lambda x: x['NGdp'].astype(float))
            )


# 1997 appears in both files; it is taken from the second file only
Gdp63to96 = _load_state_gdp(Paths['gdp'] + '/SAGDP_SIC/SAGDP2S__ALL_AREAS_1963_1997.csv',
                            exclude_years=['1997'])
Gdp97to23 = _load_state_gdp(Paths['gdp'] + '/SAGDP/SAGDP2N__ALL_AREAS_1997_2023.csv')

GdpAllYears = pd.concat([Gdp63to96,Gdp97to23]).rename(columns={'GeoName':'StateName'})

In [31]:
# Inspect the column dtypes of the combined GDP frame
GdpAllYears.dtypes

StateName            object
statefip             object
year         datetime64[ns]
NGdp                float64
dtype: object

# Merging GDP, ACS/CPS and State Capital Stock Estimates

In [32]:
# Reload the state-level ACS/CPS file. statefip is read as text and padded back
# to two characters; year is parsed back into a datetime.
acs_cps_csv = Paths['data'] + '/AcsCpsStateData.csv'
AcsCpsDf = pd.read_csv(acs_cps_csv, dtype={'statefip':'object'}).assign(
    statefip = lambda d: d['statefip'].str.zfill(2),
    year = lambda d: pd.to_datetime(d['year']),
)
AcsCpsDf.head()

Unnamed: 0,ImmigrantGroup,foreign,statefip,year,HoursSupplied,BodiesSupplied,Wage
0,Canada-Australia-New Zealand,1,1,1994-01-01,8.951427,4148.02,32500.0
1,Canada-Australia-New Zealand,1,2,1994-01-01,6.091535,2714.98,29623.315641
2,Canada-Australia-New Zealand,1,4,1994-01-01,20.506824,9182.54,14849.310878
3,Canada-Australia-New Zealand,1,6,1994-01-01,111.973079,46027.9,44158.675754
4,Canada-Australia-New Zealand,1,8,1994-01-01,33.771626,13001.83,40330.894232


In [33]:
# Load the state capital stock series and turn its year column into a datetime
# (via its string form) so that it can be joined on `year` with the other frames.
capital_stock_dta = Paths['data'] + '/CapitalStockByState.dta'
KSeries = pd.read_stata(capital_stock_dta).assign(
    year = lambda k: pd.to_datetime(k['year'].astype(str))
)
KSeries.head()

Unnamed: 0,StateName,year,statefip,K
0,Alabama,1994-01-01,1,159398.3
1,Alaska,1994-01-01,2,58737.85
2,Arizona,1994-01-01,4,226487.6
3,Arkansas,1994-01-01,5,96565.74
4,California,1994-01-01,6,2157399.0


In [34]:
# Attach state GDP to every ACS/CPS cell, keeping all ACS/CPS rows.
# Only the 2024 observations are expected to stay unmatched, because BEA has not
# released those estimates yet.
AcsCpsBeaMerged = AcsCpsDf.merge(GdpAllYears, how='left', on=['year','statefip'])
AcsCpsBeaMerged.head()

Unnamed: 0,ImmigrantGroup,foreign,statefip,year,HoursSupplied,BodiesSupplied,Wage,StateName,NGdp
0,Canada-Australia-New Zealand,1,1,1994-01-01,8.951427,4148.02,32500.0,Alabama,90098.5
1,Canada-Australia-New Zealand,1,2,1994-01-01,6.091535,2714.98,29623.315641,Alaska,23604.7
2,Canada-Australia-New Zealand,1,4,1994-01-01,20.506824,9182.54,14849.310878,Arizona,100374.2
3,Canada-Australia-New Zealand,1,6,1994-01-01,111.973079,46027.9,44158.675754,California,861360.0
4,Canada-Australia-New Zealand,1,8,1994-01-01,33.771626,13001.83,40330.894232,Colorado,104506.8


In [35]:
# Attach the capital stock K, matching on state code, year and state name,
# and keeping every row of the ACS/CPS/GDP frame.
AcsCpsBeaCapital = AcsCpsBeaMerged.merge(KSeries, how='left', on=['statefip','year', 'StateName'])
AcsCpsBeaCapital.head()

Unnamed: 0,ImmigrantGroup,foreign,statefip,year,HoursSupplied,BodiesSupplied,Wage,StateName,NGdp,K
0,Canada-Australia-New Zealand,1,1,1994-01-01,8.951427,4148.02,32500.0,Alabama,90098.5,159398.3
1,Canada-Australia-New Zealand,1,2,1994-01-01,6.091535,2714.98,29623.315641,Alaska,23604.7,58737.85
2,Canada-Australia-New Zealand,1,4,1994-01-01,20.506824,9182.54,14849.310878,Arizona,100374.2,226487.6
3,Canada-Australia-New Zealand,1,6,1994-01-01,111.973079,46027.9,44158.675754,California,861360.0,2157399.0
4,Canada-Australia-New Zealand,1,8,1994-01-01,33.771626,13001.83,40330.894232,Colorado,104506.8,235920.6


# Add in the Price Deflators

In [36]:
# Load the deflator file; its unnamed first column holds the date and becomes `year`.
PriceDf = (pd.read_csv(Paths['data'] + '/GdpPriceDeflator.csv')
           .rename(columns={'Unnamed: 0':'year','0':'P'})
           .assign(year = lambda p: pd.to_datetime(p['year'])))

# Left-join the deflators, then keep only rows that found a match.
# Unmatched rows are the 2024 dates, for which deflator data is not available yet.
# NOTE: this rebinds AcsCpsBeaMerged, which an earlier cell also defines.
AcsCpsBeaMerged = (AcsCpsBeaCapital
                   .merge(PriceDf, on='year', how='left', indicator=True)
                   .loc[lambda d: d['_merge'] != 'left_only']
                   .drop(columns='_merge'))
AcsCpsBeaMerged.head()

Unnamed: 0,ImmigrantGroup,foreign,statefip,year,HoursSupplied,BodiesSupplied,Wage,StateName,NGdp,K,PriceDeflator,InvestmentDeflator
0,Canada-Australia-New Zealand,1,1,1994-01-01,8.951427,4148.02,32500.0,Alabama,90098.5,159398.3,65.564,80.969
1,Canada-Australia-New Zealand,1,2,1994-01-01,6.091535,2714.98,29623.315641,Alaska,23604.7,58737.85,65.564,80.969
2,Canada-Australia-New Zealand,1,4,1994-01-01,20.506824,9182.54,14849.310878,Arizona,100374.2,226487.6,65.564,80.969
3,Canada-Australia-New Zealand,1,6,1994-01-01,111.973079,46027.9,44158.675754,California,861360.0,2157399.0,65.564,80.969
4,Canada-Australia-New Zealand,1,8,1994-01-01,33.771626,13001.83,40330.894232,Colorado,104506.8,235920.6,65.564,80.969


# Clean the Pre-Period-Data

In [37]:
# Build weighted person counts by immigrant group, state and year from the
# pre-period extracts. Per-extract tables are collected in a list and concatenated
# once after the loop instead of re-copying the accumulated frame on every pass.
PreperiodFrames = []

# Create a path object to the data
DataDir = Path(Paths['preperiod'])

# Construct file list; sorted so the extracts are processed in a reproducible order
Files = sorted(DataDir.glob("*.xml"))

for f in Files:

    # Get the data dictionary. The local is called `codebook` so that it does not
    # overwrite the `ddi` module imported from ipumspy in the first cell.
    codebook = readers.read_ipums_ddi(f)
    df = readers.read_microdata(codebook, DataDir / codebook.file_description.filename)

    # Read data in and do some cleaning
    preperiod_df = (df
            .rename(columns=lambda x: x.lower())
            .drop(columns=['bpl'])                                # We will use the detailed variable
            .rename(columns={'bpld':'bpl'})
            .assign(bpl = lambda x: x['bpl'].astype(str))         # Change this to string
            .assign(bpl = lambda x: x['bpl'].str.zfill(5))        # Uniform length 5
            .pipe(lambda x: x[~x['bpl'].str[0].isin(['8','9'])])  # Dropping those we can't identify a country of origin for
            .pipe(lambda x: x[x['age'] != 999])                   # Missing age
            .pipe(lambda x: x[x['age'] >= 16])                    # Drop if below the age of 16
            .assign(year = lambda x: pd.to_datetime(x['year'],format='%Y'))
            .assign(ImmigrantGroup = lambda x: x['bpl'].apply(ImmigrantGroup)) # Assign immigrant groups following Peri 2012
            .pipe(lambda x: x.loc[:,['perwt','statefip','year','ImmigrantGroup']])
            )

    # Create State-ImmigrantGroup table: Count is the sum of person weights in each cell
    preperiod_collapse = (preperiod_df
                    .groupby(['ImmigrantGroup','statefip','year'])['perwt']
                    .sum()
                    .astype(float)
                    .rename('Count')
                    .reset_index()
                    .assign(statefip = lambda x: x['statefip'].astype(str).str.zfill(2))
                    )

    PreperiodFrames.append(preperiod_collapse)

# Single concatenation; fall back to an empty frame when no extract was found
Df = pd.concat(PreperiodFrames) if PreperiodFrames else pd.DataFrame()

See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.
See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.


In [38]:
# Prepare these for a large pivot
Df_Wide = (Df
           .assign(yearstr = lambda x: '_' + x['year'].astype(str).str[0:4])
           .assign(ImmigrantGroup = lambda x: x['ImmigrantGroup'].replace(
               {'Canada-Australia-New Zealand':'CaAuNz','United States':'US', 'Western Europe':'WestEu',
                'Latin America':'LA', 'Russia and Eastern Europe':'EastEu',
                'Rest of Asia':'AsiaOther'}))
           .assign(groupyear = lambda x: x['ImmigrantGroup'] + x['yearstr'])
           .drop(columns=['ImmigrantGroup', 'year', 'yearstr'])
           .pivot(columns=['groupyear'], index=['statefip'])
           .fillna(0)
           .pipe(lambda x: x.droplevel(0,axis=1))
           .reset_index()
           )

Df_Wide.head()


groupyear,statefip,Africa_1920,Africa_1930,Africa_1940,Africa_1950,Africa_1960,AsiaOther_1920,AsiaOther_1930,AsiaOther_1940,AsiaOther_1950,...,US_1920,US_1930,US_1940,US_1950,US_1960,WestEu_1920,WestEu_1930,WestEu_1940,WestEu_1950,WestEu_1960
0,1,99.63,0.0,0.0,0.0,0.0,398.52,1615.2,1554.0,820.0,...,1382884.76,1622670.3,1829254.0,1995881.0,2069707.0,10215.14,9691.2,9676.0,7065.0,5779.0
1,2,0.0,0.0,0.0,0.0,0.0,816.81,200.0,0.0,0.0,...,26554.32,30100.0,0.0,0.0,134994.0,3277.67,2700.0,0.0,0.0,1891.0
2,4,97.2,0.0,0.0,55.0,99.0,921.55,302.85,900.0,313.0,...,149890.63,225724.2,279896.0,459518.0,761075.0,10230.14,8984.55,5857.0,8264.0,12844.0
3,5,0.0,0.0,100.0,0.0,0.0,225.26,201.9,500.0,121.0,...,1018332.64,1167991.5,1269238.0,1262126.0,1172489.0,12325.25,6763.65,10029.0,5795.0,4287.0
4,6,314.1,1009.5,2000.0,2634.0,3788.0,61009.82,82577.1,70256.0,77795.0,...,1818134.64,3259372.65,4235989.0,6851779.0,9396205.0,357512.09,469215.6,417933.0,436017.0,453757.0


# Merge Acs/Cps with Pre Period

In [39]:
# Attach the wide pre-period counts to every state-year-group row by state code,
# keeping all rows of the ACS/CPS/BEA frame.
StateAnalysis = AcsCpsBeaMerged.merge(Df_Wide, how='left', on=['statefip'])
StateAnalysis.head()

Unnamed: 0,ImmigrantGroup,foreign,statefip,year,HoursSupplied,BodiesSupplied,Wage,StateName,NGdp,K,...,US_1920,US_1930,US_1940,US_1950,US_1960,WestEu_1920,WestEu_1930,WestEu_1940,WestEu_1950,WestEu_1960
0,Canada-Australia-New Zealand,1,1,1994-01-01,8.951427,4148.02,32500.0,Alabama,90098.5,159398.3,...,1382884.76,1622670.3,1829254.0,1995881.0,2069707.0,10215.14,9691.2,9676.0,7065.0,5779.0
1,Canada-Australia-New Zealand,1,2,1994-01-01,6.091535,2714.98,29623.315641,Alaska,23604.7,58737.85,...,26554.32,30100.0,0.0,0.0,134994.0,3277.67,2700.0,0.0,0.0,1891.0
2,Canada-Australia-New Zealand,1,4,1994-01-01,20.506824,9182.54,14849.310878,Arizona,100374.2,226487.6,...,149890.63,225724.2,279896.0,459518.0,761075.0,10230.14,8984.55,5857.0,8264.0,12844.0
3,Canada-Australia-New Zealand,1,6,1994-01-01,111.973079,46027.9,44158.675754,California,861360.0,2157399.0,...,1818134.64,3259372.65,4235989.0,6851779.0,9396205.0,357512.09,469215.6,417933.0,436017.0,453757.0
4,Canada-Australia-New Zealand,1,8,1994-01-01,33.771626,13001.83,40330.894232,Colorado,104506.8,235920.6,...,521164.89,630129.9,700539.0,885157.0,1082450.0,51065.17,40077.15,31132.0,24838.0,24110.0


In [40]:
# Export the analysis file to Stata format without the row index;
# `year` is written using the 'ty' date conversion.
stata_path = Paths['data'] + '/StateAnalysisFile.dta'
StateAnalysis.to_stata(stata_path, convert_dates={'year':'ty'}, write_index=False)