<a href="https://colab.research.google.com/github/RedaElmar/DWBI_ETL/blob/master/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/jayrav13/bls_local_area_unemployment.git

Cloning into 'bls_local_area_unemployment'...
remote: Enumerating objects: 67, done.[K
remote: Total 67 (delta 0), reused 0 (delta 0), pack-reused 67[K
Unpacking objects: 100% (67/67), done.


In [15]:
!pip install 2to3


Collecting 2to3
  Downloading https://files.pythonhosted.org/packages/15/31/e44eeb0bc18c5cb2df7c1914e00241da329c88ee4cd0d7139e716d4519c6/2to3-1.0-py3-none-any.whl
Installing collected packages: 2to3
Successfully installed 2to3-1.0


In [None]:
!2to3 -w bls/retrieve.py

In [None]:
!2to3 -w bls/transform.py

In [2]:
#--------------------------------------
#
#   This file cleans employment ane unemployment data by County from the BLS Local
#   Area Unemployment statistics: https://download.bls.gov/pub/time.series/la/
#
#   The code produces a .csv with data for:
#       - Employment (level)
#       - Unemployment (level)
#       - Unemployment rate
#       - Labor force (level)
#
#   Data are available monthly from 1990-2017 (can be updated with data from the website above).
#--------------------------------------

import pandas as pd
import requests

#------------------------------------------------------
# Download and save .TXT files from BLS website into current directory

BLS_url = 'https://download.bls.gov/pub/time.series/la/'

filenames = ['la.area',
              'la.data.0.CurrentU90-94', 'la.data.0.CurrentU95-99',
              'la.data.0.CurrentU00-04', 'la.data.0.CurrentU05-09',
              'la.data.0.CurrentU10-14','la.data.0.CurrentU15-19']

for xx in filenames:
    dls = BLS_url+xx
    resp = requests.get(dls)

    output = open(xx+'.txt', 'wb')
    output.write(resp.content)
    output.close()
# Import area information
df_areas = pd.read_table('la.area.txt')
df_areas = df_areas[['area_code', 'area_text']]

# Only keep county information
df_areas = df_areas.loc[df_areas['area_code'].str.contains('CN')]
df_areas.reset_index(drop=True, inplace=True)

# Rename columns
df_areas.columns = ['area_code', 'countyname']

# Get county and state information
tmp = df_areas['countyname'].str.split(', ', expand=True)
df_areas['countyname'] = tmp[0]
df_areas['state'] = tmp[1]

# Remove whitespace
df_areas['area_code'] = df_areas['area_code'].map(lambda x: x.strip())
df_areas['countyname'] = df_areas['countyname'].map(lambda x: x.strip())
#df_areas['state'] = df_areas['state'].map(lambda x: x.strip())    # Doesn't work when missing states?


#------------------------------------------------------

In [45]:

def get_BLS_county_data(BLS_data_path, df_areas):
    '''
    BLS_data_path : path for the text file containing the BLS data
    df_areas      : dataframe containing BLS information about counties/areas
    '''
    # Import area information
    col_types = {'series_id': str, 'year': int, 'period': str, 'value': str, 'footnote_codes': str}
    df_bls_county = pd.read_table(BLS_data_path, dtype=col_types)
    
    # Remove white space from code..
    df_bls_county['series_id'] = df_bls_county['series_id                     '].map(lambda x: x.strip())
    
    # Convert 'value' to numeric (kind of slow...)
    df_bls_county['value'] = df_bls_county['       value'].apply(pd.to_numeric, errors='coerce')

    # Get variable code
    df_bls_county['var_code'] = df_bls_county['series_id'].str[-2:]

    # Get area code
    df_bls_county['series_id'] = df_bls_county['series_id'].astype(str).str[3:].str[:-2]

    # Get FIPS code (as string to preserve initial zeros)
    df_bls_county['FIPS'] = df_bls_county['series_id'].str[2:7]

    #------------------------------------------------------------
    # Only keep rows corresponding to counties
    df_bls_county = df_bls_county.loc[df_bls_county['series_id'].str.contains('CN')]

    # Drop columns, reset index
    df_bls_county = df_bls_county[['series_id','year','period','value','var_code','FIPS']]
    df_bls_county.reset_index(drop=True, inplace=True)

    # Rename codes with variable names, rename columns
    df_bls_county['var_code'] = df_bls_county['var_code'].map({'03': 'Unemployment_Rate', '04': 'Unemployment',
                                                                 '05': 'Employment', '06': 'Labor_Force'})
    df_bls_county.columns = ['area_code', 'year', 'month', 'value','variable_name', 'FIPS']

    # Drop month 13 (I think this is the year average?)
    df_bls_county = df_bls_county.loc[df_bls_county['month']!='M13']
    # Convert month to numeric values
    df_bls_county['month'] = pd.to_numeric(df_bls_county['month'].str[1:])

    #------------------------------------------------------------
    # Merge area names and data
    df_bls_county = pd.merge(df_bls_county, df_areas, how='inner', on='area_code')

    # Convert to wide-format table
    df_bls_county = df_bls_county.pivot_table(values='value', index=['area_code', 'FIPS', 'state', 'countyname',
                                                            'year', 'month'], columns='variable_name')
    df_bls_county.reset_index(inplace=True)
    df_bls_county.columns.name = None
    #------------------------------------------------------------
    print('Done!')

    return df_bls_county


In [46]:
df_unemp_10_14 = get_BLS_county_data('la.data.0.CurrentU10-14.txt', df_areas)

Done!


In [None]:
df_unemp_10_14.tail(30)

In [41]:
df_unemp_10_14.columns

Index(['series_id                     ', 'year', 'period', '       value',
       'footnote_codes'],
      dtype='object')

In [50]:

#------------------------------------------------------------
# Import all years of data

df_unemp_00_04 = get_BLS_county_data('la.data.0.CurrentU00-04.txt', df_areas)
df_unemp_05_09 = get_BLS_county_data('la.data.0.CurrentU05-09.txt', df_areas)
df_unemp_10_14 = get_BLS_county_data('la.data.0.CurrentU10-14.txt', df_areas)
df_unemp_15_19 = get_BLS_county_data('la.data.0.CurrentU15-19.txt', df_areas)

#------------------------------------------------------------
# Merge all year's data
df_unemp_county = df_unemp_00_04
df_unemp_county = df_unemp_county.append(df_unemp_05_09)
df_unemp_county = df_unemp_county.append(df_unemp_10_14)
df_unemp_county = df_unemp_county.append(df_unemp_15_19)

# Sort by year-month
df_unemp_county = df_unemp_county.sort_values(by=['area_code', 'year', 'month'], axis=0)

# Save to CSV
df_unemp_county[['FIPS', 'state', 'countyname', 'year', 'month','Employment', 'Labor_Force', 'Unemployment', 'Unemployment_Rate']].to_csv('BLS_county_employment.csv', index=False)

Done!


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
Done!
Done!


In [54]:
!ls -l -sh

total 705M
 43M -rw-r--r-- 1 root root  43M Dec 21 12:05 BLS_county_employment.csv
1.3M -rw-r--r-- 1 root root 1.3M Dec 21 11:41 BLS_Industry_Data_by_County.csv
4.0K drwxr-xr-x 3 root root 4.0K Dec 21 10:50 data
428K -rw-r--r-- 1 root root 428K Dec 21 11:07 la.area.txt
112M -rw-r--r-- 1 root root 112M Dec 21 11:07 la.data.0.CurrentU00-04.txt
112M -rw-r--r-- 1 root root 112M Dec 21 11:07 la.data.0.CurrentU05-09.txt
114M -rw-r--r-- 1 root root 114M Dec 21 11:07 la.data.0.CurrentU10-14.txt
114M -rw-r--r-- 1 root root 114M Dec 21 11:07 la.data.0.CurrentU15-19.txt
106M -rw-r--r-- 1 root root 106M Dec 21 11:07 la.data.0.CurrentU90-94.txt
107M -rw-r--r-- 1 root root 107M Dec 21 11:07 la.data.0.CurrentU95-99.txt
4.0K drwxr-xr-x 1 root root 4.0K Dec  2 22:04 sample_data


In [1]:
!git clone https://github.com/RedaElmar/DWBI_ETL

Cloning into 'DWBI_ETL'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 15 (delta 0), reused 12 (delta 0), pack-reused 0[K
Unpacking objects: 100% (15/15), done.


In [6]:
data=pd.read_csv("DWBI_ETL/data/BLS_county_employment_scraped.csv")

In [13]:
data.head()

Unnamed: 0,FIPS,state,countyname,year,month,Employment,Labor_Force,Unemployment,Unemployment_Rate
0,1001,AL,Autauga County,2000,1,20719.0,21672.0,953.0,4.4
1,1001,AL,Autauga County,2000,2,20795.0,21779.0,984.0,4.5
2,1001,AL,Autauga County,2000,3,20872.0,21710.0,838.0,3.9
3,1001,AL,Autauga County,2000,4,20950.0,21631.0,681.0,3.1
4,1001,AL,Autauga County,2000,5,20879.0,21641.0,762.0,3.5


In [14]:
L=list(data.state.unique())

In [15]:
len(L)

51