In [1]:
from datetime import datetime
import logging
import os
import logging.handlers

import time
import requests
import json
import re
import csv
import pandas as pd

In [2]:
# NOTE: Every directory the program uses is declared here as a module-level constant.
# NOTE(review): MAIN_DIR is an absolute, machine-specific Windows path; it has to be
# adjusted before the notebook can run on another machine.
MAIN_DIR = "D:\\Code\\PYTHON\\BLS_SCRAPER\\"
DATA_DIR = MAIN_DIR + "Data\\"

CPI_DATA_DIR = DATA_DIR + "CPI\\"
LAUS_DATA_DIR = DATA_DIR + "LAUS\\"
FINAL_CPI_DATA_DIR = CPI_DATA_DIR + "Final\\"
FINAL_LAUS_DATA_DIR = LAUS_DATA_DIR + "Final\\"

# NOTE: The log folder is named after the run date and the log file itself after the run time.
# The clock is read exactly once so both names come from the same instant; two separate
# datetime.now() calls could straddle midnight and pair yesterday's folder with today's time.
_RUN_STARTED_AT = datetime.now()
LOG_DIR = MAIN_DIR + f"Log\\{_RUN_STARTED_AT.strftime('%Y%m%d')}\\"
LOG_FILE = LOG_DIR + f"Log_{_RUN_STARTED_AT.strftime('%H%M%S')}.log"

In [3]:
def directory_setup(dir_list):
    '''
    DESCRIPTION -> Creates every directory in dir_list that does not exist yet

    PARAM 1 -> dir_list -> Iterable of directory paths; missing parent folders are created too
    '''
    for directory in dir_list:
        # exist_ok=True makes the call idempotent and closes the gap between
        # "check that it is missing" and "create it", in which another process
        # could create the folder and make os.makedirs raise FileExistsError.
        os.makedirs(directory, exist_ok=True)

def logging_setup():
    '''
    DESCRIPTION -> Points the root logger at the log file for this run

    The file path is taken from the LOGFILE environment variable (default: LOG_FILE) and the
    level from the LOGLEVEL environment variable (default: "INFO").
    Handlers already attached to the root logger are removed and closed, so running this
    cell again neither duplicates log lines nor leaves the previous log file open.

    If the setup fails, the error is reported through the logging module (on stderr when the
    root logger has no handler) instead of being silently discarded. The function does not raise.
    '''
    root = logging.getLogger()
    try:
        handler = logging.handlers.WatchedFileHandler(os.environ.get("LOGFILE", LOG_FILE))
        formatter = logging.Formatter(fmt="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
        handler.setFormatter(formatter)

        # Old handlers are only dropped once the new one was built successfully.
        for stale_handler in list(root.handlers):
            root.removeHandler(stale_handler)
            stale_handler.close()
        root.addHandler(handler)

        # The root logger has no parent, so there is no propagation to switch off here.
        root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
        logging.info("Log File was created successfully.")
    except Exception:
        # basicConfig() only attaches a stderr handler when the root logger has none,
        # which guarantees the failure below is visible somewhere.
        logging.basicConfig()
        logging.exception("Logging setup failed; the log file may not be written.")

In [4]:
# NOTE: Every setup step is carried out in this cell: the folders are created first,
# then logging is configured (the log file lives inside LOG_DIR).
DIR_LIST = [
    MAIN_DIR,
    LOG_DIR,
    DATA_DIR,
    CPI_DATA_DIR,
    LAUS_DATA_DIR,
    FINAL_CPI_DATA_DIR,
    FINAL_LAUS_DATA_DIR,
]
directory_setup(dir_list=DIR_LIST)
logging_setup()

In [5]:
# Layout of a BLS LAUS series ID, using LAUCN281070000000003 as the example:
#   Positions    Value            Field Name
#   1-2          LA               Prefix
#   3            U                Seasonal Adjustment Code
#   4-18         CN2810700000000  Area Code
#   19-20        03               Measure Code

# os.path.join avoids the doubled separator that string concatenation produced,
# because LAUS_DATA_DIR already ends with a path separator.
laus_merge_df = pd.read_csv(os.path.join(LAUS_DATA_DIR, "clean_area.csv"))

# Local Area Unemployment
prefix = "LA"
# Seasonality Adjustment
seasonality = "U"
# Measure code
measure_code = "03"

# One series ID per row of the area table: prefix + seasonality + area code + measure code
laus_code_list = [
    f"{prefix}{seasonality}{laus_area_code}{measure_code}"
    for laus_area_code in laus_merge_df["area_code"]
]
logging.info(f"Total Unique LAUS Codes: {len(laus_code_list)}")

In [23]:
class bls_data_scraper:
    '''
    ------------------------------------------------------------------------------------------------------------
    -----------------------------------------------DESCRIPTION--------------------------------------------------
    ------------------------------------------------------------------------------------------------------------

    Posts one JSON request to the BLS API, cleans the returned data, enriches it with
    area (and, for CPI, item) information and appends the result to a CSV file.

    NOTE: All of this happens while the object is constructed, so creating an instance
    performs one HTTP request and one file append.

    ------------------------------------------------------------------------------------------------------------
    -----------------------------------------------PARAMETERS---------------------------------------------------
    ------------------------------------------------------------------------------------------------------------
    api_key -> API_KEY for BLS data queries
    out_file -> Location for Data to be outputted (rows are appended; the header is written only to an empty file)
    series_id -> All the series IDs that you want in this request
    start_year -> Query start range
    end_year -> Query end range
    area_df -> Dataframe containing information on area codes and names (needs an "area_code" column)
    item_df -> Dataframe containing information on item codes and names (needs an "item_code" column; only used for CPI)
    cpi_check -> If 1 then this is for CPI. Any other value (e.g. 0) is treated as LAUS.
    '''
    # Endpoint used for every request
    BLS_API_URL = 'https://api.bls.gov/publicAPI/v2/timeseries/data/'
    # (connect, read) timeouts in seconds, so a stalled connection cannot block the notebook forever
    REQUEST_TIMEOUT = (10, 120)

    def __init__(self, api_key, out_file, series_id, start_year, end_year, area_df, item_df, cpi_check):
        headers = {"Content-type": "application/json"}
        parameters = json.dumps({
                                "seriesid":series_id, 
                                "startyear":start_year, 
                                "endyear":end_year, 
                                "registrationkey":api_key
                                })
        self.area_df = area_df
        self.item_df = item_df
        self.cpi_check = cpi_check
        # Requests the data from BLS
        json_data = self.get_data(headers, parameters)
        # Processes the data from BLS
        df_data = self.process_data(json_data, area_df, item_df, cpi_check)
        # Writes the cleaned up data into the specified out_file
        self._append_to_csv(df_data, out_file)


    def get_data(self, headers, parameters):
        '''
        DESCRIPTION -> Posts the request to the BLS API and returns the decoded JSON answer

        PARAM 1 -> headers -> HTTP headers for the request (the JSON content type)
        PARAM 2 -> parameters -> JSON string with the data specification that you plan on querying

        RAISES -> a requests exception on timeout or on an HTTP error status
        '''
        post = requests.post(self.BLS_API_URL, data=parameters, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Fail here with the HTTP status instead of later with an unrelated parsing error
        post.raise_for_status()
        json_data = json.loads(post.text)

        # NOTE(review): per the BLS API v2 response format a successful answer carries
        # status "REQUEST_SUCCEEDED" and failures are explained under "message" - confirm against the BLS docs.
        if isinstance(json_data, dict) and json_data.get("status") != "REQUEST_SUCCEEDED":
            logging.warning("BLS API returned status %s: %s", json_data.get("status"), json_data.get("message"))
        return json_data


    def process_data(self, json_data, area_df, item_df, cpi_check):
        '''
        DESCRIPTION -> Cleans and enriches the JSON data that we just requested

        PARAM 1 -> json_data -> The raw JSON data we pulled from BLS
        PARAM 2 -> area_df -> The area code and name information
        PARAM 3 -> item_df -> The item code and name information (only used when cpi_check == 1)
        PARAM 4 -> cpi_check -> 1 for CPI series IDs, anything else for LAUS series IDs

        RETURNS -> Dataframe with one row per observation returned by BLS
        '''

        # NOTE: The observations are nested three levels deep (Results -> series -> data).
        # The series ID of each observation is carried along as the "ID" column.
        # NOTE(review): if json_data has no "Results"/"series" entries (e.g. a rejected request)
        # pandas raises here - presumably a KeyError; confirm.
        df = pd.json_normalize(json_data, record_path=["Results", "series", "data"], meta=[["Results", "series", "seriesID"]])
        df = df.rename(columns={"Results.series.seriesID": "ID"})

        if cpi_check == 1:
            # Parsing out the area_code and item_code from the entirety of the ID
            df["area_code"] = df["ID"].str[4:8]
            df["item_code"] = df["ID"].str[8:]

            # Enriching the data here
            df = pd.merge(df, area_df, how="left", on="area_code")
            df = pd.merge(df, item_df, how="left", on="item_code")
            # errors="ignore": a column that BLS did not return must not abort the run
            df = df.drop(columns=["area_code", "item_code", "footnotes"], errors="ignore")
        else:
            # Characters 4-18 of a LAUS series ID hold the area code
            df["area_code"] = df["ID"].str[3:18]
            # A left merge keeps exactly the observations BLS returned. An outer merge would
            # add one all-NaN row for every area of area_df that is not part of this request.
            df = pd.merge(df, area_df, how="left", on="area_code")
            # "latest" is only present when one of the returned observations carries it,
            # so missing columns are skipped instead of raising a KeyError.
            df = df.drop(columns=["latest", "footnotes", "period", "type_code", "area_name"], errors="ignore")
        return df


    def _append_to_csv(self, df_data, out_file):
        '''
        DESCRIPTION -> Appends df_data to out_file; the header row is only written while the file is still empty

        PARAM 1 -> df_data -> Dataframe to write
        PARAM 2 -> out_file -> Path of the CSV file (created when missing)
        '''
        # NOTE(review): the header comes from the first chunk written; later chunks are assumed
        # to have the same columns in the same order - verify for the CPI branch.
        with open(out_file , "a") as file:
            writer = csv.writer(file, delimiter=',', lineterminator='\n')
            # Nothing has been written through this handle yet, so a size of 0 means a new/empty file
            if os.stat(out_file).st_size==0:
                writer.writerow(df_data.columns.tolist())
            writer.writerows(df_data.values.tolist())
    
    '''
-----------------BLS LAUS API CALLS HERE--------------------
'''
# Drexel API Key: "024f5a0ca6e7494cbec2ea4088cd4a9d"
# GMAIL API Key: "73df4bb81189431089fe2f247af35ce1"
api_key = "73df4bb81189431089fe2f247af35ce1"
start_year = 2010
end_year = 2022
if os.path.exists(f"{FINAL_LAUS_DATA_DIR}\\bls_laus_data.csv"):
    try:
        os.remove(f"{FINAL_LAUS_DATA_DIR}\\bls_laus_data.csv")
        logging.info("Removed old file for data.")
    except:
            logging.info("Did not remove old file for data.")

# NOTE: Technically we don't need to make an exception for the LAUS dataset. However, since we only have one item this just makes life easier. 
# NOTE: If we wanted to though, this could just be one general method that works for both cases.
for x in range(0, len(laus_code_list), 50):
    code_chunk = laus_code_list[x:x+50]
    bls_data_scraper(api_key, f"{FINAL_LAUS_DATA_DIR}\\bls_laus_data.csv", code_chunk, start_year, end_year, laus_merge_df,0,0)
    time.sleep(2)
logging.info("Done with BLS LAUS API Calls")

laus_check    object
type_code     object
area_code     object
area_name     object
dtype: object
year          object
period        object
periodName    object
latest        object
value         object
footnotes     object
ID            object
area_code     object
dtype: object
