# Explore NAICS dataset 

Author: Sahar H. El Abbadi
Start date: March 20, 2024 

Purpose: Load NAICS dataset to check if facilities in the biosolids dataset have NAICS codes

In [20]:
# Setup: 
import pandas as pd
import pathlib
from tqdm import tqdm

# Register tqdm's progress_apply / progress_map methods on pandas objects
tqdm.pandas()

# Read the national NAICS file from the raw data folder
naics_csv_path = pathlib.PurePath('01_raw_data', 'NATIONAL_NAICS_FILE.CSV')
naics = pd.read_csv(naics_csv_path)

In [10]:
# Keep only the records whose program system acronym is NPDES
is_npdes = naics['PGM_SYS_ACRNM'].eq('NPDES')
naics_npdes = naics.loc[is_npdes]

display(naics_npdes)

Unnamed: 0,REGISTRY_ID,PGM_SYS_ACRNM,PGM_SYS_ID,INTEREST_TYPE,NAICS_CODE,PRIMARY_INDICATOR,CODE_DESCRIPTION
0,110000491735,NPDES,AKR06AE60,STORM WATER INDUSTRIAL,424710,PRIMARY,PETROLEUM BULK STATIONS AND TERMINALS.
4,110000491735,NPDES,AKR06AE60,ICIS-NPDES NON-MAJOR,424710,PRIMARY,PETROLEUM BULK STATIONS AND TERMINALS.
11,110000491744,NPDES,AK0000370,ICIS-NPDES NON-MAJOR,424710,PRIMARY,PETROLEUM BULK STATIONS AND TERMINALS.
22,110000491780,NPDES,AKR06AB58,STORM WATER INDUSTRIAL,424710,PRIMARY,PETROLEUM BULK STATIONS AND TERMINALS.
24,110000491780,NPDES,AKR06AB58,ICIS-NPDES NON-MAJOR,424710,PRIMARY,PETROLEUM BULK STATIONS AND TERMINALS.
...,...,...,...,...,...,...,...
2169296,110064241850,NPDES,WYG650007,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
2169318,110069422976,NPDES,WYR001454,STORM WATER INDUSTRIAL,484220,PRIMARY,SPECIALIZED FREIGHT (EXCEPT USED GOODS) TRUCKI...
2169322,110069422976,NPDES,WYR001454,ICIS-NPDES NON-MAJOR,484220,PRIMARY,SPECIALIZED FREIGHT (EXCEPT USED GOODS) TRUCKI...
2169410,110070110354,NPDES,WYPU00103,ICIS-NPDES MINOR,333132,PRIMARY,OIL AND GAS FIELD MACHINERY AND EQUIPMENT MANU...


## Identify relevant NAICS codes 

Data source: https://www.ibisworld.com/classifications/naics/221320/sewage-treatment-facilities/

- 221310 = Water supply and irrigation systems 
- 221320 = Sewage treatment facilities 

In [15]:
# Look at interest type 

# Narrow the NPDES records to those whose INTEREST_TYPE is 'BIOSOLIDS'
naics_npdes_biosolids = naics_npdes[naics_npdes['INTEREST_TYPE'] == 'BIOSOLIDS']
display(naics_npdes_biosolids)

# Count how often each NAICS description occurs among the biosolids records.
# Only the last expression of a cell is rendered automatically, so the counts
# have to be passed to display() explicitly to show up in the output.
display(naics_npdes_biosolids['CODE_DESCRIPTION'].value_counts())

# Save the biosolids subset to the clean data folder (row index is not written)
naics_npdes_biosolids.to_csv(pathlib.PurePath('02_clean_data', 'naics_npdes_biosolids.csv'), index=False)


Unnamed: 0,REGISTRY_ID,PGM_SYS_ACRNM,PGM_SYS_ID,INTEREST_TYPE,NAICS_CODE,PRIMARY_INDICATOR,CODE_DESCRIPTION
119,110000759162,NPDES,AK0021474,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
124,110000760917,NPDES,AK0021458,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
125,110000761453,NPDES,AK0021385,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
912,110009005030,NPDES,AK0022951,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
978,110010112261,NPDES,AK0021890,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
...,...,...,...,...,...,...,...
2162907,110009762970,NPDES,WYG650009,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
2163565,110012801502,NPDES,WYG650001,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
2163793,110018910400,NPDES,WYG650003,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.
2163804,110020145387,NPDES,WYG650012,BIOSOLIDS,221320,PRIMARY,SEWAGE TREATMENT FACILITIES.


## Check biosolids facilities without sewer SIC codes 

Load the pickle file biosolids_data_sic_codes_not_sewer.pkl - this is the file with SIC codes that are not wastewater related. 

Check if any of these permits have a hit in the NAICS biosolids / sewer systems dataframe

In [46]:
# Read the saved biosolids subset of the NAICS table; the lookup function below searches it
naics_biosolids_csv_path = pathlib.PurePath('02_clean_data', 'naics_npdes_biosolids.csv')
naics_npdes_biosolids = pd.read_csv(naics_biosolids_csv_path)
def check_naics_biosolid_codes(npdes_id, naics_df=None):
    """Return the NAICS code descriptions recorded for a NPDES permit ID.

    Parameters
    ----------
    npdes_id : str
        NPDES permit number to look up, e.g. 'UTL020061'.
    naics_df : pandas.DataFrame, optional
        Table to search; it must have 'PGM_SYS_ID' and 'CODE_DESCRIPTION'
        columns. Defaults to the notebook-level ``naics_npdes_biosolids``
        dataframe.

    Returns
    -------
    list
        The 'CODE_DESCRIPTION' value of every row whose 'PGM_SYS_ID' contains
        ``npdes_id`` as a literal substring. Empty when nothing matches or
        when ``npdes_id`` is missing, empty, or not a string.
    """
    if naics_df is None:
        naics_df = naics_npdes_biosolids

    # A missing (NaN/None), empty, or non-string permit ID cannot identify a
    # facility, so report "no match" instead of raising or matching every row.
    if not isinstance(npdes_id, str) or not npdes_id:
        return []

    # regex=False: match the permit ID literally rather than as a regular
    # expression. na=False: rows without a PGM_SYS_ID count as non-matching,
    # which keeps the mask strictly boolean for the indexing below.
    npdes_mask = naics_df['PGM_SYS_ID'].str.contains(npdes_id, regex=False, na=False)
    matched_npdes = naics_df[npdes_mask]
    return matched_npdes['CODE_DESCRIPTION'].tolist()


In [48]:
# Load the biosolids records from the pickle file
sic_not_sewer_pickle_path = pathlib.PurePath('05_pickle_files', 'biosolids_data_sic_codes_not_sewer.pkl')
biosolids_sic_not_sewer = pd.read_pickle(sic_not_sewer_pickle_path)

# Look up the NAICS descriptions for every NPDES ID (with a progress bar)
npdes_ids = biosolids_sic_not_sewer['NPDES ID']
biosolids_sic_not_sewer['naics_description'] = npdes_ids.progress_apply(check_naics_biosolid_codes)

# Keep only the rows for which at least one NAICS description was found
has_naics_match = biosolids_sic_not_sewer['naics_description'].map(len) > 0
biosolids_sic_not_sewer_naics_not_empty = biosolids_sic_not_sewer[has_naics_match]

columns_to_show = ['NPDES ID', 'sic_permit', 'naics_description']
display(biosolids_sic_not_sewer_naics_not_empty[columns_to_show])

100%|██████████| 1110/1110 [00:01<00:00, 997.73it/s] 


Unnamed: 0,NPDES ID,sic_permit,naics_description
3800,UTL020061,"[2899, 2899]",[ALL OTHER MISCELLANEOUS CHEMICAL PRODUCT AND ...
4135,UTL025763,"[7011, 7011]",[ALL OTHER TRAVELER ACCOMMODATION.]
