# Load SIC Code descriptions 

Start date: March 19, 2024 
Author: Sahar H. El Abbadi

## Data sources: 

NAICS & SIC Crosswalk: https://www.naics.com/product/sic-naics-cross-references/
Downloaded March 19, 2024 

Saved in 01_raw_data > 2022-NAICS-to-SIC-Crosswalk.xlsx

In [10]:
# Set up

import pandas as pd 
import pathlib 

# Load SIC data: read only the two SIC columns from the crosswalk workbook
# and rename them to short snake_case names in the same expression.
sic_cols = ['Related SIC Code', 'Related SIC Code Description']
sic_data = pd.read_excel(
    pathlib.PurePath('01_raw_data', '2022-NAICS-to-SIC-Crosswalk.xlsx'),
    usecols=sic_cols,
).rename(
    columns={
        'Related SIC Code': 'sic_code',
        'Related SIC Code Description': 'sic_description',
    }
)

# Show the renamed table
display(sic_data)

Unnamed: 0,sic_code,sic_description
0,116,Soybeans
1,119,"Cash Grains, Nec"
2,119,"Cash Grains, Nec"
3,111,Wheat
4,115,Corn
...,...,...
2345,9641,Regulation of Agricultural Marketing
2346,9651,"Regulation, Miscellaneous Commercial Sectors"
2347,9661,Space Research and Technology
2348,9711,National Security


In [11]:
# Build a SIC code -> description lookup and a function that returns the description for a given code
from pprint import pprint 

# Map each SIC code to its description; a code that appears on several
# crosswalk rows keeps the description from its last row.
sic_dict = dict(zip(sic_data['sic_code'], sic_data['sic_description']))
pprint(sic_dict)

def lookup_sic_description(sic_code, sic_lookup=None):
    """Return the description associated with a SIC code.

    Parameters
    ----------
    sic_code : hashable
        SIC code to look up. It must match the key type of the lookup
        (the notebook's ``sic_dict`` is keyed by integer codes).
    sic_lookup : mapping, optional
        Mapping of SIC code -> description. When omitted, the module-level
        ``sic_dict`` built from the crosswalk is used, which keeps existing
        single-argument calls working unchanged.

    Returns
    -------
    object
        The mapped description, or the sentinel string
        ``'MISSING_DESCRIPTION'`` when the code is not a key of the lookup.
    """
    if sic_lookup is None:
        # Resolved at call time (not at definition time) so that re-running
        # the cell that rebuilds sic_dict is picked up automatically.
        sic_lookup = sic_dict
    return sic_lookup.get(sic_code, 'MISSING_DESCRIPTION')
    

{111: 'Wheat',
 112: 'Rice',
 115: 'Corn',
 116: 'Soybeans',
 119: 'Cash Grains, Nec',
 131: 'Cotton',
 132: 'Tobacco',
 133: 'Sugarcane and Sugar Beets',
 134: 'Irish Potatoes',
 139: 'Field Crops, Except Cash Grain',
 161: 'Vegetables and Melons',
 171: 'Berry Crops',
 172: 'Grapes',
 173: 'Tree Nuts',
 174: 'Citrus Fruits',
 175: 'Deciduous Tree Fruits',
 179: 'Fruits and Tree Nuts, Nec',
 181: 'Ornamental Nursery Products',
 182: 'Food Crops Grown Under Cover',
 191: 'General Farms, Primarily Crop',
 211: 'Beef Cattle Feedlots',
 212: 'Beef Cattle, Except Feedlots',
 213: 'Hogs',
 214: 'Sheep and Goats',
 219: 'General Livestock, Nec',
 241: 'Dairy Farms',
 251: 'Broiler, Fryer, and Roaster Chickens',
 252: 'Chicken Eggs',
 253: 'Turkeys and Turkey Eggs',
 254: 'Poultry Hatcheries',
 259: 'Poultry and Eggs, Nec',
 271: 'Fur-bearing Animals and Rabbits',
 272: 'Horses and Other Equines',
 273: 'Animal Aquaculture',
 279: 'Animal Specialties, Nec',
 291: 'General Farms, Primarily ani

In [12]:
def facility_sic_description(sic_array):
    """Translate a facility's SIC codes into their descriptions.

    Parameters
    ----------
    sic_array : list or None
        SIC codes for one facility, the sentinel list ``['NO_SIC_MATCH']``,
        or ``None`` when the codes could not be parsed.

    Returns
    -------
    list or None
        * ``None`` when ``sic_array`` is ``None``.
        * ``sic_array`` itself when it equals ``['NO_SIC_MATCH']``.
        * Otherwise a new list holding ``lookup_sic_description(code)`` for
          every code, in the same order (duplicates are preserved).
    """
    # parse_list() yields None for cells it cannot parse; propagate the
    # missing value instead of failing with TypeError when iterating it.
    if sic_array is None:
        return None

    # The sentinel is passed through untouched rather than looked up.
    if sic_array == ['NO_SIC_MATCH']:
        return sic_array

    return [lookup_sic_description(sic) for sic in sic_array]

# Smoke test: three SIC codes that should each resolve to a crosswalk description
test_array = [1442, 1499, 1623]

# Last expression of the cell, so the returned list is shown as the cell output
facility_sic_description(test_array)
            

['Construction Sand and Gravel',
 'Miscellaneous Nonmetallic Mining',
 'Water, Sewer, and Utility Lines']

In [18]:
# Test on the biosolids CSV results file (biosolids_no_sewer_code.csv)
import ast  # For parsing string representations of lists

# Custom function to convert string representation of list to list
def parse_list(s):
    """Convert the string representation of a list into a real list.

    Parameters
    ----------
    s : str
        Text such as ``"[1629, 1629]"`` read from a CSV cell.

    Returns
    -------
    list or None
        The parsed list (a tuple literal is converted to a list), or ``None``
        when ``s`` is not a valid Python literal or the literal is not a
        list/tuple (for example a bare number or an empty cell).
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError):
        # literal_eval raises TypeError as well for some malformed literals,
        # e.g. a dict with an unhashable key; treat all of these as "not a list".
        return None

    if isinstance(parsed, tuple):
        return list(parsed)
    # A valid literal that is not a sequence (int, str, dict, ...) is not a
    # list of codes, so report it as missing rather than returning a scalar.
    return parsed if isinstance(parsed, list) else None

# Read the biosolids results, turning the stringified 'sic_permits' lists into
# real lists while loading, then attach the SIC descriptions for each facility.
biosolids_to_remove = pd.read_csv(
    pathlib.PurePath('04_results', 'biosolids_no_sewer_code.csv'),
    converters={'sic_permits': parse_list},
).assign(
    sic_description=lambda frame: frame['sic_permits'].apply(facility_sic_description)
)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Facility Name,NPDES ID,City,State,EPA Region,Reporting Year,Certified Date,Reporting Obligation(s),...,Violation Count,Violation Type(s),Management Practice Type(s),Amount of Biosolids Managed - Land Applied,Amount of Biosolids Managed - Surface Disposal,Amount of Biosolids Managed - Incinerated,Amount of Biosolids Managed - Other Management Practice,Pathogen Class(es),check_sewer_permits,sic_permits
0,861,3596,MSD BISSELL POINT WASTEWATER TREATMENT PLANT,MOL025178,ST. LOUIS,MO,7,2022,2023-02-17T16:46:20.643-0500,A Class I Sludge Management Facility as define...,...,0,,Incineration,,,23179.13803,,,other_system,"[1629, 1629, 1629]"
1,892,3669,MSD LEMAY WASTEWATER TREATMENT PLANT,MOL025151,ST. LOUIS,MO,7,2022,2023-02-16T16:17:55.804-0500,A Class I Sludge Management Facility as define...,...,0,,Incineration,,,14378.81000,,,other_system,"[1629, 1629, 1629]"
2,985,3870,LYSTEK INTERNATIONAL,CAL000001,FAIRFIELD,CA,9,2022,2023-02-21T11:44:12.082-0500,A Class I Sludge Management Facility as define...,...,0,,Land Application,10635.0,,,,Class A,other_system,"[7538, 7538, 4212, 4212, 7513, 7513, 8211, 399..."
3,695,3117,WEST COUNTY WASTEWATER DISTRICT WWTP,CAL038539,RICHMOND,CA,9,2022,2023-02-07T14:04:03.557-0500,A POTW with a design flow rate equal to or gre...,...,0,,Other Management Practice,,,,8032.0,Class B,other_system,[4941]
4,399,2190,JEFFERSON WWTF,TX0002542,LAREDO,TX,6,2022,2023-01-24T16:58:04.675-0500,A POTW with a design flow rate equal to or gre...,...,0,,Surface Disposal,,7364.05,,,Class B,other_system,"[4941, 4941, 4941]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,344,2031,PEARLAND ACRES MHP WWTF,TX0135283,WSTLAKE VILLAGE,CA,6,2022,2023-01-19T16:24:33.517-0500,A Class I Sludge Management Facility as define...,...,0,,Other Management Practice,,,,,Not Applicable,other_system,"[6515, 6515]"
296,361,2071,NEW DANVILLE COMMUNITY WWTP,TX0132012,WILLIS,TX,6,2022,2023-03-17T13:22:09.006-0500,Other,...,0,,Other Management Practice,,,,,Not Applicable,other_system,[8361]
297,504,2500,ROCKINGHAM COUNTY HOME,NHL100609,BRENTWOOD,NH,1,2022,2023-02-14T12:24:08.520-0500,A Class I Sludge Management Facility as define...,...,0,,Other Management Practice,,,,,Not Applicable,other_system,"[8051, 9223, 8051, 9223]"
298,565,2665,BARDSTOWN TOWN CREEK WWTP,KYL020237,BARDSTOWN,KY,4,2022,2023-01-30T10:25:43.354-0500,A Class I Sludge Management Facility as define...,...,0,,Other Management Practice,,,,,Not Applicable,other_system,"[9511, 9511, 4941]"


0      [Heavy Construction, Nec, Heavy Construction, ...
1      [Heavy Construction, Nec, Heavy Construction, ...
2      [General Automotive Repair Shops, General Auto...
3                                         [Water Supply]
4             [Water Supply, Water Supply, Water Supply]
                             ...                        
295    [Mobile Home Site Operators, Mobile Home Site ...
296                                   [Residential Care]
297    [Skilled Nursing Care Facilities, Correctional...
298    [Air, Water, and Solid Waste Management, Air, ...
299                                       [Water Supply]
Name: sic_description, Length: 300, dtype: object

In [23]:
from utilities import summarize_sic_codes

# Write one count summary per column: descriptions first, then the raw SIC codes.
for summary_column, summary_csv in (
    ('sic_description', 'biosolids_no_sewer_code_sic_description_count.csv'),
    ('sic_permits', 'biosolids_no_sewer_code_sic_count.csv'),
):
    summarize_sic_codes(biosolids_to_remove, summary_column).to_csv(
        pathlib.PurePath('04_results', summary_csv)
    )

Unique entries with counts:
sic_description
(Mobile Home Site Operators,)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               