# Biosolids data debugging 

Explore biosolids dataset from EPA 

Author: Sahar H. El Abbadi
March 2024 

In [1]:
# set up 
import pandas as pd 
import pathlib 
from tqdm import tqdm 
import pickle
from utilities import check_all_sic_code
tqdm.pandas()

# Load biosolids data 
# all_biosolids = pd.read_csv(pathlib.PurePath('04_results', 'biosolids_with_ww_permits.csv')) # 
# biosolids_not_sewer = pd.read_csv(pathlib.PurePath('04_results', 'bioslids_not_sewer.csv')) # This dataset was created in 06_check_biosolids_list.py

# relevant column is "NPDES ID" 

In [6]:
# Make a pickle file for biosolids data with a column containing SIC code for each file 

all_biosolids = pd.read_csv(pathlib.PurePath('01_raw_data', 'Data_Download_1699657092121.csv'))

# test on top row 
# all_biosolids = all_biosolids.head(2).copy()
all_biosolids['sic_permit'] = all_biosolids['NPDES ID'].progress_apply(check_all_sic_code)
all_biosolids.to_pickle(pathlib.PurePath('05_pickle_files', 'biosolids_data_sic_codes.pkl'))

100%|██████████| 4182/4182 [4:34:18<00:00,  3.94s/it]     


In [7]:
all_biosolids.to_csv((pathlib.PurePath('04_results', 'biosolids_data_sic_codes.csv')))

In [4]:
tqdm.pandas()
from utilities import check_for_ww_permits

# Note: copied from 06_check_biosolids_list.py for the sake of having everything easily accessible in one notebook 

# Load data
biosolids_data = pd.read_csv(pathlib.PurePath('01_raw_data', 'Data_Download_1699657092121.csv'))
biosolids_data['check_sewer_permits'] = biosolids_data['NPDES ID'].progress_apply(check_for_ww_permits)

# Save dataset that has all entries in biosolids dataset, with new column indicating if it has a sewer permit 
biosolids_data.to_csv(pathlib.PurePath('04_results', 'biosolids_with_ww_permits.csv'))

# Filter by facilities that do not have 
biosolids_not_sewer = biosolids_data[biosolids_data['check_sewer_permits'] == 'other_system'].copy()
biosolids_not_sewer.to_csv(pathlib.PurePath('04_results', 'bioslids_not_sewer.csv'))
print('Number of NPDES permits in the bioslids dataset without associated sewer facilities:')
print(f'{len(biosolids_not_sewer)}')

100%|██████████| 4182/4182 [14:26<00:00,  4.83it/s]

Number of NPDES permits in the bioslids dataset without associated sewer facilities:
1110





# Biosolids dataset 

1. Load the biosolids data that does not have a sewer-related permit 
2. Match NPDES permits with all SIC codes 

In [4]:
# Check SIC code for permits, using the database of all SIC codes 

from utilities import check_all_sic_code
# Check for DW permits 
tqdm.pandas()

# Load biosolids dataset filtered to only contain 
biosolids_not_sewer['sic_permits'] = biosolids_not_sewer['NPDES ID'].progress_apply(check_all_sic_code)
display(biosolids_not_sewer)
biosolids_not_sewer.to_csv(pathlib.PurePath('04_results', 'biosolids_not_sewer_all_sic_codes.csv'))

100%|██████████| 1110/1110 [1:00:46<00:00,  3.28s/it]


Unnamed: 0.1,Unnamed: 0,Facility Name,NPDES ID,City,State,EPA Region,Reporting Year,Certified Date,Reporting Obligation(s),Amount of Biosolids Generated,Violation Count,Violation Type(s),Management Practice Type(s),Amount of Biosolids Managed - Land Applied,Amount of Biosolids Managed - Surface Disposal,Amount of Biosolids Managed - Incinerated,Amount of Biosolids Managed - Other Management Practice,Pathogen Class(es),check_sewer_permits,sic_permits
0,2,HANFORD WWTF,CAL001076,HANFORD,CA,9,2022,2023-02-10T12:15:23.575-0500,A Class I Sludge Management Facility as define...,676.9000,0,,Land Application,678.00,,,,Class A,other_system,NO_SIC_MATCH
1,14,AUSTIN COUNTY WSC PLANT 3,TX0125709,BELLVILLE,TX,6,2022,2022-09-07T10:35:30.119-0500,Other,0.0000,0,,Other Management Practice,,,,,Not Applicable,other_system,"[4941, 4941]"
2,15,LAKE PFLUGERVILLE WWTF,TX0132721,PFLUGERVILLE,TX,6,2022,2022-10-07T13:37:35.841-0500,A Class I Sludge Management Facility as define...,0.0000,0,,Other Management Practice,,,,,Class A,other_system,"[1541, 4941, 4941]"
3,16,PURTIS CREEK STATE PARK WWTP,TX0082856,AUSTIN,TX,6,2022,2022-08-01T11:19:59.102-0500,Other,0.0005,0,,Other Management Practice,,,,0.0005,Not Applicable,other_system,"[7033, 7033]"
4,25,CHISOS BASIN WWTP,TX0094684,BIG BEND NATIONAL PARK,TX,6,2022,2022-09-14T07:22:25.560-0500,Other,0.3750,0,,Other Management Practice,,,,0.3750,Not Applicable,other_system,"[7999, 7999]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,4157,PINOLE/HERCULES WPCP,CAL037796,PINOLE,CA,9,2022,2023-06-01T13:28:11.097-0500,A POTW with a design flow rate equal to or gre...,473.0000,0,,Other Management Practice,,,,473.0000,Class B,other_system,NO_SIC_MATCH
1106,4164,SOUTH WINDSOR WPCF,CTL100510,SOUTH WINDSOR,CT,1,2022,2023-07-26T11:19:07.287-0500,A POTW with a design flow rate equal to or gre...,875.0000,0,,Other Management Practice,,,,794.0000,Not Applicable,other_system,NO_SIC_MATCH
1107,4168,PATTERSON WQCF,CAL078735,PATTERSON,CA,9,2022,2023-09-27T18:14:56.033-0500,A POTW with a design flow rate equal to or gre...,286.7800,0,,Land Application,286.78,,,,Class B,other_system,NO_SIC_MATCH
1108,4171,NOONDAY WATER WELL 2 WWTF,TX0118362,TYLER,TX,6,2022,2023-09-19T14:18:14.733-0500,"A POTW that serves 10,000 people or more",0.0000,0,,Other Management Practice,,,,,Class B,other_system,[4941]


  biosolids_not_sewer.to_csv('04_results', 'biosolids_not_sewer_all_sic_codes.csv')


IsADirectoryError: [Errno 21] Is a directory: '04_results'

In [7]:
# Summarize findings 
def summarize_sic_codes(permit_list, col_name): 
    unique_entries_count = permit_list[col_name].apply(tuple).nunique()
    
    # Flatten lists and convert them to tuples for consistency
    flat_column = permit_list[col_name].apply(lambda x: tuple(x) if isinstance(x, list) else x)
    
    # Count unique entries and print them along with their counts
    unique_counts = flat_column.value_counts()
    print("Unique entries with counts:")
    print(unique_counts)
    return unique_counts

summarize_sic_codes(biosolids_not_sewer, 'sic_permits')

display(biosolids_not_sewer)

Unique entries with counts:
sic_permits
NO_SIC_MATCH                                  810
(6515,)                                        47
(4941,)                                        35
(8211, 8211)                                   34
(4941, 4941)                                   14
                                             ... 
(7992, 7999)                                    1
(3511, 3511, 7699, 3511, 7699, 3511, 7699)      1
(6531,)                                         1
(8063, 8063)                                    1
(9511, 9511, 1629, 1629, 1629)                  1
Name: count, Length: 111, dtype: int64


Unnamed: 0.1,Unnamed: 0,Facility Name,NPDES ID,City,State,EPA Region,Reporting Year,Certified Date,Reporting Obligation(s),Amount of Biosolids Generated,Violation Count,Violation Type(s),Management Practice Type(s),Amount of Biosolids Managed - Land Applied,Amount of Biosolids Managed - Surface Disposal,Amount of Biosolids Managed - Incinerated,Amount of Biosolids Managed - Other Management Practice,Pathogen Class(es),check_sewer_permits,sic_permits
0,2,HANFORD WWTF,CAL001076,HANFORD,CA,9,2022,2023-02-10T12:15:23.575-0500,A Class I Sludge Management Facility as define...,676.9000,0,,Land Application,678.00,,,,Class A,other_system,NO_SIC_MATCH
1,14,AUSTIN COUNTY WSC PLANT 3,TX0125709,BELLVILLE,TX,6,2022,2022-09-07T10:35:30.119-0500,Other,0.0000,0,,Other Management Practice,,,,,Not Applicable,other_system,"[4941, 4941]"
2,15,LAKE PFLUGERVILLE WWTF,TX0132721,PFLUGERVILLE,TX,6,2022,2022-10-07T13:37:35.841-0500,A Class I Sludge Management Facility as define...,0.0000,0,,Other Management Practice,,,,,Class A,other_system,"[1541, 4941, 4941]"
3,16,PURTIS CREEK STATE PARK WWTP,TX0082856,AUSTIN,TX,6,2022,2022-08-01T11:19:59.102-0500,Other,0.0005,0,,Other Management Practice,,,,0.0005,Not Applicable,other_system,"[7033, 7033]"
4,25,CHISOS BASIN WWTP,TX0094684,BIG BEND NATIONAL PARK,TX,6,2022,2022-09-14T07:22:25.560-0500,Other,0.3750,0,,Other Management Practice,,,,0.3750,Not Applicable,other_system,"[7999, 7999]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,4157,PINOLE/HERCULES WPCP,CAL037796,PINOLE,CA,9,2022,2023-06-01T13:28:11.097-0500,A POTW with a design flow rate equal to or gre...,473.0000,0,,Other Management Practice,,,,473.0000,Class B,other_system,NO_SIC_MATCH
1106,4164,SOUTH WINDSOR WPCF,CTL100510,SOUTH WINDSOR,CT,1,2022,2023-07-26T11:19:07.287-0500,A POTW with a design flow rate equal to or gre...,875.0000,0,,Other Management Practice,,,,794.0000,Not Applicable,other_system,NO_SIC_MATCH
1107,4168,PATTERSON WQCF,CAL078735,PATTERSON,CA,9,2022,2023-09-27T18:14:56.033-0500,A POTW with a design flow rate equal to or gre...,286.7800,0,,Land Application,286.78,,,,Class B,other_system,NO_SIC_MATCH
1108,4171,NOONDAY WATER WELL 2 WWTF,TX0118362,TYLER,TX,6,2022,2023-09-19T14:18:14.733-0500,"A POTW that serves 10,000 people or more",0.0000,0,,Other Management Practice,,,,,Class B,other_system,[4941]


# SIC Code Lookup 

Look up the most common SIC matches
- 6515 = Residential Mobile home site 
- 4941 = Water supply 
- 8211 = Elementary and secondary schools 
- 7992 = Golf course 
- 7033 = Recreational vehicle parks / campsites 
- 7032 = Sports and recreational camping 
- 9223 = Correctional facilities 
- 7033 = Recreational vehicle parks / campsites 
- 9223 = Correctional intstutions
- 7999 = Amusement / Recreation 


In [8]:
# Remove facilities except those without an SIC match 

biosolids_to_remove = biosolids_not_sewer[biosolids_not_sewer['sic_permits'] != ['NO_SIC_MATCH']]
display(biosolids_to_remove)
biosolids_to_remove.describe()
biosolids_to_remove.to_csv(pathlib.PurePath('04_results', 'biosolids_no_sewer_code.csv'))

Unnamed: 0.1,Unnamed: 0,Facility Name,NPDES ID,City,State,EPA Region,Reporting Year,Certified Date,Reporting Obligation(s),Amount of Biosolids Generated,Violation Count,Violation Type(s),Management Practice Type(s),Amount of Biosolids Managed - Land Applied,Amount of Biosolids Managed - Surface Disposal,Amount of Biosolids Managed - Incinerated,Amount of Biosolids Managed - Other Management Practice,Pathogen Class(es),check_sewer_permits,sic_permits
1,14,AUSTIN COUNTY WSC PLANT 3,TX0125709,BELLVILLE,TX,6,2022,2022-09-07T10:35:30.119-0500,Other,0.0000,0,,Other Management Practice,,,,,Not Applicable,other_system,"[4941, 4941]"
2,15,LAKE PFLUGERVILLE WWTF,TX0132721,PFLUGERVILLE,TX,6,2022,2022-10-07T13:37:35.841-0500,A Class I Sludge Management Facility as define...,0.0000,0,,Other Management Practice,,,,,Class A,other_system,"[1541, 4941, 4941]"
3,16,PURTIS CREEK STATE PARK WWTP,TX0082856,AUSTIN,TX,6,2022,2022-08-01T11:19:59.102-0500,Other,0.0005,0,,Other Management Practice,,,,0.0005,Not Applicable,other_system,"[7033, 7033]"
4,25,CHISOS BASIN WWTP,TX0094684,BIG BEND NATIONAL PARK,TX,6,2022,2022-09-14T07:22:25.560-0500,Other,0.3750,0,,Other Management Practice,,,,0.3750,Not Applicable,other_system,"[7999, 7999]"
5,35,LAUGHLIN AFB WWTP BLDG 1004,TX0022608,LAUGHLIN AIR FORCE BASE,TX,6,2022,2022-08-31T07:59:34.879-0500,Other,0.0000,0,,Other Management Practice,,,,,Not Applicable,other_system,"[9711, 1542, 9711, 9711, 1542, 4581, 9711, 971..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,3966,CITY OF MONETT,MOL021440,MONETT,MO,7,2022,2023-02-27T11:53:52.353-0500,A POTW with a design flow rate equal to or gre...,287.0000,0,,Land Application,574.00,,,,Class B,other_system,"[1629, 1629, 1629, 1629]"
1095,4133,MERIDIAN METROPOLITAN DISTRICT,COL039110,ENGLEWOOD,CO,8,2022,2023-03-23T10:29:23.333-0500,"A POTW that serves 10,000 people or more",207.7200,0,,Land Application,207.72,,,,Class B,other_system,[1629]
1096,4135,BLUE SKY RANCH AND RESORT,UTL025763,WANSHIP,UT,8,2022,2023-04-01T12:56:08.283-0500,A Class I Sludge Management Facility as define...,54.7200,0,,Land Application,54.72,,,,Class B,other_system,"[7011, 7011]"
1104,4156,"BOLIVAR, CITY OF",MOL022373,BOLIVAR,MO,7,2022,2023-05-22T16:11:11.867-0500,A POTW with a design flow rate equal to or gre...,190.4000,0,,"Land Application, Other Management Practice",30.40,,,160.0000,Class B,other_system,"[9511, 9511, 1629, 1629, 1629]"


In [20]:
# look at the facilities generate large amounts of biosolids 
biosolids_to_remove_large = biosolids_to_remove[biosolids_to_remove['Amount of Biosolids Generated'] > 1]

land_application_mask = biosolids_to_remove_large['Management Practice Type(s)'].str.contains('Land Application')
test = biosolids_to_remove_large[land_application_mask].copy()
display(test)

Unnamed: 0.1,Unnamed: 0,Facility Name,NPDES ID,City,State,EPA Region,Reporting Year,Certified Date,Reporting Obligation(s),Amount of Biosolids Generated,Violation Count,Violation Type(s),Management Practice Type(s),Amount of Biosolids Managed - Land Applied,Amount of Biosolids Managed - Surface Disposal,Amount of Biosolids Managed - Incinerated,Amount of Biosolids Managed - Other Management Practice,Pathogen Class(es),check_sewer_permits,sic_permits
11,94,MEADOWS WWTP,TX0021270,HOUSTON,TX,6,2022,2022-09-30T21:07:08.883-0500,Other,48.77,0,,Land Application,48.77,,,,Class B,other_system,"[4941, 4941]"
112,835,STEPHEN F AUSTIN HISTORICAL PARK,TX0032549,AUSTIN,TX,6,2022,2022-09-30T10:23:09.182-0500,Other,1.476312,0,,Land Application,1.476312,,,,Class B,other_system,"[7033, 7033]"
121,891,TURLOCK WQCF,CAL078948,TURLOCK,CA,9,2022,2023-02-18T21:02:18.796-0500,A POTW with a design flow rate equal to or gre...,3097.0,0,,"Land Application, Other Management Practice",2228.0,,,869.0,Class B,other_system,[4941]
123,897,CITY OF RICHMOND SWTP,TX0137600,RICHMOND,TX,6,2022,2022-09-27T14:10:57.840-0500,Other,139.323,0,,Land Application,139.0,,,,Class B,other_system,"[4941, 4941]"
145,1017,ROLLING RIDGE WWTF,TX0077526,COLLEGE STATION,TX,6,2022,2022-09-29T13:38:27.568-0500,Other,4.939,0,,Land Application,4.939,,,,Class B,other_system,[6515]
209,1490,WOODGATE MOBILE HOME VILLAGE,TX0088102,GLENDALE,CO,6,2022,2022-09-29T21:30:09.018-0500,Other,2.12,0,,Land Application,2.12,,,,Class B,other_system,[6515]
210,1491,WESTMONT MHP WWTP,TX0093505,HOUSTON,TX,6,2022,2022-09-29T21:41:37.583-0500,Other,12.502,0,,Land Application,12.502,,,,Class B,other_system,[6515]
211,1492,CARBY ROAD MHP WWTP,TX0123579,HOUSTON,TX,6,2022,2022-09-29T21:50:30.129-0500,Other,3.346,0,,Land Application,3.346,,,,Class B,other_system,[6515]
212,1496,RIVER OAKS MHP WWTP,TX0092711,HOUSTON,TX,6,2022,2022-09-29T22:24:12.982-0500,Other,3.066,0,,Land Application,3.066,,,,Class B,other_system,"[6515, 6515]"
213,1500,SUNDOWN MHP WWTF,TX0087785,HOUSTON,TX,6,2022,2022-09-29T22:37:44.836-0500,Other,2.531,0,,Land Application,2.53,,,,Class B,other_system,"[6515, 6515]"


# Check all_wwtps facilities removed 

We previously removed facilities from all_wwtps that were from ICIS2022 and did not have any sewer code. Check here what SIC codes those facilities have 

In [10]:
# Check for drinking water permits 
from utilities import check_for_dw_permits

all_wwtps_no_sewer_code = pd.read_csv(pathlib.PurePath('04_results', 'all_wwtps_no_sewer_code_with_SIC.csv'))

dw_mask = all_wwtps_no_sewer_code['SIC_permits'].str.contains('4941')
all_wwtps_no_sewer_code_has_dw_code = all_wwtps_no_sewer_code[dw_mask]
display(all_wwtps_no_sewer_code_has_dw_code)
# all_wwtps_dw_permits = all_wwtps_dw_permits[all_wwtps_dw_permits['SIC_permits'].apply(lambda x: 4941 in x)]

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID,check_sewer_permits,SIC_permits
8,1471,1471,1471,CO00885,CWNS2012,CO,"Rye, Town of Collection",Rye,37.9222,-104.9378,0.006,0.008296,8000185001,COG641125,other_system,[4941]
10,1921,1921,1921,FL01425,CWNS2012,FL,IRCUD/South Regional WWTF,0,27.587,-80.401,1.3,1.797407,12000116003,FL0037940,other_system,"[4941, 4941]"
16,2808,2808,2808,IA02365,CWNS2012,IA,FT MADISON WESTERLY STL,Fort Madison,40.5524,-91.4338,0.045,0.062218,19000304002,IA0081001,other_system,[4941]
22,3544,3544,3544,IL00155,CWNS2012,IL,CANTON OVERFLOW FACILITY,0,40.5642,-89.9777,0.28,0.387134,17000035002,ILG640037,other_system,[4941]
26,4539,4539,4539,IN03235,CWNS2012,IN,Campbell Township RSD,0,39.0494,-85.5289,0.25,0.345655,18004794001,IN0038539,other_system,"[8361, 4941, 8331, 9711, 9711, 8249, 9711]"
46,6213,6213,6213,MD01325,CWNS2012,MD,EASTERN PRERELEASE STP,0,39.1168,-75.988,0.01,0.013826,24000090009,MD0023876,other_system,"[9223, 4941, 9223, 9223, 9223, 4941, 9223]"
56,6732,6732,6732,MI03305,CWNS2012,MI,ONTONAGON STP,0,46.7708,-89.5625,0.42,0.580701,26003127001,MIG640212,other_system,[4941]
58,7080,7080,7080,MN02095,ICIS2022,MN,CRYSTAL LAKE FLOCCULATION TREATMENT FACILITY,ROBBINSDALE,45.021721,-93.3269,0.72,0.995487,0,MN0069957,other_system,"[4941, 4941]"
79,7630,7630,7630,MO05255,CWNS2012,MO,ELSBERRY WWTP,0,39.1625,-90.7762,0.19,0.262698,29003159001,MO0054691,other_system,[4941]
117,9484,9484,9484,NE04345,CWNS2012,NE,ULYSSES WWTP,0,40.8968,-97.3449,0.03,0.041479,31002470001,NE0024368,other_system,"[4941, 4941]"


In [11]:
# Run all SIC codes on all_wwtps facilities that don't have a sewer code 
all_wwtp_sewer_codes = pd.read_csv(pathlib.PurePath('04_results', 'facilities_with_ww_permits.csv'))

# Only look at facilities that don't have a sewer code - source can be CWNS (since I'm interested in what types of permits these facilities actually have!) 
no_sewer_mask = (all_wwtp_sewer_codes['check_sewer_permits'] == 'other_system' )

# Apply a filter based on source
all_wwtps_no_sewer_code = all_wwtp_sewer_codes[no_sewer_mask]
all_wwtps_no_sewer_code['SIC_permits'] = all_wwtps_no_sewer_code['NPDES_ID'].progress_apply(check_all_sic_code)

all_wwtps_no_sewer_code.to_csv(pathlib.PurePath('04_results', 'all_wwtps_no_sewer_code_with_SIC.csv'))


100%|██████████| 349/349 [19:02<00:00,  3.27s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_wwtps_no_sewer_code['SIC_permits'] = all_wwtps_no_sewer_code['NPDES_ID'].progress_apply(check_all_sic_code)


# Check facilities removed from all_wwtps

Check what SIC types are associated with the permits for the 21 facilities we decided to remove from all_wwtps. 

all_wwtps_icis_no_sewer_code.csv is the 05_check_icis_facilities.py
- Facilities that are sourced from ICIS 
- The associated NPDES permit is not sewer related 

In [12]:
# Check that the all_wwtps that we're removing from all_wwtps are not just empty SIC codes 

from utilities import check_all_sic_code
from tqdm import tqdm 
from utilities import lookup_sic_code
import pandas as pd 

tqdm.pandas()

all_wwtps_removed = pd.read_csv(pathlib.PurePath('04_results', 'all_wwtps_icis_no_sewer_code.csv'))
print('About to start:')
all_wwtps_removed['SIC_permits'] = all_wwtps_removed['NPDES_ID'].progress_apply(check_all_sic_code)

display(all_wwtps_removed)

all_wwtps_removed.to_csv(pathlib.PurePath('04_results', 'all_wwtps_icis_no_sewer_code_with_SIC.csv'))

About to start:


100%|██████████| 21/21 [01:06<00:00,  3.17s/it]


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID,check_sewer_permits,SIC_permits
0,2163,2163,CA05060,ICIS2022,CA,THUNDER VALLEY CASINO WWTP,LINCOLN,38.839111,-121.307556,0.875,1.209793,0,CA0084697,other_system,"[7011, 7011, 7011, 7011, 1522]"
1,2514,2514,FL03815,ICIS2022,FL,MILITARY POINT INDUSTRIAL LAGOON,PANAMA CITY,30.1156,-85.630533,37.0,51.156977,0,FL0002631,other_system,"[2621, 2621, 2611, 2621, 2611, 2611]"
2,5936,5936,KY02865,ICIS2022,KY,NEW MARION WWTP,MARION,37.331667,-88.081389,0.0,0.0,0,KY0113247,other_system,NO_SIC_MATCH
3,5943,5943,LA02955,ICIS2022,LA,MINDEN DIESEL POWER PLANT,MINDEN,32.609167,-93.290278,0.5,0.691311,0,LA0109886,other_system,"[4911, 4911, 4911, 4911]"
4,5944,5944,LA02960,ICIS2022,LA,MINDEN STEAM POWER PLANT,MINDEN,32.603611,-93.295,0.5,0.691311,0,LA0109894,other_system,"[4911, 4911, 4911, 4911]"
5,7080,7080,MN02095,ICIS2022,MN,CRYSTAL LAKE FLOCCULATION TREATMENT FACILITY,ROBBINSDALE,45.021721,-93.3269,0.72,0.995487,0,MN0069957,other_system,"[4941, 4941]"
6,11963,11963,OH08865,ICIS2022,OH,HEATHER HILL CARE COMMUNITIES,CHARDON,41.5363,-81.22501,0.1,0.138262,0,OH0083984,other_system,"[8051, 8051]"
7,14362,14362,TX06350,ICIS2022,TX,CITY OF DALLAS ELM FORK WTP,CARROLLTON,32.972222,-96.916388,5.0,6.913105,0,TX0002372,other_system,"[4941, 4941]"
8,14495,14495,TX07220,ICIS2022,TX,FISHING HARBOR WWTP,BROWNSVILLE,25.981388,-97.3375,0.25,0.345655,0,TX0100242,other_system,"[4491, 4491]"
9,14581,14581,TX07670,ICIS2022,TX,HOMER N TANNER JR REGIONAL WTP,AVINGER,32.863611,94.662222,0.0,0.0,0,TX0134490,other_system,"[4941, 4941]"


In [13]:
display(all_wwtps_removed)
summarize_sic_codes(all_wwtps_removed, 'SIC_permits')


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID,check_sewer_permits,SIC_permits
0,2163,2163,CA05060,ICIS2022,CA,THUNDER VALLEY CASINO WWTP,LINCOLN,38.839111,-121.307556,0.875,1.209793,0,CA0084697,other_system,"[7011, 7011, 7011, 7011, 1522]"
1,2514,2514,FL03815,ICIS2022,FL,MILITARY POINT INDUSTRIAL LAGOON,PANAMA CITY,30.1156,-85.630533,37.0,51.156977,0,FL0002631,other_system,"[2621, 2621, 2611, 2621, 2611, 2611]"
2,5936,5936,KY02865,ICIS2022,KY,NEW MARION WWTP,MARION,37.331667,-88.081389,0.0,0.0,0,KY0113247,other_system,NO_SIC_MATCH
3,5943,5943,LA02955,ICIS2022,LA,MINDEN DIESEL POWER PLANT,MINDEN,32.609167,-93.290278,0.5,0.691311,0,LA0109886,other_system,"[4911, 4911, 4911, 4911]"
4,5944,5944,LA02960,ICIS2022,LA,MINDEN STEAM POWER PLANT,MINDEN,32.603611,-93.295,0.5,0.691311,0,LA0109894,other_system,"[4911, 4911, 4911, 4911]"
5,7080,7080,MN02095,ICIS2022,MN,CRYSTAL LAKE FLOCCULATION TREATMENT FACILITY,ROBBINSDALE,45.021721,-93.3269,0.72,0.995487,0,MN0069957,other_system,"[4941, 4941]"
6,11963,11963,OH08865,ICIS2022,OH,HEATHER HILL CARE COMMUNITIES,CHARDON,41.5363,-81.22501,0.1,0.138262,0,OH0083984,other_system,"[8051, 8051]"
7,14362,14362,TX06350,ICIS2022,TX,CITY OF DALLAS ELM FORK WTP,CARROLLTON,32.972222,-96.916388,5.0,6.913105,0,TX0002372,other_system,"[4941, 4941]"
8,14495,14495,TX07220,ICIS2022,TX,FISHING HARBOR WWTP,BROWNSVILLE,25.981388,-97.3375,0.25,0.345655,0,TX0100242,other_system,"[4491, 4491]"
9,14581,14581,TX07670,ICIS2022,TX,HOMER N TANNER JR REGIONAL WTP,AVINGER,32.863611,94.662222,0.0,0.0,0,TX0134490,other_system,"[4941, 4941]"


Unique entries with counts:
SIC_permits
(9223, 9223)                              5
(4941, 4941)                              3
NO_SIC_MATCH                              2
(4911, 4911, 4911, 4911)                  2
(7011, 7011, 7011, 7011, 1522)            1
(2621, 2621, 2611, 2621, 2611, 2611)      1
(8051, 8051)                              1
(4491, 4491)                              1
(8063, 8063)                              1
(9223, 9223, 9223, 9223, 213)             1
(213, 213, 9223, 9223, 213, 252, 9223)    1
(213, 9223, 213, 9223)                    1
(4911, 4911)                              1
Name: count, dtype: int64


SIC_permits
(9223, 9223)                              5
(4941, 4941)                              3
NO_SIC_MATCH                              2
(4911, 4911, 4911, 4911)                  2
(7011, 7011, 7011, 7011, 1522)            1
(2621, 2621, 2611, 2621, 2611, 2611)      1
(8051, 8051)                              1
(4491, 4491)                              1
(8063, 8063)                              1
(9223, 9223, 9223, 9223, 213)             1
(213, 213, 9223, 9223, 213, 252, 9223)    1
(213, 9223, 213, 9223)                    1
(4911, 4911)                              1
Name: count, dtype: int64

In [14]:
# Summarize SIC code count  
summarize_sic_codes(all_wwtps_no_sewer_code, 'SIC_permits')

Unique entries with counts:
SIC_permits
NO_SIC_MATCH                                        208
(4941,)                                              10
(6552, 6552)                                          9
(4941, 4941)                                          9
(9223, 9223)                                          6
                                                   ... 
(6552, 6552, 6552, 6552)                              1
(9512, 4491, 4491)                                    1
(1629, 9512, 1629, 9512)                              1
(9512, 4231, 4231, 9512, 9512, 4231, 7999, 9512)      1
(4941, 4941, 4941)                                    1
Name: count, Length: 88, dtype: int64


SIC_permits
NO_SIC_MATCH                                        208
(4941,)                                              10
(6552, 6552)                                          9
(4941, 4941)                                          9
(9223, 9223)                                          6
                                                   ... 
(6552, 6552, 6552, 6552)                              1
(9512, 4491, 4491)                                    1
(1629, 9512, 1629, 9512)                              1
(9512, 4231, 4231, 9512, 9512, 4231, 7999, 9512)      1
(4941, 4941, 4941)                                    1
Name: count, Length: 88, dtype: int64

In [15]:
# Check EBMUD 

npdes = 'CAL037702'
test = check_all_sic_code(npdes)
print(test)

NO_SIC_MATCH


In [10]:
# Check if all NPDES IDs are unique in this dataset 
import pandas as pd 
import pathlib
all_biosolids = pd.read_pickle(pathlib.PurePath('05_pickle_files', 'biosolids_data_sic_codes.pkl'))

all_biosolids_nonunique = all_biosolids.duplicated(subset=['NPDES ID'], keep=False)
biosolids_non_unique_npdes = all_biosolids[all_biosolids_nonunique]
display(biosolids_non_unique_npdes)

# ALl NPDES IDs are unique! good :) 

Unnamed: 0,Facility Name,NPDES ID,City,State,EPA Region,Reporting Year,Certified Date,Reporting Obligation(s),Amount of Biosolids Generated,Violation Count,Violation Type(s),Management Practice Type(s),Amount of Biosolids Managed - Land Applied,Amount of Biosolids Managed - Surface Disposal,Amount of Biosolids Managed - Incinerated,Amount of Biosolids Managed - Other Management Practice,Pathogen Class(es),sic_permit
