# Notebook to explore the all-WWTP dataset
Goal: explore the dataset to find the bugs that caused duplicates to occur.

In [2]:
# Setup 
import pandas as pd
import pathlib
from utilities import check_for_dw_permits

['sewer_system', 'drinking_water', 'sewer_system', 'sewer_system']


In [1]:
# Setup 
import pandas as pd
import pathlib
from utilities import check_for_dw_permits


# Load the all-WWTP table from 02_clean_data and the 'ETL' sheet of the CWNS 2004 workbook.
all_wwtp = pd.read_csv(pathlib.PurePath('02_clean_data') / 'all_wwtps_relevant_cols.csv')
cwns_2004 = pd.read_excel(
    pathlib.PurePath('01_raw_data') / 'CWNS2004.xlsx',
    sheet_name='ETL',
)

In [2]:
# Clean cwns 2004

# Rows with a missing permit number get the placeholder 'NO_PERMIT_NBR'.
cwns_2004.loc[cwns_2004['PERMIT_NBR'].isna(), 'PERMIT_NBR'] = 'NO_PERMIT_NBR'

# Problematic WWTP 

Several wastewater treatment plants had problematic records. Explore them more deeply here to better understand the bugs that might be causing duplicates. 

## Willow Creek WWTP 
When reviewing duplicates, there were two Willow Creek WWTPs that were assigned the same CWNS number but have different NPDES permits. 

In [3]:
# Find Willow Creek in CWNS 2004 and all WWTP
facility_name = 'willow creek'

# Case-insensitive pattern match against the facility-name column of each table.
willow_creek_cwns = cwns_2004.loc[
    cwns_2004['FACILITY_NM'].str.contains(facility_name, case=False)
]
willow_creek_all_wwtps = all_wwtp.loc[
    all_wwtp['FACILITY'].str.contains(facility_name, case=False)
]
willow_creek_all_wwtps.head()

Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID
6929,8373,MT01205,CWNS2012,MT,WILLOW CREEK SEWER DIST.,0,45.8297,-111.6424,0.022,0.030418,30000203001,MT0025038
12164,14750,TX08885,ICIS2022,TX,WILLOW CREEK FARMS MUD WWTF,WALLER COUNTY,29.773611,-95.891111,0.48,0.663658,0,TX0128236
12165,14751,TX08890,ICIS2022,TX,WILLOW CREEK WWTP,PARKER COUNTY,32.511388,-98.048333,1.26,1.742102,0,TX0023779
12166,14752,TX08895,ICIS2022,TX,WILLOW CREEK WWTP,SMITHVILLE,30.008611,-97.125555,0.3,0.414786,48007518001,TX0113786


In [5]:
# Check for NPDES permits of interest in CWNS 2004

willow_crk_npdes = [
    'TX0023779',
    'TX0113786',
    'TX0026255',
]
# Joining the IDs with '|' builds one regex alternation, so a row is kept when
# its PERMIT_NBR contains any of the listed permits (case-insensitive).
willow_crk_npdes_cwns = cwns_2004.loc[
    cwns_2004['PERMIT_NBR'].str.contains('|'.join(willow_crk_npdes), case=False)
]
willow_crk_npdes_cwns.head()

Unnamed: 0,AF_NBR,P_EFFLUENT,FACILITY_NM,REVIEW_STATUS,LOCATION_CD,STATE,COUNTY_NM,WSHED_NM,AUTHORITY_NM,TREATMENT,...,P_DSRVRCH_NM,P_DSWSHED_NBR,P_DSWSHED_NM,P_DSLATITUDE,P_DSLONGITUDE,P_DSLATITUDE_POLYGON,P_DSLONGITUDE_POLYGON,E_TOTAL,P_TOTAL,F_TOTAL
13276,48007518001,Advanced Treatment I,Willow Creek WWTP,HA,TX,Texas,Harris,Spring.,DOWDELL PUD,Present and Future,...,WILLOW CR,,,,,,,0.15,0.3,0.3
14566,48004374003,Advanced Treatment I with Nutrient Removal,Pollard Creek WWTP,HA,TX,Texas,Palo Pinto,Middle Brazos-Palo Pinto.,MINERAL WELLS,Present and Future,...,BRAZOS R,,,,,,,1.45,2.35,2.35


In [6]:
# Check for NPDES permits of interest in all_wwtps

# Same '|'-joined alternation as the CWNS 2004 check, applied to NPDES_ID.
willow_crk_npdes_all_wwtp = all_wwtp.loc[
    all_wwtp['NPDES_ID'].str.contains('|'.join(willow_crk_npdes), case=False)
]
willow_crk_npdes_all_wwtp.head()

Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID
12165,14751,TX08890,ICIS2022,TX,WILLOW CREEK WWTP,PARKER COUNTY,32.511388,-98.048333,1.26,1.742102,0,TX0023779
12166,14752,TX08895,ICIS2022,TX,WILLOW CREEK WWTP,SMITHVILLE,30.008611,-97.125555,0.3,0.414786,48007518001,TX0113786


In [7]:
# Check for CWNS numbers of interest
# Duplicate CWNS number was 48007518001

willow_crk_cwns_num = '48007518001'

# Compare AF_NBR as text: the target is a string, and a numeric AF_NBR column
# would never equal it.
# NOTE(review): the dtype of AF_NBR after read_excel is not visible here - confirm.
cwns_2004_wkcwns = cwns_2004[cwns_2004['AF_NBR'].astype(str) == willow_crk_cwns_num]
all_wwtp_wkcwns = all_wwtp[all_wwtp['CWNS_NUM'] == willow_crk_cwns_num]

# Only the last expression of a cell is shown automatically, so the CWNS 2004
# match has to be displayed explicitly (it was previously computed and dropped).
display(cwns_2004_wkcwns.head())
all_wwtp_wkcwns.head()

Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID
12166,14752,TX08895,ICIS2022,TX,WILLOW CREEK WWTP,SMITHVILLE,30.008611,-97.125555,0.3,0.414786,48007518001,TX0113786


In [8]:
# Check all_wwtps - the input file to Christina's code

# This file contains one row that has the CWNS number that is a duplicate in the biosolids data:
willow_crk_cwns_num = '48007518001'
all_wwtp_wkcwns = all_wwtp.loc[all_wwtp['CWNS_NUM'].eq(willow_crk_cwns_num)]

print('Facilities in all_wwtps with CWNS number 48007518001: ')
display(all_wwtp_wkcwns)

# Check for facilities in all_wwtps that have the name willow creek:
print('Facilities in all_wwtps ctonaining the name Willow Creek in the name:')
facility_name = 'willow creek'
willow_creek_all_wwtps = all_wwtp.loc[
    all_wwtp['FACILITY'].str.contains(facility_name, case=False)
]
display(willow_creek_all_wwtps)

Facilities in all_wwtps with CWNS number 48007518001: 


Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID
12166,14752,TX08895,ICIS2022,TX,WILLOW CREEK WWTP,SMITHVILLE,30.008611,-97.125555,0.3,0.414786,48007518001,TX0113786


Facilities in all_wwtps ctonaining the name Willow Creek in the name:


Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID
6929,8373,MT01205,CWNS2012,MT,WILLOW CREEK SEWER DIST.,0,45.8297,-111.6424,0.022,0.030418,30000203001,MT0025038
12164,14750,TX08885,ICIS2022,TX,WILLOW CREEK FARMS MUD WWTF,WALLER COUNTY,29.773611,-95.891111,0.48,0.663658,0,TX0128236
12165,14751,TX08890,ICIS2022,TX,WILLOW CREEK WWTP,PARKER COUNTY,32.511388,-98.048333,1.26,1.742102,0,TX0023779
12166,14752,TX08895,ICIS2022,TX,WILLOW CREEK WWTP,SMITHVILLE,30.008611,-97.125555,0.3,0.414786,48007518001,TX0113786


# CWNS as Zero 
In trying to understand the cause of duplicates for Willow Creek WWTP, I found three different wastewater treatment plants based on NPDES permits. 

There were two that appeared in the index and were assigned CWNS # 48007518001. However, this CWNS number (only in CWNS 2004) had a third NPDES permit associated with it - meaning there are likely three different plants here.


In [9]:

# Keep the rows of all_wwtps whose CWNS_NUM equals the string '0' and save them.
wwtp_zero_cwns = all_wwtp[all_wwtp['CWNS_NUM'].eq('0')].copy()
wwtp_zero_cwns.to_csv(pathlib.PurePath('04_results') / 'no_cwns.csv')

# Check SIC code for all facilities in all_wwtps 



In [9]:
from utilities import flag_not_ww
from tqdm import tqdm
tqdm.pandas()

all_wwtp = pd.read_csv(pathlib.PurePath('02_clean_data') / 'all_wwtps_relevant_cols.csv')

# Flag rows for review that have an NPDES permit that is not wastewater
# (progress_apply is the progress-bar apply enabled by tqdm.pandas(); the
# recorded run of this cell took about 48 minutes for 13,533 rows.)
all_wwtp = all_wwtp.assign(
    ww_sic_code=all_wwtp['NPDES_ID'].progress_apply(flag_not_ww)
)
all_wwtp_for_review = all_wwtp.loc[all_wwtp['ww_sic_code'].eq('REVIEW')].copy()
print(all_wwtp_for_review.shape)
all_wwtp_for_review.to_csv(pathlib.PurePath('04_results') / 'flag_all_wwtp.csv')

100%|██████████| 13533/13533 [48:29<00:00,  4.65it/s]

(285, 13)





In [5]:
# Look at the facilities that were flagged for additional review
import pandas as pd
import pathlib
from utilities import check_for_dw_permits, view_all_sic_codes
from tqdm import tqdm

tqdm.pandas()

# generate list to review (comment out because it takes a little while to run)
review_wwtp = pd.read_csv(pathlib.PurePath('04_results') / 'flag_all_wwtp.csv')
review_wwtp = review_wwtp.assign(
    sic_code_type=review_wwtp['NPDES_ID'].progress_apply(check_for_dw_permits)
)
review_wwtp.to_csv(pathlib.PurePath('04_results') / 'all_wwtp_review_check_permits.csv')

# Read the saved file straight back, so review_wwtp is the on-disk version from here on.
review_wwtp = pd.read_csv(pathlib.PurePath('04_results') / 'all_wwtp_review_check_permits.csv')
print(f'No. of facilities with a permit associated with an SIC code not for wastewater: {len(review_wwtp)}')


100%|██████████| 285/285 [01:00<00:00,  4.72it/s]

No. of facilities with a permit associated with an SIC code not for wastewater: 285





In [6]:

# For testing: top 10 facilities
# review_wwtp = review_wwtp.head(10)

# Collect the per-permit SIC tables in a list and concatenate once at the end.
# Concatenating onto a growing DataFrame inside the loop re-copies every
# earlier row on each iteration.
facility_sic_frames = []
for facility_npdes in tqdm(review_wwtp['NPDES_ID']):
    facility_sic = view_all_sic_codes(facility_npdes)
    # pd.concat silently drops None entries but raises if all of them are None,
    # so leave them out up front.
    # NOTE(review): whether view_all_sic_codes can return None is not visible here.
    if facility_sic is not None:
        facility_sic_frames.append(facility_sic)

# pd.concat raises on an empty list; fall back to an empty frame in that case.
if facility_sic_frames:
    all_sics_for_review = pd.concat(facility_sic_frames, ignore_index=True)
else:
    all_sics_for_review = pd.DataFrame()

# print(all_sics_for_review)

# Flag for further review facilities that have:
## - Any permit for 'other_system'
## - Any facility that has more than 1 drinking water permit, or if they only have 1 permit, it is for drinking water
# With this approach, we are not considering facilities that have multiple permits and only one of them is for drinking water.

manual_check = all_sics_for_review[all_sics_for_review['flag'] == 'REVIEW']
# display(manual_check)

no_unique_npdes = manual_check['NPDES_ID'].nunique()
print(f'Number of unique permits to review: {no_unique_npdes}')

unique_npdes = manual_check['NPDES_ID'].unique()
print(f'Facilities to check permit status (NPDES): {unique_npdes}')

display(all_sics_for_review)


100%|██████████| 285/285 [01:00<00:00,  4.72it/s]

Number of unique permits to review: 276
Facilities to check permit status (NPDES): ['AK0022551' 'AL0022195' 'AL0055841' 'AL0023418' 'AL0061158' 'AL0062715'
 'AL0050547' 'AL0023922' 'AL0024180' 'AL0058572' 'AR0033359' 'AR0020087'
 'AR0022381' 'AR0021890' 'AR0022098' 'CA0024490' 'CA0023345' 'CA0037702'
 'CA0037851' 'CA0047856' 'CA0053813' 'CA0054216' 'CA0105350' 'CA0110604'
 'CO0026671' 'COG641125' 'CO0031232' 'COG589086' 'CO0046507' 'FL0021857'
 'FL0021865' 'FL0037940' 'FL0116009' 'GA0020982' 'GA0023132' 'GA0026221'
 'IA0020796' 'IA0081001' 'IA0057169' 'ILG640037' 'IL0026352' 'IL0024911'
 'ILG580004' 'ILG580167' 'IL0025119' 'ILG580053' 'IN0024783' 'IN0025763'
 'IN0040738' 'IN0023108' 'IN0038539' 'IN0023337' 'IN0039241' 'IN0040789'
 'IN0039306' 'IN0022489' 'IN0063088' 'KS0089176' 'KY0073377' 'KY0057193'
 'KY0021130' 'KY0024279' 'KYP000063' 'LA0054925' 'LA0059951' 'LA0038059'
 'LA0043656' 'MD0021598' 'MD0020281' 'MD0020001' 'MD2038U99' 'MD0023876'
 'MD0023868' 'MD0051667' 'MD0024988' 'MD0




Unnamed: 0,NPDES_ID,sic_cod_no,sic_facility_type,flag
0,AK0022551,1,sewer_system,REVIEW
1,AK0022551,2,sewer_system,REVIEW
2,AK0022551,3,sewer_system,REVIEW
3,AK0022551,4,sewer_system,REVIEW
4,AK0022551,5,sewer_system,REVIEW
...,...,...,...,...
1308,WV0028151,1,sewer_system,REVIEW
1309,WV0028151,2,sewer_system,REVIEW
1310,WV0028151,3,sewer_system,REVIEW
1311,WV0028151,4,sewer_system,REVIEW


In [10]:
# Find details on facilities in all_wwtps that have NPDES code
all_wwtp = pd.read_csv(pathlib.PurePath('02_clean_data') / 'all_wwtps_relevant_cols.csv')

# Keep the rows whose NPDES_ID is one of the permits collected in unique_npdes.
wwtps_water_codes = all_wwtp.loc[all_wwtp['NPDES_ID'].isin(unique_npdes)]
display(wwtps_water_codes)
wwtps_water_codes.to_csv(pathlib.PurePath('04_results') / 'wwtps_any_water_code.csv')



Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID
3,3,AK00035,CWNS2012,AK,"POINT WORONZOF, JOHN ASPLUND WWTP",ANCHORAGE,61.1964,-150.0235,29.66,41.008539,2000106001,AK0022551
24,25,AL00065,CWNS2012,AL,ANNISTON CHOCCOLOCCO WWTP,ANNISTON,33.6032,-85.8293,8.80,12.167065,1000007001,AL0022195
84,85,AL00665,CWNS2012,AL,GULF SHORES WWTP,Gulf Shores,30.2766,-87.6666,1.50,2.073932,1000074001,AL0055841
99,100,AL00815,CWNS2012,AL,Jasper Town Creek WWTP,Jasper,33.8125,-87.2694,3.00,4.147863,1000088001,AL0023418
128,130,AL01115,CWNS2012,AL,PARRISH HCR LAGOON,Parrish,33.7194,-87.2990,0.10,0.138262,1000132001,AL0061158
...,...,...,...,...,...,...,...,...,...,...,...,...
13394,16245,WV01425,CWNS2012,WV,MOUNDSVILLE WWTP,MOUNDSVILLE,39.9177,-80.7473,1.30,1.797407,54002509001,WV0023264
13403,16255,WV01525,CWNS2012,WV,BLUEWELL PSD WWTP,BLUEFIELD,37.3019,-81.2517,0.40,0.553048,54002704001,WVG640100
13409,16261,WV01585,CWNS2012,WV,MNTN TOP PSD - ELK GARDEN WWTP,MT STORM,39.2745,-79.3626,0.05,0.069131,54002801001,WV0101524
13444,16299,WV01965,CWNS2012,WV,KINGWOOD WWTP,KINGWOOD,39.4468,-79.6781,0.57,0.788094,54003903001,WV0021881


In [11]:
# Only consider plants that have source as ICIS2022

wwtps_water_code_from_ICIS2022 = wwtps_water_codes.loc[
    wwtps_water_codes['SOURCE'].eq('ICIS2022')
]
display(wwtps_water_code_from_ICIS2022)

# Disabled follow-up filter on large plants, kept from the original cell:
# wwtps_multiple_water_codes_large_plants = wwtps_multiple_water_codes[wwtps_multiple_water_codes['FLOW_2012_MGD'] > 10]
# display(wwtps_multiple_water_codes_large_plants)

Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID
3127,4070,IL05315,ICIS2022,IL,"EUREKA STP, CITY OF",EUREKA,40.701667,-89.27,0.59,0.815746,17000299002,IL0025119
3213,4156,IL05730,ICIS2022,IL,LAKE OF EGYPT SEWER DIST STP,MARION,37.626667,-88.941667,0.225,0.31109,0,ILG580053
5649,7016,MN02170,ICIS2022,MN,FAIRFAX WWTP,FAIRFAX,44.514471,-94.691443,0.0,0.0,27009223001,MNG585060
5713,7080,MN02095,ICIS2022,MN,CRYSTAL LAKE FLOCCULATION TREATMENT FACILITY,ROBBINSDALE,45.021721,-93.3269,0.72,0.995487,0,MN0069957
6491,7876,MN02885,ICIS2022,MN,SMSC WATER RECLAMATION FACILITY,PRIOR LAKE,44.72507,-93.46468,0.63,0.871051,0,MN0067938
7425,9054,ND03035,ICIS2022,ND,FORT YATES LAGOON SYSTEM,FORT YATES,46.0815,-100.6525,0.385,0.532309,0,NDG589312
10867,13341,SD00475,ICIS2022,SD,"EDGEMONT, CITY OF","EDGEMONT, CITY OF",43.302222,-103.807889,0.1,0.138262,46000031001,SD0023701
10905,13379,SD00745,ICIS2022,SD,"MILBANK, CITY OF",MILBANK,45.227056,-96.624,0.1,0.138262,46000050001,SD0020371
10941,13415,SD00815,ICIS2022,SD,"NORTH SIOUX CITY, CITY OF",NORTH SIOUX CITY,42.541333,-96.528528,0.1,0.138262,0,SD0020567
10943,13417,SD01025,ICIS2022,SD,"SPEARFISH, CITY OF","SPEARFISH, CITY OF",44.550351,-103.866191,0.1,0.138262,46000098001,SD0020044


In [11]:
# Why aren't all facilities from duplicates showing up?
from utilities import view_all_sic_codes
import pandas as pd
import pathlib
all_wwtp = pd.read_csv(pathlib.PurePath('02_clean_data') / 'all_wwtps_relevant_cols.csv')

# Load the duplicates data
wwtp_duplicates = pd.read_csv(
    pathlib.PurePath('04_results') / 'biosolids_duplicates_sic_code_assignment.csv'
)
# Keep only the rows whose sic_code_type is 'drinking_water'.
wwtp_duplicates_dw = wwtp_duplicates.loc[
    wwtp_duplicates['sic_code_type'].eq('drinking_water')
]

display(wwtp_duplicates_dw)

Unnamed: 0.1,Unnamed: 0,Amount of Biosolids Generated,City,State,Facility Name - Biosolids,Facility Name - all_wwtps,Facility Name - master_match,NPDES ID - Biosolids,NPDES match - all_wwtps,NPDES match - master_match,...,CWNS - Jenn,CWNS - Final,SIC_CODE,CODE_DESCRIPTION,INTEREST_TYPE(s),Notes,Unnamed: 18,Unnamed: 19,Unnamed: 20,sic_code_type
19,19,52.15,GROESBECK,TX,GROESBECK WTP,,,TX0117587,,,...,48003050000.0,48003046001,,,,Probably the same,,,,drinking_water
21,21,2.2,DECATUR,TX,DECATUR WTP,,,TX0136204,,,...,48004230000.0,48004234001,,,,Probably the same,,,,drinking_water
24,24,541.83,HOUSTON,TX,IMPERIAL VALLEY WWTP,Imperial Valley WWTP,IMPERIAL VALLEY WWTP,TX0020478,TX0020478,TX0020478,...,48007040000.0,48007039067,,,,Manual check,,,,drinking_water
33,33,64.22,BELLAIRE,TX,HARRIS COUNTY MUD 24 WWTP,,,TX0113123,,,...,,48007556001,,,,Keep with CWNS number,,,,drinking_water
38,38,1.8,JASPER,TX,FOREST HILLS WWTP,,,TX0031283,,,...,48007870000.0,48007874001,,,,Manual check,,,,drinking_water
43,43,12.63,MENARD,TX,MENARD WTP,,,TX0125458,,,...,48009010000.0,48009009001,,,,Accidental duplication? Manual check,,,,drinking_water


In [15]:
import numpy as np 
from utilities import view_all_sic_codes, lookup_sic_code


def lookup_facility_by_name(facility_name):
    """Show and return the rows whose facility name matches ``facility_name``.

    Reads ``02_clean_data/all_wwtps_relevant_cols.csv`` and
    ``01_raw_data/CWNS_metadata_all.csv`` on every call, filters the first on
    its ``FACILITY`` column and the second on its ``FACILITY_NAME`` column,
    and prints a heading and displays the matches for each.

    Parameters
    ----------
    facility_name : str
        Pattern to search for. It is passed to ``Series.str.contains`` with
        ``case=False``, so matching is case-insensitive and the value is
        treated as a regular expression (the pandas default).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(facility_all_wwtp, facility_cwns)``: the matching rows from the
        all_wwtps file and from the CWNS metadata file, in that order.
    """
    all_wwtp = pd.read_csv(pathlib.PurePath('02_clean_data', 'all_wwtps_relevant_cols.csv'))
    cwns = pd.read_csv(pathlib.PurePath('01_raw_data', 'CWNS_metadata_all.csv'))

    # na=False makes a row with a missing name a non-match. Without it the
    # mask holds NaN for those rows and boolean indexing raises.
    print('Facility in all_wwtps:')
    facility_all_wwtp = all_wwtp[
        all_wwtp['FACILITY'].str.contains(facility_name, case=False, na=False)
    ]
    display(facility_all_wwtp)

    print('Facility in CWNS:')
    facility_cwns = cwns[
        cwns['FACILITY_NAME'].str.contains(facility_name, case=False, na=False)
    ]
    display(facility_cwns)

    return facility_all_wwtp, facility_cwns

# NOTE: despite the `imperial_` prefix, these two frames hold the 'groesbeck' lookup.
imperial_all_wwtps, imperial_facility_cwns = lookup_facility_by_name('groesbeck')
print(imperial_all_wwtps['NPDES_ID'])
view_all_sic_codes(imperial_all_wwtps['NPDES_ID'].iloc[0])

# `all_wwtp` here is the notebook-level frame, not the one read inside
# lookup_facility_by_name.
print(all_wwtp.loc[all_wwtp['NPDES_ID'].eq(imperial_all_wwtps['NPDES_ID'].iloc[0])])

# The two permits being compared for Groesbeck.
npdes_1 = 'TX0054445'  # all wwtps NPDES_ID TX 0054445
npdes_2 = 'TX0117587'  # biosolids NPDES for groesbeck

print(f'{npdes_1} all_wwtps NPDES ID: SIC code is')
print(lookup_sic_code(npdes_1))

print(f'{npdes_2} biosolids NPDES ID: SIC code is')
print(lookup_sic_code(npdes_2))


Facility in all_wwtps:


Unnamed: 0.1,Unnamed: 0,FACILITY_CODE,SOURCE,STATE,FACILITY,CITY,LATITUDE,LONGITUDE,FLOW_2012_MGD,2012_TOT_ANNUAL_MM3,CWNS_NUM,NPDES_ID
11939,14525,TX07380,ICIS2022,TX,GROESBECK WWTP,LIMESTONE COUNTY,31.526944,-96.519444,0.709,0.980278,48003046001,TX0054445


Facility in CWNS:


Unnamed: 0.1,Unnamed: 0,CWNS_NUM,FACILITY_NAME,STATE,PRIMARY_COUNTY,AUTHORITY,PERMIT_NBR,LATITUDE,LONGITUDE,IN 2000 REPORT?,IN 2004 REPORT?,IN 2008 REPORT?,IN 2012 REPORT?,MOST_RECENT_REPORT
6314,6314,48003046001,Groesbeck WWTP,TX,Limestone,GROESBECK,TX0054445,31.5286 N,96.5197 W,True,True,True,True,2012


11939    TX0054445
Name: NPDES_ID, dtype: object
       Unnamed: 0 FACILITY_CODE    SOURCE STATE        FACILITY  \
14525       16312       TX07380  ICIS2022    TX  GROESBECK WWTP   

                   CITY   LATITUDE  LONGITUDE  FLOW_2012_MGD  \
14525  LIMESTONE COUNTY  31.526944 -96.519444          0.709   

       2012_TOT_ANNUAL_MM3  ... PROJ_RES_TOTAL_RECEIVNG_TRMT  \
14525             0.980278  ...                          NaN   

       PRES_RES_ONSITE_WTS  PRES_RES_ONSITE_WTS_UNITS PROJ_RES_ONSITE_WTS  \
14525                  NaN                        NaN                 NaN   

       PROJ_RES_ONSITE_WTS_UNITS  ICIS_2022_TOTAL_MGD ICIS_ACTIVE_2012  \
14525                        NaN                0.709              1.0   

       ICIS_ACTIVE_2022  ACTIVE_STATUS_NOTES LOCATION_DATA_NOTES  
14525               0.0                  NaN                 NaN  

[1 rows x 84 columns]
TX0054445 all_wwtps NPDES ID: SIC code is
[4952, 4952]
TX0117587 biosolids NPDES ID: SIC code is


In [23]:
# Show what view_all_sic_codes returns for NPDES permit TX0071650.
view_all_sic_codes('TX0071650')

Unnamed: 0,NPDES_ID,sic_cod_no,sic_facility_type,flag
0,TX0071650,1,sewer_system,REVIEW
1,TX0071650,2,sewer_system,REVIEW
2,TX0071650,3,sewer_system,REVIEW
3,TX0071650,4,sewer_system,REVIEW
4,TX0071650,5,drinking_water,REVIEW


In [8]:
# Exploring all_wwtps with Christina - 3/13 
import pandas as pd 
import pathlib
from utilities import load_all_wwtps_data

def load_all_wwtps_data():
    """Read the raw all-WWTPs CSV, keeping only the columns listed below.

    NOTE(review): the same cell imports ``load_all_wwtps_data`` from
    ``utilities`` just before this definition, so this local version replaces
    the imported one.

    Returns
    -------
    pandas.DataFrame
        ``01_raw_data/all_wwtps_data.csv`` restricted to the columns in
        ``relevant_wwtps_cols``. No rows are removed.
    """
    # I can add any data cleaning steps later
    source_csv = pathlib.PurePath('01_raw_data', 'all_wwtps_data.csv')
    relevant_wwtps_cols = [
        'FACILITY_CODE',
        'SOURCE',
        'FACILITY',
        'CITY',
        'STATE',
        'LATITUDE',
        'LONGITUDE',
        'FLOW_2012_MGD',
        '2012_TOT_ANNUAL_MM3',
        'CWNS_NUM',
        'NPDES_ID',
    ]

    # Remove any rows that have 0 as NPDES_ID (currently disabled)
    # all_wwtps = all_wwtps[all_wwtps['NPDES_ID'] != '0']

    return pd.read_csv(source_csv, usecols=relevant_wwtps_cols, low_memory=False)

all_wwtp = load_all_wwtps_data()
print(len(all_wwtp))

# keep=False marks every row whose CWNS_NUM occurs more than once, not just the
# repeats, so `duplicates` holds the facility code of each such row.
duplicates = all_wwtp.loc[
    all_wwtp.duplicated(subset=['CWNS_NUM'], keep=False),
    'FACILITY_CODE',
]
display(duplicates.unique())
print(len(duplicates))

16488


array(['IL02007', 'ND01347', 'PR00237', 'TX01517', 'AL02900', 'AL02905',
       'AL03005', 'AL03035', 'AL03080', 'AL03100', 'AL03130', 'AL03135',
       'AL03140', 'AL03145', 'AL03215', 'AL03240', 'AL03250', 'AR03905',
       'AR03920', 'AR04020', 'AZ01570', 'AZ01585', 'AZ01590', 'AZ01600',
       'AZ01610', 'AZ01615', 'AZ01620', 'CA01047', 'CA04960', 'CA04985',
       'CA05000', 'CA05010', 'CA05015', 'CA05025', 'CA05030', 'CA05050',
       'CA05060', 'CO02820', 'CO02830', 'CO02835', 'CO02840', 'FL03810',
       'FL03815', 'FL03820', 'FL03825', 'FL03830', 'FL03835', 'GA03450',
       'GA03455', 'GA03465', 'GA03480', 'GA03485', 'GA03490', 'GA03500',
       'GA03510', 'GA03515', 'GA03525', 'GA03565', 'GA03575', 'GA03600',
       'GA03605', 'GA03610', 'GA03615', 'GA03620', 'GA03625', 'GA03630',
       'GA03635', 'GA03645', 'GA03650', 'GA03660', 'GA03665', 'GA03680',
       'GA03690', 'GA03700', 'GA03725', 'GA03745', 'GA03775', 'IA07845',
       'IA07990', 'IL04660', 'IL04830', 'IL04870', 

339
