In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
import beneficialUseDictionary
from utilityFunctions import *

In [2]:
# Working directory: later cells open the input CSV and write the output CSV
# using paths relative to this folder.
working_dir = "./ProcessedInputData"

# os.chdir() resolves a relative path against the *current* directory, so
# running this cell a second time would look for ./ProcessedInputData inside
# itself and raise. Only change directory when the kernel is not already
# sitting in the target folder, which keeps the cell safe to re-run.
_target_dir_name = os.path.basename(os.path.normpath(working_dir))
if os.path.basename(os.getcwd()) != _target_dir_name:
    os.chdir(working_dir)

In [3]:
# Source extract (CSV) that this notebook reads.
fileInput1 = "Person_Plus_EXTRACT_FromWRTSnotGWIS.csv"

# Destination CSV for the generated water sources table.
out_put = "watersources.csv"

In [4]:
# WaDE water sources table: output column names, in output order.
columns = [
    'WaterSourceUUID',
    'WaterSourceNativeID',
    'WaterSourceName',
    'WaterSourceTypeCV',
    'WaterQualityIndicatorCV',
    'GNISFeatureNameCV',
    'Geometry',
]

# Type names kept alongside the column list (they look like database column
# types). Nothing in the cells shown here reads this variable.
dtypesx = [
    'BigInt	NVarChar(250)	NVarChar(250)	NVarChar(250)	NVarChar(100)	NVarChar(100)',
    'NVarChar(250)	Geometry',
]

In [5]:
# Target DataFrame: starts with no rows and only the WaDE column layout;
# the cells below populate it.

# TODO: assumes dtypes inferred from CO file
outdf100 = pd.DataFrame(columns=columns)

In [6]:
print("Reading inputs...")

# Read the extract as ISO-8859-1 (encoding="utf-8" is the alternative to try
# if the file is exported differently).
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")

# Number of rows read.
print(len(df100))

# For a quick test run, uncomment to keep only the first rows:
# df100 = df100.head(10000)

# df100 = df100.replace('', np.nan)

# Preview the first five rows.
df100.head()

Reading inputs...
489112


Unnamed: 0,X,Y,OBJECTID,OBJECTID_1,WaRecID,WaRecId_1,WR_Doc_ID,WaRecPhaseId,WaRecPhasePartyRoleTypeCode,PersonLastOrOrganizationNM,...,PriorityDate,WaRecProcessStatusTypeCode,WaRecClaimTypeCode,WaRecPhaseTypeCode,WaRecPhaseStageTypeCode,InstantaneousQuantity,AnnualVolumeQuantity,IrrigatedAreaQuantity,InstantaneousUnitCode,PurposeOfUseTypeCodes
0,0,0,1,6666660,2132706,2132706,2132706,9971,Primary,WA Fish & Wildlife Dept - CRO,...,1973-12-07T00:00:00.000,Active,,Certificate,,12.0,,,CFS,FS
1,0,0,2,6809900,2132706,2132706,2132706,9971,Primary,WA Fish & Wildlife Dept - CRO,...,1973-12-07T00:00:00.000,Active,,Certificate,,12.0,,,CFS,FS
2,0,0,3,6811534,2132706,2132706,2132706,9971,Primary,WA Fish & Wildlife Dept - CRO,...,1973-12-07T00:00:00.000,Active,,Certificate,,12.0,,,CFS,FS
3,0,0,4,6813072,2132706,2132706,2132706,9971,Primary,WA Fish & Wildlife Dept - CRO,...,1973-12-07T00:00:00.000,Active,,Certificate,,12.0,,,CFS,FS
4,0,0,5,6814107,2132706,2132706,2132706,9971,Primary,WA Fish & Wildlife Dept - CRO,...,1973-12-07T00:00:00.000,Active,,Certificate,,12.0,,,CFS,FS


In [7]:
# Show the column names of the input extract as a plain list.
df100.columns.tolist()

['X',
 'Y',
 'OBJECTID',
 'OBJECTID_1',
 'WaRecID',
 'WaRecId_1',
 'WR_Doc_ID',
 'WaRecPhaseId',
 'WaRecPhasePartyRoleTypeCode',
 'PersonLastOrOrganizationNM',
 'PersonFirstNM',
 'PersonMINM',
 'PersonAddressLine1AD',
 'PersonAddressLine2AD',
 'PersonAddressLine3AD',
 'PersonAddressCityAD',
 'PersonAddressZipCodeAD',
 'WaRecRCWClassTypeCode',
 'EcologyRegionCode',
 'WaRecPrimaryNumber',
 'PriorityDate',
 'WaRecProcessStatusTypeCode',
 'WaRecClaimTypeCode',
 'WaRecPhaseTypeCode',
 'WaRecPhaseStageTypeCode',
 'InstantaneousQuantity',
 'AnnualVolumeQuantity',
 'IrrigatedAreaQuantity',
 'InstantaneousUnitCode',
 'PurposeOfUseTypeCodes']

In [8]:
print ("Directly mapped columns")

# Output columns and the input columns copied into them, paired by position.
destCols = ['WaterSourceTypeCV']
srsCols = ['WaRecRCWClassTypeCode']

# Copy each source column into its destination column one pair at a time.
for dest_name, src_name in zip(destCols, srsCols):
    outdf100[dest_name] = df100[src_name]

Directly mapped columns


In [9]:
print("Hard coded ...")

# Constant values written to every row of the output frame.
outdf100["WaterSourceName"] = "Unspecified"
outdf100["WaterQualityIndicatorCV"] = "Fresh"

Hard coded ...


In [10]:
print("Dropping duplicates...")

# Row count before de-duplication.
print(len(outdf100))

# Keep one row per distinct (name, type) pair and renumber the index from 0.
# WaterQualityIndicatorCV is deliberately not part of the key.
dedup_keys = ['WaterSourceName', 'WaterSourceTypeCV']
outdf100 = outdf100.drop_duplicates(subset=dedup_keys).reset_index(drop=True)

# Row count after de-duplication.
print(len(outdf100))

outdf100

Dropping duplicates...
489112
4


Unnamed: 0,WaterSourceUUID,WaterSourceNativeID,WaterSourceName,WaterSourceTypeCV,WaterQualityIndicatorCV,GNISFeatureNameCV,Geometry
0,,,Unspecified,surfaceWater,Fresh,,
1,,,Unspecified,groundwater,Fresh,,
2,,,Unspecified,reservoir,Fresh,,
3,,,Unspecified,,Fresh,,


In [11]:
print ("Water source native id and UUID")

# 9.12.19 Adel: For water sources table, how about we do an incremental ID? like 1, 2, 3 etc?
# Renumber the index first so the ids 1..n line up with row order.
outdf100 = outdf100.reset_index(drop=True)
outdf100['WaterSourceNativeID'] = range(1, len(outdf100.index) + 1)

print("Adding UUID...")
# Build "WA_<native id>" with a vectorized string concatenation rather than a
# row-wise DataFrame.apply: the values are the same, there is no per-row Python
# call, and the result is still a Series when the frame has no rows.
outdf100['WaterSourceUUID'] = "WA_" + outdf100['WaterSourceNativeID'].astype(str)

# Blank out the remaining missing values so they are written as empty strings.
outdf100 = outdf100.replace(np.nan, '')

outdf100

Water source native id and UUID
Adding UUID...


Unnamed: 0,WaterSourceUUID,WaterSourceNativeID,WaterSourceName,WaterSourceTypeCV,WaterQualityIndicatorCV,GNISFeatureNameCV,Geometry
0,WA_1,1,Unspecified,surfaceWater,Fresh,,
1,WA_2,2,Unspecified,groundwater,Fresh,,
2,WA_3,3,Unspecified,reservoir,Fresh,,
3,WA_4,4,Unspecified,,Fresh,,


In [13]:
print("Unknown source type")

# Rows without a source type get the 'Unknown' label. Both empty strings and
# missing values (NaN/None) count as "no type", so this cell gives the same
# result whether or not NaN has already been converted to '' beforehand.
source_type = outdf100['WaterSourceTypeCV']
has_no_type = source_type.isna() | (source_type == '')
outdf100.loc[has_no_type, 'WaterSourceTypeCV'] = 'Unknown'

outdf100

Unknown source type


Unnamed: 0,WaterSourceUUID,WaterSourceNativeID,WaterSourceName,WaterSourceTypeCV,WaterQualityIndicatorCV,GNISFeatureNameCV,Geometry
0,WA_1,1,Unspecified,surfaceWater,Fresh,,
1,WA_2,2,Unspecified,groundwater,Fresh,,
2,WA_3,3,Unspecified,reservoir,Fresh,,
3,WA_4,4,Unspecified,Unknown,Fresh,,


In [14]:
# Write the water sources table to CSV (UTF-8, without the index column).
outdf100.to_csv(out_put, encoding="utf-8", index=False)
print("Done watersources")

Done watersources
