In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os

In [2]:
# Working directory: the file names used by the cells below are resolved against it.
working_dir = "./ProcessedInputData"

# The path is relative, so calling os.chdir() again after it already succeeded
# would look for a nested "ProcessedInputData" folder and raise. Skip the call
# when the current directory already carries the target's name, so this cell
# can be re-run without restarting the kernel.
if os.path.basename(os.getcwd()) != os.path.basename(os.path.normpath(working_dir)):
    os.chdir(working_dir)

In [3]:
# Input files
# Excel workbook passed to pd.read_excel further down (sheet "ewrims_flat_file").
fileInput1 = "EWRIMS MASTER FLAT FILE DATA DICTIONARY DRAFT 1-17-20.xlsx" 

# output water sources
# CSV file that the final cell writes the finished water-source table to.
out_put = "watersources.csv"

In [4]:
##### WaDE columns

# Column headers of the water-sources output table, in output order.
columns = [
    'WaterSourceUUID',
    'WaterSourceNativeID',
    'WaterSourceName',
    'WaterSourceTypeCV',
    'WaterQualityIndicatorCV',
    'GNISFeatureNameCV',
    'Geometry',
]

# Type labels, stored as two tab-delimited strings.
# NOTE(review): this lists 8 labels for 7 columns and no cell visible in this
# notebook reads it -- confirm whether it is still needed.
dtypesx = [
    "\t".join(['BigInt', 'NVarChar(250)', 'NVarChar(250)', 'NVarChar(250)',
               'NVarChar(100)', 'NVarChar(100)']),
    "\t".join(['NVarChar(250)', 'Geometry']),
]

In [5]:
### target dataFrame

# Empty frame that carries only the WaDE column headers; later cells fill it in.
# TODO: assumes dtypes inferred from CO file
# NOTE(review): no dtypes are passed here, and the "CO file" mentioned in the
# TODO is not visible in this notebook -- confirm what it refers to.
outdf100=pd.DataFrame(columns=columns)

In [6]:
print("Reading inputs...")

# Read the "ewrims_flat_file" sheet; its first row holds the column headers.
# No `encoding` argument is passed: read_excel does not accept that keyword in
# current pandas releases (it raises TypeError), and an .xlsx workbook does not
# need one.
df100 = pd.read_excel(fileInput1, header=0, sheet_name="ewrims_flat_file", skiprows=0)

# Remove rows that are duplicated across every column, then report how many remain.
df100 = df100.drop_duplicates()
print (len(df100.index))

#df100 = df100.head(10000)  # uncomment to test on the first 10000 rows only

# Replace missing values with '' so later cells can compare against the empty string.
df100 = df100.replace(np.nan, '')
df100

Reading inputs...
57736


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,PETITION_STATUS_TYPE,DATE_RECEIVED,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE
0,0,,,,,Not Determined,,,,,...,,,,,,ENF03549,1573084800000000000,1577750400000000000,Santa Clara,98
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,,,,,,,0
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,,,,,,,0
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,,,,,,,0
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,Pending,1578441600000000000,,1578564981000000000,34,,,,,0
5,5,A000026,36,4,36,Appropriative,Licensed,A000026,,,...,,,,,,,,,,0
6,6,A000027,3165,31,3165,Appropriative,Licensed,A000027,,,...,Completed,1367798400000000000,1372636800000000000,1372778637000000000,4,,,,,0
7,7,A000027A,2762,29,2762,Appropriative,Revoked,A000027A,,,...,,,,,,,,,,0
8,8,A000042,211,274,211,Appropriative,Licensed,A000042,,,...,,,,,,,,,,0
9,9,A000051,622,81,622,Appropriative,Licensed,A000051,,,...,,,,,,,,,,0


In [7]:
# Show every column name of the input sheet as a plain Python list.
df100.columns.tolist()

['WR_WATER_RIGHT_ID',
 'APPLICATION_NUMBER',
 'CERTIFICATE_ID',
 'PERMIT_ID',
 'LICENSE_ID',
 'WATER_RIGHT_TYPE',
 'WATER_RIGHT_STATUS',
 'APPLICATION_NUMBER_PARTY',
 'PWSS_ID',
 'PRIORITY_DATE',
 'RECEIPT_DATE',
 'REJECTION_DATE',
 'APPLICATION_RECD_DATE',
 'APPLICATION_ACCEPTANCE_DATE',
 'PROJECT_TYPE',
 'RECORD_SUMMARY',
 'INCOMPLETE_STATEMENT',
 'NUMBER_OF_PROTESTS',
 'AGENT_NAME',
 'AGENT_ENTITY_TYPE',
 'APPLICATION_PRIMARY_OWNER',
 'PRIMARY_OWNER_ENTITY_TYPE',
 'SUB_TYPE',
 'INI_REPORTED_DIV_AMOUNT',
 'INI_REPORTED_DIV_UNIT',
 'FACE_VALUE_AMOUNT',
 'FACE_VALUE_UNITS',
 'FEE_DUE',
 'FEE_RECEIVED',
 'APPL_FEE_AMOUNT',
 'APPL_FEE_AMT_RECD',
 'MAX_DD_APPL',
 'MAX_DD_UNITS',
 'MAX_DD_ANN',
 'MAX_STORAGE',
 'MAX_TAKEN_FROM_SOURCE',
 'YEAR_DIVERSION_COMMENCED',
 'MAX_BENEFICIALLY_USED',
 'SUPPLEMENTAL_STATEMENT_CYCLE',
 'TYPE_OF_DIVERSION_FACILITY',
 'QUANTITY_OF_WATER_DIVERTED',
 'QOW_DIVERTED_UNIT',
 'QUANTITY_MEASUREMENT_YEAR',
 'MAX_RATE_OF_DIVERSION',
 'MAX_RATE_OF_DIV_UNIT',
 'REC

In [8]:
print ("Directly mapped columns")

# Destination (WaDE) columns and the input columns that feed them, paired by position.
destCols = ['WaterSourceName', 'WaterSourceTypeCV']
srsCols = ['SOURCE_NAME', 'SOURCE_TYPE']

# Copy each input column into its destination column, one pair at a time.
for wade_col, src_col in zip(destCols, srsCols):
    outdf100[wade_col] = df100[src_col]

Directly mapped columns


In [9]:
print("Hard coded ...")

# Every water source gets the same quality indicator.
outdf100['WaterQualityIndicatorCV'] = "Fresh"

# Wherever the name or the type is an empty string, substitute a placeholder label.
for column_name, placeholder in (('WaterSourceName', "Unspecified"),
                                 ('WaterSourceTypeCV', "Unknown")):
    is_blank = outdf100[column_name] == ''
    outdf100.loc[is_blank, column_name] = placeholder

Hard coded ...


In [10]:
print("Dropping duplicates...")

# Row count before de-duplication.
print(outdf100.shape[0])

# Keep one row per distinct WaterSourceName (the other columns are not part of
# the comparison) and renumber the index from 0.
outdf100 = (
    outdf100
    .drop_duplicates(subset=['WaterSourceName'])
    .reset_index(drop=True)
)

# Row count after de-duplication.
print(outdf100.shape[0])

outdf100

Dropping duplicates...
57736
8619


Unnamed: 0,WaterSourceUUID,WaterSourceNativeID,WaterSourceName,WaterSourceTypeCV,WaterQualityIndicatorCV,GNISFeatureNameCV,Geometry
0,,,Unspecified,Unknown,Fresh,,
1,,,LAKE DOMINGO,Surface,Fresh,,
2,,,ICY SPRING NO 2,Unknown,Fresh,,
3,,,SACRAMENTO RIVER,Unknown,Fresh,,
4,,,SAN JOAQUIN RIVER,Unknown,Fresh,,
5,,,ALDER CREEK,Unknown,Fresh,,
6,,,UNCR,Unknown,Fresh,,
7,,,LEE VINING CREEK,Unknown,Fresh,,
8,,,RUSH CREEK,Unknown,Fresh,,
9,,,WILLOW CREEK,Unknown,Fresh,,


In [11]:
print ("Water source native id and UUID")

#9.12.19 Adel: For water sources table, how about we do an incremental ID? like 1, 2, 3 etc?
# Renumber the index from 0, then number the rows 1..N as the native ID.
outdf100 = outdf100.reset_index(drop=True)
outdf100['WaterSourceNativeID'] = range(1, len(outdf100.index) + 1)

print("Adding UUID...")
# UUID is "CA_" followed by the native ID. Built with a single column-wise
# string concatenation instead of calling a Python lambda once per row; the
# resulting values are the same ("CA_1", "CA_2", ...).
outdf100['WaterSourceUUID'] = "CA_" + outdf100['WaterSourceNativeID'].astype(str)

# Replace the remaining missing values with '' so they do not show up as NaN.
outdf100 = outdf100.replace(np.nan, '')

outdf100

Water source native id and UUID
Adding UUID...


Unnamed: 0,WaterSourceUUID,WaterSourceNativeID,WaterSourceName,WaterSourceTypeCV,WaterQualityIndicatorCV,GNISFeatureNameCV,Geometry
0,CA_1,1,Unspecified,Unknown,Fresh,,
1,CA_2,2,LAKE DOMINGO,Surface,Fresh,,
2,CA_3,3,ICY SPRING NO 2,Unknown,Fresh,,
3,CA_4,4,SACRAMENTO RIVER,Unknown,Fresh,,
4,CA_5,5,SAN JOAQUIN RIVER,Unknown,Fresh,,
5,CA_6,6,ALDER CREEK,Unknown,Fresh,,
6,CA_7,7,UNCR,Unknown,Fresh,,
7,CA_8,8,LEE VINING CREEK,Unknown,Fresh,,
8,CA_9,9,RUSH CREEK,Unknown,Fresh,,
9,CA_10,10,WILLOW CREEK,Unknown,Fresh,,


In [12]:
# Write the water-source table to the output CSV (UTF-8, without the index column).
outdf100.to_csv(path_or_buf=out_put, encoding="utf-8", index=False)
print("Done watersources")

Done watersources
