In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
import beneficialUseDictionary
from utilityFunctions import *

In [2]:
# Working directory: the file names used by the later cells are relative to it.
working_dir = "./ProcessedInputData"

# The path is relative, so calling os.chdir() a second time from inside the
# directory would look for ProcessedInputData/ProcessedInputData and fail.
# Only change directory when the kernel is not already there, so that this
# cell can be re-run safely.
if os.path.basename(os.getcwd()) != os.path.basename(os.path.normpath(working_dir)):
    os.chdir(working_dir)

In [3]:
# Input: CSV passed to pd.read_csv further down
fileInput1 = 'WaterRights_Diversion.csv'

# Output: CSV that receives the water sources table
out_put = 'watersources.csv'

In [4]:
##### WaDE columns

# Column layout of the output water sources table, in output order.
columns = [
    'WaterSourceUUID',
    'WaterSourceNativeID',
    'WaterSourceName',
    'WaterSourceTypeCV',
    'WaterQualityIndicatorCV',
    'GNISFeatureNameCV',
    'Geometry',
]

# Tab-separated type names kept as two strings.
# NOTE(review): not referenced by any visible cell, and it lists 8 types for the
# 7 columns above -- presumably the target database types; confirm before use.
dtypesx = [
    "\t".join(['BigInt', 'NVarChar(250)', 'NVarChar(250)', 'NVarChar(250)',
               'NVarChar(100)', 'NVarChar(100)']),
    "\t".join(['NVarChar(250)', 'Geometry']),
]

In [5]:
### Target DataFrame: no rows yet, only the WaDE column layout

# TODO: assumes dtypes inferred from CO file
outdf100 = pd.DataFrame(data=None, columns=columns)

In [6]:
print("Reading inputs...")

# The file is decoded as ISO-8859-1; "utf-8" is the alternative encoding to try.
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")

# Number of rows read
print(df100.shape[0])

# For a quick test run, uncomment to keep only the first 10000 rows:
# df100 = df100.head(10000)

# df100 = df100.replace('', np.nan)
df100.head()

Reading inputs...
13797


Unnamed: 0,WRKEY,BOCA_CD,WTR_ID,WR_NUMBER,WRTE_DESCR,WRTE_CD,WRST_DESCR,WRST_CD,VERS_ID_SEQ,VERS_TYPE,...,SCTN,QTR,GOVT_LOT,SPX,SPY,WELL_DPTH,RES,GEOCODES,DTM_CREATED,Unnamed: 46
0,200059-1,39E,113577,39E 113577 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,15.0,SWNESE,,,,,,4208591540101A400,20191212,
1,304174-1,39E,115446,39E 115446 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,25.0,SWSESW,,,,,,4208582530101A400,20191212,
2,304174-1,39E,115446,39E 115446 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,25.0,SWSESW,,,,,,4208582530101A400,20191212,
3,203253-1,39E,115458,39E 115458 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,35.0,NWNWNW,,,,,,,20191212,
4,203253-1,39E,115458,39E 115458 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,35.0,NWNWNW,,,,,,,20191212,


In [7]:
# Show every input column name as a plain list
df100.columns.tolist()

['WRKEY',
 'BOCA_CD',
 'WTR_ID',
 'WR_NUMBER',
 'WRTE_DESCR',
 'WRTE_CD',
 'WRST_DESCR',
 'WRST_CD',
 'VERS_ID_SEQ',
 'VERS_TYPE',
 'ENF_PRTY_DT_DATE',
 'MAJOR_TYPE',
 'SOURCE_NAME',
 'MODV_DESCR',
 'PURT_DESCR',
 'PURT_CD',
 'LST_NM_OR_BUSN_NM',
 'FST_NM',
 'MID_INT',
 'SUFX',
 'OWNER',
 'MAX_FLOW',
 'MAX_VOL',
 'MAX_ACRES',
 'SCANNED',
 'DIV_CNT',
 'POU_CNT',
 'IRR_CNT',
 'RESV_CNT',
 'ISSUE_RMK',
 'DITCH',
 'HIST_TYPE',
 'POD_NO',
 'COUNTY',
 'CNTY_CD',
 'COUNTY_ST_CD',
 'TR',
 'SCTN',
 'QTR',
 'GOVT_LOT',
 'SPX',
 'SPY',
 'WELL_DPTH',
 'RES',
 'GEOCODES',
 'DTM_CREATED',
 'Unnamed: 46']

In [9]:
print("Source name and type")

def mapSource(inString1):
    """Translate a source-type code into a water source type label.

    The argument is converted with str() and stripped of surrounding
    whitespace before it is compared, so non-string values (None, NaN,
    numbers) are accepted and fall through to 'Unknown'.

    Parameters
    ----------
    inString1 : any
        Raw code. 'S' and 'G' are the recognised values; the comparison is
        case-sensitive.

    Returns
    -------
    str
        "Surface" for 'S', "Ground" for 'G', 'Unknown' for anything else.
    """
    # A lookup table replaces the if/elif chain. The former bare ``except:``
    # is dropped: nothing inside it could raise, and the only call that can
    # (str()) was outside the try block anyway.
    source_types = {'S': "Surface", 'G': "Ground"}
    return source_types.get(str(inString1).strip(), 'Unknown')
        
            
# Derive the source type from MAJOR_TYPE. Mapping the single column calls
# mapSource once per value without constructing a Series for every row, which
# is what a row-wise apply(axis=1) does. Missing values are still passed to
# mapSource and come back as 'Unknown'.
df100['WaterSourceTypeCV'] = df100['MAJOR_TYPE'].map(mapSource)

# Copy name and type into the output frame: destCols[i] receives srsCols[i].
destCols = ['WaterSourceName', 'WaterSourceTypeCV']
srsCols = ['SOURCE_NAME', 'WaterSourceTypeCV']

outdf100[destCols] = df100[srsCols]

Source name and type


In [10]:
print("Hard coded ...")

# Every water source gets the same quality indicator.
# Bracket assignment is used on purpose: attribute-style assignment
# (outdf100.X = ...) only updates a column that already exists, and otherwise
# sets a plain Python attribute on the object instead of adding a column.
outdf100['WaterQualityIndicatorCV'] = "Fresh"

Hard coded ...


In [11]:
print("Dropping duplicates...")

# Row count before de-duplication
print(len(outdf100))

# Rows are compared on name and type only
# ('WaterQualityIndicatorCV' is deliberately left out of the subset);
# the index is renumbered afterwards.
outdf100 = (
    outdf100
    .drop_duplicates(subset=['WaterSourceName', 'WaterSourceTypeCV'])
    .reset_index(drop=True)
)

# Row count after de-duplication
print(len(outdf100))

outdf100

Dropping duplicates...
13797
1632


Unnamed: 0,WaterSourceUUID,WaterSourceNativeID,WaterSourceName,WaterSourceTypeCV,WaterQualityIndicatorCV,GNISFeatureNameCV,Geometry
0,,,GROUNDWATER,Ground,Fresh,,
1,,,UNNAMED TRIBUTARY OF BOXELDER CREEK,Surface,Fresh,,
2,,,UNNAMED TRIBUTARY OF COW CREEK,Surface,Fresh,,
3,,,"SPRING, UNNAMED TRIBUTARY OF BOXELDER CREEK",Ground,Fresh,,
4,,,"SPRING, UNNAMED TRIBUTARY OF FLASTED CREEK",Ground,Fresh,,
5,,,"SPRING, UNNAMED TRIBUTARY OF NORTH THOMPSON CREEK",Ground,Fresh,,
6,,,"UNNAMED TRIBUTARY OF WILLOW CREEK, NORTH FORK",Surface,Fresh,,
7,,,"SPRING, UNNAMED TRIBUTARY OF WILLOW CREEK, NOR...",Ground,Fresh,,
8,,,"SPRING, UNNAMED TRIBUTARY OF LITTLE BEAVER CREEK",Ground,Fresh,,
9,,,"SPRING, UNNAMED TRIBUTARY OF SHEEP CAMP CREEK",Ground,Fresh,,


In [12]:
print("Water source native id and UUID")

# 9.12.19 Adel: For water sources table, how about we do an incremental ID? like 1, 2, 3 etc?
# The native id is a 1-based running number over the rows.
outdf100 = outdf100.reset_index(drop=True)
outdf100['WaterSourceNativeID'] = range(1, len(outdf100.index) + 1)

print("Adding UUID...")
# UUID = "MT_" followed by the native id. It is built on the whole column at
# once rather than with a row-wise apply(axis=1), which creates a Series per row.
outdf100['WaterSourceUUID'] = "MT_" + outdf100['WaterSourceNativeID'].astype(str)

# Columns that were never populated hold NaN; replace them with empty strings.
outdf100 = outdf100.replace(np.nan, '')

outdf100

Water source native id and UUID
Adding UUID...


Unnamed: 0,WaterSourceUUID,WaterSourceNativeID,WaterSourceName,WaterSourceTypeCV,WaterQualityIndicatorCV,GNISFeatureNameCV,Geometry
0,MT_1,1,GROUNDWATER,Ground,Fresh,,
1,MT_2,2,UNNAMED TRIBUTARY OF BOXELDER CREEK,Surface,Fresh,,
2,MT_3,3,UNNAMED TRIBUTARY OF COW CREEK,Surface,Fresh,,
3,MT_4,4,"SPRING, UNNAMED TRIBUTARY OF BOXELDER CREEK",Ground,Fresh,,
4,MT_5,5,"SPRING, UNNAMED TRIBUTARY OF FLASTED CREEK",Ground,Fresh,,
5,MT_6,6,"SPRING, UNNAMED TRIBUTARY OF NORTH THOMPSON CREEK",Ground,Fresh,,
6,MT_7,7,"UNNAMED TRIBUTARY OF WILLOW CREEK, NORTH FORK",Surface,Fresh,,
7,MT_8,8,"SPRING, UNNAMED TRIBUTARY OF WILLOW CREEK, NOR...",Ground,Fresh,,
8,MT_9,9,"SPRING, UNNAMED TRIBUTARY OF LITTLE BEAVER CREEK",Ground,Fresh,,
9,MT_10,10,"SPRING, UNNAMED TRIBUTARY OF SHEEP CAMP CREEK",Ground,Fresh,,


In [13]:
# Write the water sources table as UTF-8 CSV, leaving out the index column
outdf100.to_csv(out_put, encoding="utf-8", index=False)

print("Done watersources")

Done watersources
