# Pre-processing Oregon Allocation data for WaDE upload.

Purpose:  To pre-process the Oregon data into one master file for simple DataFrame creation and extraction

Useful Links to Data:

- Data Available (use 'Statewide Water Right Spatial Data with Metadata'): https://www.oregon.gov/OWRD/access_Data/Pages/Data.aspx

- POD metadata: https://arcgis.wrd.state.or.us/data/wr_pod_metadata.pdf

- POU metadata: https://arcgis.wrd.state.or.us/data/wr_pou_metadata.pdf

In [1]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
# All input/output file names used after this point are relative and resolve
# against this folder once the process has changed into it.
# NOTE(review): absolute, machine-specific path; it has to exist on the machine running the notebook.
workingDir = "G:/Shared drives/WaDE Data/Oregon/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Point of Diversion Data

In [3]:
# Read the raw point-of-diversion (POD) export.
inputFile = "ORwr_v_pod_public_input.zip"
dfinPOD = pd.read_csv(inputFile, encoding = "ISO-8859-1")

# WaDE UUID tracker for data assessment.
# On the first run only, tag every raw row with an ID built from its row
# position and write the tagged table back over the input archive, so later
# runs read the same IDs instead of regenerating them.
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "orD" + dfinPOD.index.astype(str)
    # Derive both the target archive and its CSV member name from inputFile so
    # the file that is read and the file that is rewritten can never drift apart.
    csvMemberName = os.path.splitext(inputFile)[0] + '.csv'
    dfinPOD.to_csv(inputFile, compression=dict(method='zip', archive_name=csvMemberName), index=False)

# Drop fully identical rows and renumber before any derived columns are added.
dfinPOD = dfinPOD.drop_duplicates().reset_index(drop=True)
print(len(dfinPOD))
dfinPOD.head()

  dfinPOD = pd.read_csv(inputFile, encoding = "ISO-8859-1")


189822


Unnamed: 0,WaDEUUID,ReasonRemoved,IncompleteField,OBJECTID,snp_id,app_nbr,pod_display,permit_nbr,acre_feet,acre_feet_est,agency,app_char,begin_day,begin_month,cert_nbr,claim_char,claim_nbr,decree_title,duty,end_day,end_month,feature_quality_code,last_updt_date,max_rate_acre_feet,max_rate_cfs,name_company,name_first,name_last,permit_char,pod_char,pod_display_short,pod_location_id,pod_nbr,pod_use_id,POINT_X,POINT_Y,priority_date,rate_cfs,rate_cfs_est,rec_creation_date,remarks,source,source_type,stream_name,streamcode,supplemental,technician_initials,transfer_nbr,tributary_to,use_category,use_code,use_code_description,field_51,wr_type,wris_link,geometry
0,orD0,,,1,21755,11987.0,Permit: G 10961 * MI,10961.0,,0,OWRD,G,1.0,1.0,,,,,,31.0,12.0,,6/1/1996 0:00,,0.04,FORMOSA EXPLORATION INC.,,,G,,G 10961,6909,1,26859,539412.5499,416705.7999,11/21/1989 0:00,0.04,0,6/1/1996 0:00,0 G 10961 1,FORMOSA 1 ADIT ...,WE,UNN STR > MIDDLE CR,1.61e+19,0,MIGRT,,CANYON CREEK,0,MI,MINING,,GW,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,
1,orD1,,,2,21755,11987.0,Permit: G 10961 * MI,10961.0,,0,OWRD,G,1.0,1.0,,,,,,31.0,12.0,,6/1/1996 0:00,,0.005,FORMOSA EXPLORATION INC.,,,G,,G 10961,6910,2,26860,539232.9167,416251.9918,11/21/1989 0:00,0.005,0,6/1/1996 0:00,0 G 10961 2,SILVER BUTTE 1 ADIT ...,WE,UNN STR > MIDDLE CR,1.61e+19,0,MIGRT,,CANYON CREEK,0,MI,MINING,,GW,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,
2,orD10,,,11,24155,14637.0,Permit: G 13525 * MI,13525.0,,0,ESU,G,1.0,1.0,,,,,,31.0,12.0,,10/17/2003 0:00,,0.1,PARKIN FAMILY LLC,,,G,,G 13525,10776,1,31422,622301.2333,1396809.583,11/14/1997 0:00,0.1,0,10/17/2003 0:00,,A WELL ...,WE,RODERICK CR > GALES CR,2110000000000000.0,0,RL,,RODERICK CREEK,0,MI,MINING,,GW,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,
3,orD100,,,101,56343,8486.0,Cert:3967 OR * MI,5503.0,,0,OWRD,S,1.0,1.0,3967.0,,,,,31.0,12.0,,6/1/2001 0:00,,2.5,,G A,BAKER,S,,3967,53167,1,62463,556503.1083,304686.7418,7/1/1922 0:00,2.5,0,6/1/2001 0:00,3967 S 5503 1,COVE CREEK ...,ST,COVE CR > JUMPOFF JOE CR,15200000000000.0,0,MIGRT,,JUMPOFF JOE CREEK,0,MI,MINING,,SW,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,
4,orD1000,,,1001,23642,12797.0,Permit: G 13007 * CR,13007.0,,0,OWRD,G,1.0,1.0,,,,,,31.0,12.0,30.0,2/26/2020 7:57,,0.5,JACKSON FAMILY TRUST,JAMES,JACKSON,G,,G 13007,9919,4,30338,282398.8333,522242.3917,3/2/1992 0:00,0.0951,1,2/26/2020 0:00,COPIED FROM EXISTING DATA,A SUMP,SM,CEDAR CR > BEAR CR,17200000000000.0,0,DAM,,BEAR CREEK BASIN,1,CR,CRANBERRY,,GW,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,


In [4]:
# For creating WaterSourceTypeCV
# Lookup from the raw wr_type code to the water source type label.
WSTypeDict = {
    "ST": "Storage",
    "GW": "Groundwater",
    "SW": "Surface Water"}
def assignWaterSourceTypeCV(colrowValue):
    """Return the water source type label for a raw wr_type code.

    The value is converted to text and stripped before the lookup. Anything
    that is not a key of WSTypeDict -- blanks, missing values (which become
    the text "nan" / "None"), unknown codes -- yields "WaDE Unspecified".
    """
    # A plain .get() replaces the former bare `except:` around the lookup and
    # the null check that could never fire once the value had been stringified.
    codeText = str(colrowValue).strip()
    return WSTypeDict.get(codeText, "WaDE Unspecified")

# Translate every wr_type code, then list the distinct labels that came out.
dfinPOD['in_WaterSourceTypeCV'] = dfinPOD['wr_type'].map(assignWaterSourceTypeCV)
dfinPOD['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water', 'Storage'], dtype=object)

In [5]:
# Re-project the POD POINT_X / POINT_Y coordinates from EPSG:2992 to EPSG:4326.
from pyproj import Transformer, transform
transformer = Transformer.from_proj(2992, 4326)

# For converting projection latitude.
def assignLat(colrowValueLat, colrowValueLong):
    """Return the first value of the transformed pair (used as latitude).

    Despite the parameter names, this notebook passes the source POINT_X and
    POINT_Y values here, in that order.
    """
    lat, long = transformer.transform(colrowValueLat, colrowValueLong)
    return lat

# For converting projection longitude.
def assignLong(colrowValueLat, colrowValueLong):
    """Return the second value of the transformed pair (used as longitude).

    Takes the same inputs, in the same order, as assignLat.
    """
    lat, long = transformer.transform(colrowValueLat, colrowValueLong)
    return long

# Transform both coordinate columns in a single call. The previous row-by-row
# applies re-projected every point twice (once for each output column).
latValues, longValues = transformer.transform(
    dfinPOD['POINT_X'].to_numpy(dtype=float),
    dfinPOD['POINT_Y'].to_numpy(dtype=float))
dfinPOD['in_Latitude'] = latValues
dfinPOD['in_Longitude'] = longValues

In [6]:
# For creating Site Name
def assignSiteName(colrowValueA, colrowValueB):
    """Build a site name of the form "<A>_<B>" from two values.

    Returns "WaDE Unspecified" only when both values are empty strings or
    both are missing; otherwise each value is converted to text, stripped and
    joined with an underscore.
    """
    bothBlank = colrowValueA == '' and colrowValueB == ''
    if bothBlank or (pd.isnull(colrowValueA) and pd.isnull(colrowValueB)):
        return "WaDE Unspecified"
    return "_".join(str(part).strip() for part in (colrowValueA, colrowValueB))

# Site name = "<snp_id>_<pod_nbr>" for every POD row.
dfinPOD['in_SiteName'] = [
    assignSiteName(snpId, podNbr)
    for snpId, podNbr in zip(dfinPOD['snp_id'], dfinPOD['pod_nbr'])
]

In [7]:
# For creating SiteTypeCV
# Lookup from the raw source_type code to the site type label.
STCVDict = {
"LK" : "lake",
"DR" : "drain",
"SP" : "spring",
"ST" : "stream",
"SL" : "slough",
"WW" : "waste water",
"WE" : "well",
"WR" : "winter runoff",
"SM" : "sump",
"PD" : "pond",
"RS" : "reservoir",
"DT" : "ditch",
"SE" : "sewage effluent",
"CN" : "canal"}
def assignSiteTypeCV(colrowValue):
    """Return the site type label for a raw source_type code.

    Missing values, blanks and codes that are not keys of STCVDict all yield
    "WaDE Unspecified".
    """
    if pd.isnull(colrowValue):
        return "WaDE Unspecified"
    # str() first: the former bare .strip() sat outside the try block and
    # raised AttributeError for any non-string code (e.g. a number).
    codeText = str(colrowValue).strip()
    return STCVDict.get(codeText, "WaDE Unspecified")

# Translate every source_type code, then list the distinct labels that came out.
dfinPOD['in_SiteTypeCV'] = dfinPOD['source_type'].map(assignSiteTypeCV)
dfinPOD['in_SiteTypeCV'].unique()

array(['well', 'stream', 'sump', 'waste water', 'reservoir', 'canal',
       'spring', 'lake', 'pond', 'winter runoff', 'ditch', 'drain',
       'sewage effluent', 'slough'], dtype=object)

In [8]:
# Changing datatype of used date fields.
# Step 1: parse the raw text; anything unparseable becomes NaT.
priorityDateParsed = pd.to_datetime(dfinPOD['priority_date'], errors = 'coerce')
# Step 2: round-trip through a month/day/year string so only the date part is kept.
dfinPOD['priority_date'] = pd.to_datetime(priorityDateParsed.dt.strftime('%m/%d/%Y'))

In [9]:
# Creating Ownername.
# Concatenating first and last name of individual.
# Determining if company is available, split string.
# combine together for output.

import re

# first & last name function
def assignownerName(colrowValue1, colrowValue2):
    """Join a first name and a last name into one string.

    Each value is stripped of surrounding whitespace; missing or blank values
    are skipped. Returns the remaining parts joined by a single space, or ''
    when neither part is usable.
    """
    nameParts = []
    for rawValue in (colrowValue1, colrowValue2):
        if pd.isnull(rawValue):
            continue
        cleaned = str(rawValue).strip()  # remove whitespace chars
        if cleaned:
            nameParts.append(cleaned)
    # Join the stripped parts. The earlier version joined the raw inputs when
    # both were present, leaking their padding into the result.
    return " ".join(nameParts)


# Business name and Concatenate
def assignownerNameORCompany(buisName, fName, lName):
    """Build the allocation owner string from a company name and a person's name.

    The person's name (first + last) comes first, followed by the cleaned
    company name when both exist. "Unspecified" is used when neither is
    available. The characters $ @ & . ; , / ) ( - are then removed and the
    result is title-cased and stripped.
    """
    # Concatenating First and Last name together.
    frilasName = assignownerName(fName, lName)

    # Clean Company Name Entry
    if buisName == "" or pd.isnull(buisName):
        outBuisString = ""
    else:
        buisName = str(buisName).strip()
        if ";" in buisName:
            # Several entries separated by ";". An entry that itself contains
            # commas has its pieces reversed (presumably "LAST, FIRST" -- confirm
            # against the data). Pieces are joined with a space so they are no
            # longer fused into a single word.
            entries = []
            for item in buisName.split(";"):
                pieces = [piece.strip() for piece in item.split(",")]
                if len(pieces) > 1:
                    pieces.reverse()
                entry = " ".join(piece for piece in pieces if piece)
                if entry:
                    entries.append(entry)
            outBuisString = ", ".join(entries)
        elif "," in buisName:
            # Keep every comma-separated piece (previously only the first two
            # survived) and leave a space after each comma, so removing the
            # punctuation below does not glue neighbouring words together.
            outBuisString = ", ".join(
                piece.strip() for piece in buisName.split(",") if piece.strip())
        else:
            outBuisString = buisName

    # Concatenating together, create outString
    if frilasName == "":
        outString = outBuisString if outBuisString != "" else "Unspecified"
    elif outBuisString == "":
        outString = frilasName
    else:
        outString = frilasName + ", " + outBuisString

    # Raw string: "\)" in a normal string literal is an invalid escape sequence.
    outString = re.sub(r"[$@&.;,/\)(-]", "", outString).title().strip()

    return outString

# Owner = person name and/or company name for every POD row; then list the distinct owners.
dfinPOD['in_AllocationOwner'] = [
    assignownerNameORCompany(company, firstName, lastName)
    for company, firstName, lastName in zip(
        dfinPOD['name_company'], dfinPOD['name_first'], dfinPOD['name_last'])
]
dfinPOD['in_AllocationOwner'].unique()

array(['Formosa Exploration Inc', 'Parkin Family Llc', 'G A Baker', ...,
       'River Rock Vineyard', 'Marjorie Baird', 'Kennerly Ranches Llc'],
      dtype=object)

In [10]:
#Determining AllocationTimeframe Start & End time for each site.

def formatDateString(inString1, inString2):
    """Return "<int(inString1)>/<int(inString2)>", or '' when either is not a whole number.

    The callers in this notebook pass (month, day), giving strings such as "12/31".
    """
    try:
        return str(int(inString1)) + '/' + str(int(inString2))
    except (TypeError, ValueError, OverflowError):
        # Missing (None / NaN), non-numeric or infinite input: no usable date.
        # Only conversion failures are caught; a bare `except:` would also
        # swallow unrelated errors such as KeyboardInterrupt.
        return ''

# "month/day" strings for the start and end of each allocation's timeframe.
dfinPOD['in_AllocationTimeframeStart'] = [
    formatDateString(month, day)
    for month, day in zip(dfinPOD['begin_month'], dfinPOD['begin_day'])
]
dfinPOD['in_AllocationTimeframeEnd'] = [
    formatDateString(month, day)
    for month, day in zip(dfinPOD['end_month'], dfinPOD['end_day'])
]

In [11]:
#Fixing Beneficial Uses PRIMARY_PURPOSE

def fixBenUse(val):
    """Rewrite the "... AND ..." beneficial-use descriptions as comma-separated lists.

    The value is converted to text and stripped; descriptions without a
    rewrite rule are returned in that stripped form.
    """
    benUseRewrites = {
        "IRRIGATION, LIVESTOCK AND DOMESTIC": "IRRIGATION, LIVESTOCK, DOMESTIC",
        "IRRIGATION AND LIVESTOCK": "IRRIGATION, LIVESTOCK",
        "LIVESTOCK AND WILDLIFE": "LIVESTOCK, WILDLIFE",
    }
    description = str(val).strip()
    return benUseRewrites.get(description, description)

# Normalise the beneficial-use descriptions in place, then list the distinct values.
dfinPOD['use_code_description'] = dfinPOD['use_code_description'].map(fixBenUse)
dfinPOD['use_code_description'].unique()

array(['MINING', 'CRANBERRY', 'DOMESTIC EXPANDED', 'IRRIGATION',
       'SUPPLEMENTAL IRRIGATION', 'DOMESTIC', 'GROUP DOMESTIC',
       'DOMESTIC INCLUDING LAWN AND GARDEN', 'AGRICULTURE USES',
       'USE WITHIN A SCHOOL', 'DOMESTIC AND LIVESTOCK',
       'IRRIGATION OF CRANBERRIES', 'TEMPERATURE CONTROL', 'NURSERY USES',
       'HUMAN CONSUMPTION', 'COMMERCIAL USES',
       'INDUSTRIAL/MANUFACTURING USES', 'GEO-THERMAL (HEATING & COOLING)',
       'SAWMILL', 'LOG DECK SPRINKLING', 'FROST PROTECTION', 'SHOP',
       'LABORATORY', 'RECREATION', 'CAMPSITE', 'SWIMMING', 'RAM',
       'POWER DEVELOPMENT', 'GEO-THERMAL(ENERGY PRODUCTION)',
       'FISH CULTURE', 'FISH AND WILDLIFE', 'AQUACULTURE', 'LIVESTOCK',
       'GREENHOUSE', 'USE IN A MINT STILL', 'LIVESTOCK, WILDLIFE',
       'DAIRY BARN USES', 'HARVESTING OF CRANBERRIES',
       'SUPPLEMENTAL FLOOD HARVESTING', 'MUNICIPAL USES',
       'QUASI-MUNICIPAL USES', 'FISHERY ENHANCEMENT (INSTREAM)',
       'MULTIPLE INSTREAM USES',
      

In [12]:
# create output POD dataframe
# Columns appear in the output in the order they are assigned below, so the
# statement order is significant. Columns set to "" are placeholders with no
# value taken from the POD input.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "ORwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "ORwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "ORwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOD['source']
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfinPOD['in_WaterSourceTypeCV']

# Site Info
df['in_CoordinateAccuracy'] = "WaDE Unspecified"
df['in_CoordinateMethodCV'] = "WaDE Unspecified"
df['in_County'] = "WaDE Unspecified"
df['in_EPSGCodeCV'] = 4326
# NOTE(review): 'in_Geometry' was already created in the WaterSource section; this re-assigns the same empty value.
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD['in_Latitude']
df['in_Longitude'] = dfinPOD['in_Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = dfinPOD['in_SiteName']
# "POD" prefix + pod_location_id as a whole number; blank or missing ids become 0.
df['in_SiteNativeID'] = "POD" + dfinPOD['pod_location_id'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfinPOD['in_SiteTypeCV']
df['in_StateCV'] = "OR"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = dfinPOD['duty']
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOD['rate_cfs']
df['in_AllocationLegalStatusCV'] = "WaDE Unspecified"
# snp_id as a whole-number string; blank or missing ids become "0".
df['in_AllocationNativeID'] =  dfinPOD['snp_id'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_AllocationOwner'] = dfinPOD['in_AllocationOwner']
df['in_AllocationPriorityDate'] = dfinPOD['priority_date']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfinPOD['in_AllocationTimeframeEnd']
df['in_AllocationTimeframeStart'] = dfinPOD['in_AllocationTimeframeStart']
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = dfinPOD['acre_feet']
df['in_BeneficialUseCategory'] = dfinPOD['use_code_description']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfinPOD['wris_link']  #for WaterAllocationNativeURL

# Work on a copy, drop fully identical rows, renumber, and blank out NaN values.
outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True).replace(np.nan, '')
print(len(outPOD))
outPOD.head()

189822


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,orD0,ORwr_M1,ORwr_V1,ORwr_O1,,,,FORMOSA 1 ADIT ...,,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,42.85581,-123.38288,,,POD,21755_1,POD6909,,well,OR,,,,,,,,,,0.04,WaDE Unspecified,21755,Formosa Exploration Inc,1989-11-21,,,,,,MINING,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
1,orD1,ORwr_M1,ORwr_V1,ORwr_O1,,,,SILVER BUTTE 1 ADIT ...,,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,42.85455,-123.38349,,,POD,21755_2,POD6910,,well,OR,,,,,,,,,,0.005,WaDE Unspecified,21755,Formosa Exploration Inc,1989-11-21,,,,,,MINING,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
2,orD10,ORwr_M1,ORwr_V1,ORwr_O1,,,,A WELL ...,,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,45.55087,-123.19385,,,POD,24155_1,POD10776,,well,OR,,,,,,,,,,0.1,WaDE Unspecified,24155,Parkin Family Llc,1997-11-14,,,,,,MINING,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
3,orD100,ORwr_M1,ORwr_V1,ORwr_O1,,,,COVE CREEK ...,,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,42.5503,-123.30492,,,POD,56343_1,POD53167,,stream,OR,,,,,,,,,,2.5,WaDE Unspecified,56343,G A Baker,1922-07-01,,,,,,MINING,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
4,orD1000,ORwr_M1,ORwr_V1,ORwr_O1,,,,A SUMP,,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,43.11618,-124.35876,,,POD,23642_4,POD9919,,sump,OR,,,,,,,,,,0.0951,WaDE Unspecified,23642,James Jackson Jackson Family Trust,1992-03-02,,,,,,CRANBERRY,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...


## Place of Use Data

In [13]:
# Read the raw place-of-use (POU) export.
inputFile = "ORwr_v_pou_public_input.zip"
dfinPOU = pd.read_csv(inputFile, encoding = "ISO-8859-1")

# WaDE UUID tracker for data assessment.
# On the first run only, tag every raw row with an ID built from its row
# position and write the tagged table back over the input archive, so later
# runs read the same IDs instead of regenerating them.
if 'WaDEUUID' not in dfinPOU:
    dfinPOU['WaDEUUID'] = "orU" + dfinPOU.index.astype(str)
    # Derive both the target archive and its CSV member name from inputFile so
    # the file that is read and the file that is rewritten can never drift apart.
    csvMemberName = os.path.splitext(inputFile)[0] + '.csv'
    dfinPOU.to_csv(inputFile, compression=dict(method='zip', archive_name=csvMemberName), index=False)

# Drop fully identical rows and renumber before the POD data is merged in.
dfinPOU = dfinPOU.drop_duplicates().reset_index(drop=True)
print(len(dfinPOU))
dfinPOU.head()

109576


Unnamed: 0,WaDEUUID,ReasonRemoved,IncompleteField,OID_,snp_id,agency,app_char,app_nbr,cert_nbr,claim_char,claim_nbr,decree_title,delta_size,feature_quality_code,last_updt_date,Latitude,Longitude,name_company,name_first,name_last,permit_char,permit_nbr,pou_display,pou_display_short,pou_use_id,priority_date,rec_creation_date,remarks,Shape_Area,Shape_Length,supplemental,technician_initials,transfer_nbr,use_category,use_code,use_code_description,wr_type,wris_acres,wris_link
0,orU0,,,1,5135,OWRD,P,81441.0,,,,,,30.0,7/22/2005 8:02,43.73686,-118.36448,DASH W BAR RANCH,TERRY,WILLIAMS,,,App: P 81441 * LV,P 81441,4124,8/7/1996 0:00,6/30/2005 0:00,PLACED USING DRG,0.0,0.00894,0,MW,,8,LV,LIVESTOCK,ST,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
1,orU1,,,2,6333,OWRD,P,82980.0,,,,,,30.0,4/11/2016 10:48,42.44149,-123.04144,,SOFIA,PARKER,,,App: P 82980 * ST,P 82980,5886,1/7/1997 0:00,4/11/2016 0:00,PLACED USING 2014 IMAGERY,0.0,0.00456,0,BRW,,M,ST,STORAGE,ST,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
2,orU10,,,11,17085,ESU,,,,,,Umatilla River,,,5/28/2003 0:00,45.63549,-118.81561,,NORMAN,OVERSTREET,,,Inchoate: T 7524 CF (REG) * I*,T 7524,7789,12/31/1894 0:00:00,5/28/2003 0:00,,0.0,0.00367,0,RL,T 7524,3,I*,"IRRIGATION, LIVESTOCK AND DOMESTIC",SW,1.29,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
3,orU100,Incomplete or bad entry for AllocationCropDuty...,"0.0,2.5,4.0",101,17652,OWRD,G,5066.0,,,,,,10.0,,44.75319,-122.89916,,DENNIS,KOENIG,G,4776.0,Inchoate: T 7449 CF (REG) * IR,T 7449,8430,12/31/1969 0:00,10/26/2007 0:00,Automapped at the center of the PLS quarter-qu...,0.0,0.00191,0,KLS,T 7449,3,IR,IRRIGATION,GW,16.2,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
4,orU1000,Incomplete or bad entry for IrrigatedAcreage,"0.0,37.49",1001,22643,OWRD,G,13276.0,,,,,,10.0,,45.45604,-122.33948,CASCADE MEADOWS NURSERY,MARK,EISENZIMMER,G,11981.0,Permit: G 11981 * AG,G 11981,12928,1/29/1993 0:00,10/26/2007 0:00,Automapped at the center of the PLS quarter-qu...,0.0,0.00064,0,KLS,,1,AG,AGRICULTURE USES,GW,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...


In [14]:
# POU data is missing key inputs, will combine with POD data to fill in missing gaps.
# The join key must be text on both sides: outPOD's in_AllocationNativeID is a string.
dfinPOU['snp_id'] = dfinPOU['snp_id'].astype(str)  #for AllocationNativeID

# Left join keeps every POU row and repeats it once per POD record sharing its
# snp_id, so the row count can grow. Columns present in both frames
# (e.g. WaDEUUID) receive the _x (POU) / _y (POD) suffixes.
dfinPOU = dfinPOU.merge(
    outPOD,
    how='left',
    left_on='snp_id',
    right_on='in_AllocationNativeID',
)
print(len(dfinPOU))
dfinPOU.head()

307014


Unnamed: 0,WaDEUUID_x,ReasonRemoved,IncompleteField,OID_,snp_id,agency,app_char,app_nbr,cert_nbr,claim_char,claim_nbr,decree_title,delta_size,feature_quality_code,last_updt_date,Latitude,Longitude,name_company,name_first,name_last,permit_char,permit_nbr,pou_display,pou_display_short,pou_use_id,priority_date,rec_creation_date,remarks,Shape_Area,Shape_Length,supplemental,technician_initials,transfer_nbr,use_category,use_code,use_code_description,wr_type,wris_acres,wris_link,WaDEUUID_y,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,orU0,,,1,5135,OWRD,P,81441.0,,,,,,30.0,7/22/2005 8:02,43.73686,-118.36448,DASH W BAR RANCH,TERRY,WILLIAMS,,,App: P 81441 * LV,P 81441,4124,8/7/1996 0:00,6/30/2005 0:00,PLACED USING DRG,0.0,0.00894,0,MW,,8,LV,LIVESTOCK,ST,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,orD118191,ORwr_M1,ORwr_V1,ORwr_O1,,,,RUNOFF ...,,Storage,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326.0,,,,43.73573,-118.36361,,,POD,5135_1,POD26615,,winter runoff,OR,,,,,,,,,,,WaDE Unspecified,5135,Terry Williams Dash W Bar Ranch,1996-08-07,,12/31,1/1,,2.4,LIVESTOCK,,,,,,0.0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
1,orU0,,,1,5135,OWRD,P,81441.0,,,,,,30.0,7/22/2005 8:02,43.73686,-118.36448,DASH W BAR RANCH,TERRY,WILLIAMS,,,App: P 81441 * LV,P 81441,4124,8/7/1996 0:00,6/30/2005 0:00,PLACED USING DRG,0.0,0.00894,0,MW,,8,LV,LIVESTOCK,ST,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,orD179500,ORwr_M1,ORwr_V1,ORwr_O1,,,,RUNOFF ...,,Storage,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326.0,,,,43.73573,-118.36361,,,POD,5135_1,POD26615,,winter runoff,OR,,,,,,,,,,,WaDE Unspecified,5135,Terry Williams Dash W Bar Ranch,1996-08-07,,12/31,1/1,,2.4,WILDLIFE,,,,,,0.0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
2,orU1,,,2,6333,OWRD,P,82980.0,,,,,,30.0,4/11/2016 10:48,42.44149,-123.04144,,SOFIA,PARKER,,,App: P 82980 * ST,P 82980,5886,1/7/1997 0:00,4/11/2016 0:00,PLACED USING 2014 IMAGERY,0.0,0.00456,0,BRW,,M,ST,STORAGE,ST,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,orD162197,ORwr_M1,ORwr_V1,ORwr_O1,,,,A SPRING,,Storage,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326.0,,,,42.44191,-123.04151,,,POD,6333_1,POD29464,,spring,OR,,,,,,,,,,,WaDE Unspecified,6333,Sofia Parker,1997-01-07,,12/31,1/1,,51000.0,STORAGE,,,,,,0.0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
3,orU10,,,11,17085,ESU,,,,,,Umatilla River,,,5/28/2003 0:00,45.63549,-118.81561,,NORMAN,OVERSTREET,,,Inchoate: T 7524 CF (REG) * I*,T 7524,7789,12/31/1894 0:00:00,5/28/2003 0:00,,0.0,0.00367,0,RL,T 7524,3,I*,"IRRIGATION, LIVESTOCK AND DOMESTIC",SW,1.29,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,orD27200,ORwr_M1,ORwr_V1,ORwr_O1,,,,MCKAY CREEK ...,,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326.0,,,,45.63552,-118.81744,,,POD,17085_1,POD148,,stream,OR,,,,,,,,,,0.009,WaDE Unspecified,17085,Norman Overstreet,1894-12-31,,12/31,1/1,,,"IRRIGATION, LIVESTOCK, DOMESTIC",,,,,,0.0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
4,orU10,,,11,17085,ESU,,,,,,Umatilla River,,,5/28/2003 0:00,45.63549,-118.81561,,NORMAN,OVERSTREET,,,Inchoate: T 7524 CF (REG) * I*,T 7524,7789,12/31/1894 0:00:00,5/28/2003 0:00,,0.0,0.00367,0,RL,T 7524,3,I*,"IRRIGATION, LIVESTOCK AND DOMESTIC",SW,1.29,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,orD27201,ORwr_M1,ORwr_V1,ORwr_O1,,,,MCKAY CREEK ...,,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326.0,,,,45.63484,-118.81744,,,POD,17085_2,POD149,,stream,OR,,,,,,,,,,0.007,WaDE Unspecified,17085,Norman Overstreet,1894-12-31,,12/31,1/1,,,"IRRIGATION, LIVESTOCK, DOMESTIC",,,,,,0.0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...


In [15]:
# create output POU dataframe
# Columns appear in the output in the order they are assigned below, so the
# statement order is significant. 'in_*' source columns were brought in from
# outPOD by the preceding merge; the other source columns are the POU file's own.
df = pd.DataFrame()

# Data Assessment UUID
# WaDEUUID_x is the POU-side ID (the merge suffixed the POD-side one with _y).
df['WaDEUUID'] = dfinPOU['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "ORwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "ORwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "ORwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOU['in_WaterSourceName']
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfinPOU['in_WaterSourceTypeCV']

# Site Info
df['in_CoordinateAccuracy'] = "WaDE Unspecified"
df['in_CoordinateMethodCV'] = "WaDE Unspecified"
df['in_County'] = "WaDE Unspecified"
df['in_EPSGCodeCV'] = 4326
# NOTE(review): 'in_Geometry' was already created in the WaterSource section; this re-assigns the same empty value.
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
# Coordinates come from the POU file's own Latitude / Longitude columns, not from the merged POD record.
df['in_Latitude'] = dfinPOU['Latitude']
df['in_Longitude'] = dfinPOU['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POU"
df['in_SiteName'] = "WaDE Unspecified"
# "POU" prefix + pou_use_id as a whole number; blank or missing ids become 0.
df['in_SiteNativeID'] = "POU" + dfinPOU['pou_use_id'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "WaDE Unspecified"
df['in_StateCV'] = "OR"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = dfinPOU['in_AllocationCropDutyAmount']
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOU['in_AllocationFlow_CFS']
df['in_AllocationLegalStatusCV'] = "WaDE Unspecified"
df['in_AllocationNativeID'] =  dfinPOU['snp_id'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_AllocationOwner'] = dfinPOU['in_AllocationOwner']
# NOTE(review): this is the POU file's raw priority_date text (the displayed output shows
# values such as "8/7/1996 0:00"), whereas the POD frame carries parsed dates -- confirm
# the two are brought to one format downstream.
df['in_AllocationPriorityDate'] = dfinPOU['priority_date']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfinPOU['in_AllocationTimeframeEnd']
df['in_AllocationTimeframeStart'] = dfinPOU['in_AllocationTimeframeStart']
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = dfinPOU['in_AllocationVolume_AF']
df['in_BeneficialUseCategory'] = dfinPOU['in_BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOU['wris_acres']
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfinPOU['in_WaterAllocationNativeURL']  #for WaterAllocationNativeURL

# Work on a copy, drop fully identical rows, renumber, and blank out NaN values.
outPOU = df.copy()
outPOU = outPOU.drop_duplicates().reset_index(drop=True).replace(np.nan, '')
print(len(outPOU))
outPOU.head()

257829


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,orU0,ORwr_M1,ORwr_V1,ORwr_O1,,,,RUNOFF ...,,Storage,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,43.73686,-118.36448,,,POU,WaDE Unspecified,POU4124,,WaDE Unspecified,OR,,,,,,,,,,,WaDE Unspecified,5135,Terry Williams Dash W Bar Ranch,8/7/1996 0:00,,12/31,1/1,,2.4,LIVESTOCK,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
1,orU0,ORwr_M1,ORwr_V1,ORwr_O1,,,,RUNOFF ...,,Storage,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,43.73686,-118.36448,,,POU,WaDE Unspecified,POU4124,,WaDE Unspecified,OR,,,,,,,,,,,WaDE Unspecified,5135,Terry Williams Dash W Bar Ranch,8/7/1996 0:00,,12/31,1/1,,2.4,WILDLIFE,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
2,orU1,ORwr_M1,ORwr_V1,ORwr_O1,,,,A SPRING,,Storage,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,42.44149,-123.04144,,,POU,WaDE Unspecified,POU5886,,WaDE Unspecified,OR,,,,,,,,,,,WaDE Unspecified,6333,Sofia Parker,1/7/1997 0:00,,12/31,1/1,,51000.0,STORAGE,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
3,orU10,ORwr_M1,ORwr_V1,ORwr_O1,,,,MCKAY CREEK ...,,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,45.63549,-118.81561,,,POU,WaDE Unspecified,POU7789,,WaDE Unspecified,OR,,,,,,,,,,0.009,WaDE Unspecified,17085,Norman Overstreet,12/31/1894 0:00:00,,12/31,1/1,,,"IRRIGATION, LIVESTOCK, DOMESTIC",,,,,,0,,1.29,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
4,orU10,ORwr_M1,ORwr_V1,ORwr_O1,,,,MCKAY CREEK ...,,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,45.63549,-118.81561,,,POU,WaDE Unspecified,POU7789,,WaDE Unspecified,OR,,,,,,,,,,0.007,WaDE Unspecified,17085,Norman Overstreet,12/31/1894 0:00:00,,12/31,1/1,,,"IRRIGATION, LIVESTOCK, DOMESTIC",,,,,,0,,1.29,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...


## Concatenate POD and POU data

In [16]:
# Concatenate dataframes
# POD rows first, then POU rows; identical rows are dropped, the index is
# renumbered, and any remaining NaN values are blanked out.
frames = [outPOD, outPOU]
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

447651


## Custom WaDE Elements due to missing info

In [17]:
# Clean owner name up
def cleanOwnerDataFunc(Val):
    """Return Val as text with the characters $ @ & . ; , / ) ( - removed, title-cased and stripped."""
    Val = str(Val)
    # Raw string: "\)" in a normal string literal is an invalid escape sequence
    # (a warning today, scheduled to become an error); the pattern is unchanged.
    Val = re.sub(r"[$@&.;,/\)(-]", "", Val).title().strip()
    return Val
# Series.map calls the cleaner once per owner value; a row-wise DataFrame.apply
# would first build a full row Series for every record just to read one column.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(cleanOwnerDataFunc)
outdf['in_AllocationOwner'].unique()

array(['Formosa Exploration Inc', 'Parkin Family Llc', 'G A Baker', ...,
       'Marjorie Baird', 'Kennerly Ranches Llc', ''], dtype=object)

In [18]:
# Fixing empty string names

def fixEmptyString(val):
    """Return "WaDE Unspecified" for missing or blank values, else val unchanged.

    A value counts as missing when it is null (per pd.isnull) or is a string
    that is empty, whitespace-only, or the text "nan" once surrounding
    whitespace is ignored. Any other value is returned as-is (not stripped).
    """
    if isinstance(val, str):
        # Strip only for the check, so blanks padded to any width are caught.
        isMissing = val.strip() in ("", "nan")
    else:
        isMissing = pd.isnull(val)
    return "WaDE Unspecified" if isMissing else val

In [19]:
# Fill blank water source names; map over the single column instead of applying row by row.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(fixEmptyString)
outdf['in_WaterSourceName'].unique()

array(['FORMOSA 1 ADIT                                    ',
       'SILVER BUTTE 1 ADIT                               ',
       'A WELL                                            ', ...,
       'WEST BRANCH OF CROCKETT BRANCH OF LITTLE WALLA WAL',
       'EAST LITTLE WALLA WALLA RIVER', 'WaDE Unspecified'], dtype=object)

In [20]:
# Fill blank water source types; map over the single column instead of applying row by row.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(fixEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water', 'Storage', 'WaDE Unspecified'],
      dtype=object)

In [21]:
# Fill blank site names; map over the single column instead of applying row by row.
outdf['in_SiteName'] = outdf['in_SiteName'].map(fixEmptyString)
outdf['in_SiteName'].unique()

array(['21755_1', '21755_2', '24155_1', ..., '193458_2', '193460_1',
       'WaDE Unspecified'], dtype=object)

In [22]:
# Fill blank site types; map over the single column instead of applying row by row.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(fixEmptyString)
outdf['in_SiteTypeCV'].unique()

array(['well', 'stream', 'sump', 'waste water', 'reservoir', 'canal',
       'spring', 'lake', 'pond', 'winter runoff', 'ditch', 'drain',
       'sewage effluent', 'slough', 'WaDE Unspecified'], dtype=object)

In [23]:
# Fill blank legal status values; map over the single column instead of applying row by row.
outdf['in_AllocationLegalStatusCV'] = outdf['in_AllocationLegalStatusCV'].map(fixEmptyString)
outdf['in_AllocationLegalStatusCV'].unique()

array(['WaDE Unspecified'], dtype=object)

In [24]:
# Fill blank owner names; map over the single column instead of applying row by row.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(fixEmptyString)
outdf['in_AllocationOwner'].unique()

array(['Formosa Exploration Inc', 'Parkin Family Llc', 'G A Baker', ...,
       'Marjorie Baird', 'Kennerly Ranches Llc', 'WaDE Unspecified'],
      dtype=object)

In [25]:
# Fill blank beneficial use categories; map over the single column instead of applying row by row.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(fixEmptyString)
outdf['in_BeneficialUseCategory'].unique()

array(['MINING', 'CRANBERRY', 'DOMESTIC EXPANDED', 'IRRIGATION',
       'SUPPLEMENTAL IRRIGATION', 'DOMESTIC', 'GROUP DOMESTIC',
       'DOMESTIC INCLUDING LAWN AND GARDEN', 'AGRICULTURE USES',
       'USE WITHIN A SCHOOL', 'DOMESTIC AND LIVESTOCK',
       'IRRIGATION OF CRANBERRIES', 'TEMPERATURE CONTROL', 'NURSERY USES',
       'HUMAN CONSUMPTION', 'COMMERCIAL USES',
       'INDUSTRIAL/MANUFACTURING USES', 'GEO-THERMAL (HEATING & COOLING)',
       'SAWMILL', 'LOG DECK SPRINKLING', 'FROST PROTECTION', 'SHOP',
       'LABORATORY', 'RECREATION', 'CAMPSITE', 'SWIMMING', 'RAM',
       'POWER DEVELOPMENT', 'GEO-THERMAL(ENERGY PRODUCTION)',
       'FISH CULTURE', 'FISH AND WILDLIFE', 'AQUACULTURE', 'LIVESTOCK',
       'GREENHOUSE', 'USE IN A MINT STILL', 'LIVESTOCK, WILDLIFE',
       'DAIRY BARN USES', 'HARVESTING OF CRANBERRIES',
       'SUPPLEMENTAL FLOOD HARVESTING', 'MUNICIPAL USES',
       'QUASI-MUNICIPAL USES', 'FISHERY ENHANCEMENT (INSTREAM)',
       'MULTIPLE INSTREAM USES',
      

In [26]:
# in_Latitude & in_Longitude: coerce to numeric; values that cannot be parsed become 0
for coordCol in ('in_Latitude', 'in_Longitude'):
    coordValues = pd.to_numeric(outdf[coordCol], errors='coerce')
    outdf[coordCol] = coordValues.fillna(0)
outdf.head(1)

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,orD0,ORwr_M1,ORwr_V1,ORwr_O1,,,,FORMOSA 1 ADIT ...,,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,42.85581,-123.38288,,,POD,21755_1,POD6909,,well,OR,,,,,,,,,,0.04,WaDE Unspecified,21755,Formosa Exploration Inc,1989-11-21 00:00:00,,,,,,MINING,,,,,,0,,,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...


In [27]:
# Update datatype of Priority Date to fit WaDE 2.0 structure
# Parse to datetime (values that cannot be parsed become NaT), then reset the
# time of day to midnight directly instead of formatting to text and re-parsing.
priorityDates = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors='coerce')
outdf['in_AllocationPriorityDate'] = priorityDates.dt.normalize()
outdf['in_AllocationPriorityDate'].unique()

array(['1989-11-21T00:00:00.000000000', '1997-11-14T00:00:00.000000000',
       '1922-07-01T00:00:00.000000000', ...,
       '2005-08-01T00:00:00.000000000', '1891-03-18T00:00:00.000000000',
       '1896-12-13T00:00:00.000000000'], dtype='datetime64[ns]')

In [28]:
# in_AllocationFlow_CFS as a number: entries that cannot be parsed are coerced to NaN, then set to 0
flowCFS = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
outdf['in_AllocationFlow_CFS'] = flowCFS.fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

array([0.04  , 0.005 , 0.1   , ..., 0.407 , 0.0207, 0.0828])

In [29]:
# in_AllocationVolume_AF as a number: entries that cannot be parsed are coerced to NaN, then set to 0
volumeAF = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
outdf['in_AllocationVolume_AF'] = volumeAF.fillna(0)
outdf['in_AllocationVolume_AF'].unique()

array([  0.  ,  22.3 ,   1.64, ...,  40.14,  96.84, 421.  ])

In [30]:
# in_AllocationCropDutyAmount as a number: entries that cannot be parsed are coerced to NaN, then set to 0
cropDuty = pd.to_numeric(outdf['in_AllocationCropDutyAmount'], errors='coerce')
outdf['in_AllocationCropDutyAmount'] = cropDuty.fillna(0)
outdf['in_AllocationCropDutyAmount'].unique()

array([0.0000e+00, 4.5000e+00, 3.0000e+00, 2.5000e+00, 4.0000e+00,
       1.0000e+00, 2.2300e+00, 5.2500e+00, 9.6000e-01, 3.9000e+00,
       3.5000e+00, 5.0000e+00, 2.3400e+00, 2.0000e+00, 6.0000e+00,
       7.1000e-01, 7.3000e-01, 1.5000e+00, 1.5300e+00, 1.3500e+00,
       1.4300e+00, 3.2500e+00, 1.7400e+00, 1.3200e+00, 2.9000e+00,
       9.0000e-01, 4.8000e+00, 2.5000e+01, 5.0000e-01, 3.2000e+00,
       1.3750e+00, 8.5500e-01, 2.4400e-01, 1.0080e+00, 2.5800e+00,
       1.8330e+00, 2.1400e-01, 6.1000e-02, 3.1000e-02, 1.7110e+00,
       1.6400e-01, 7.3300e-01, 9.2800e-01, 5.1900e-01, 8.0000e-01,
       1.9000e+00, 5.1000e-01, 2.5000e-03, 1.2000e+01, 7.5000e+00,
       4.6000e+00, 4.4000e+00, 7.0000e+00, 1.7000e-01, 3.0000e-02,
       3.6000e+00, 5.0000e-02, 1.5000e-01, 6.3000e-01, 1.2000e+00,
       2.6000e+00, 4.9000e+00, 1.5000e+01, 4.2000e+01, 4.2500e+01,
       2.2500e+01, 7.5000e-01, 4.2000e+00, 3.2900e+00, 1.6000e+00,
       1.3000e+00, 3.0000e+01, 3.5800e+00, 3.0000e-01, 7.0000e

In [31]:
# in_IrrigatedAcreage as a number: entries that cannot be parsed are coerced to NaN, then set to 0
irrigatedAcres = pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce')
outdf['in_IrrigatedAcreage'] = irrigatedAcres.fillna(0)
outdf['in_IrrigatedAcreage'].unique()

array([  0.  ,   1.29,  16.2 , ...,  69.65,  27.96, 117.17])

In [32]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "wadeID" followed by str(colrowValue)."""
    return "wadeID{}".format(str(colrowValue))

# One row per distinct (water source name, water source type) pair, keeping outdf's row labels.
# NOTE(review): pairs are compared exactly, so names that differ only by trailing padding
# get separate IDs (the displayed output shows 'A WELL' under both wadeID3 and wadeID8) —
# confirm whether names should be stripped upstream.
dfWaterSourceNativeID = pd.DataFrame({
    'in_WaterSourceName': outdf['in_WaterSourceName'],
    'in_WaterSourceTypeCV': outdf['in_WaterSourceTypeCV'],
}).drop_duplicates()

# Number the pairs 1..N in order of first appearance, then turn each number into an ID.
dftemp = pd.DataFrame({"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
                      index=dfWaterSourceNativeID.index)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
# Index the lookup table once by (name, type) so each record costs a dict
# lookup instead of a boolean-mask scan over every lookup row.
waterSourceNativeIDLookup = {}
for srcName, srcType, srcID in zip(dfWaterSourceNativeID['in_WaterSourceName'],
                                   dfWaterSourceNativeID['in_WaterSourceTypeCV'],
                                   dfWaterSourceNativeID['in_WaterSourceNativeID']):
    # setdefault keeps the first ID seen for a pair.
    waterSourceNativeIDLookup.setdefault((srcName, srcType), srcID)

def retrieveWaterSourceNativeID(A, B):
    """Return the native ID for water source name A and type B, or '' if none.

    '' is returned when both values are empty strings, when either value is
    null, or when the (A, B) pair is not in the lookup table.
    """
    # A null value never equals a table entry, so it can never produce a match.
    if (A == '' and B == '') or pd.isnull(A) or pd.isnull(B):
        return ''
    return waterSourceNativeIDLookup.get((A, B), '')

outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(srcName, srcType)
    for srcName, srcType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

array(['wadeID1', 'wadeID2', 'wadeID3', ..., 'wadeID18849', 'wadeID18850',
       'wadeID18851'], dtype=object)

## Shapefile Data
- For attaching geometry to csv inputs.

In [33]:
# PoU Shapefile Data
# Shapefile input: load the PoU shapefile with geopandas and preview the first three rows.
dfPoUshapetemp = gpd.read_file('shapefile/OR_PoU2.shp')
dfPoUshapetemp.head(3)

Unnamed: 0,pou_displa,pou_disp_1,wris_link,snp_id,pou_use_id,app_char,app_nbr,permit_cha,permit_nbr,cert_nbr,claim_char,claim_nbr,decree_tit,transfer_n,wr_type,name_last,name_first,name_compa,use_code,use_catego,use_code_d,priority_d,supplement,wris_acres,technician,agency,rec_creati,last_updt_,feature_qu,delta_size,remarks,Shape_Leng,Shape_Area,geometry
0,App: P 81441 * LV,P 81441,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,5135,4124,P,81441,,0,0,,0,,,ST,WILLIAMS,TERRY,DASH W BAR RANCH,LV,8,LIVESTOCK,1996-08-07,0,0.0,MW,OWRD,2005-06-30,2005-07-22,30,0.0,PLACED USING DRG,2654.80928,281203.13083,"POLYGON ((-118.36564 43.73765, -118.36428 43.7..."
1,App: P 82980 * ST,P 82980,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,6333,5886,P,82980,,0,0,,0,,,ST,PARKER,SOFIA,,ST,M,STORAGE,1997-01-07,0,0.0,BRW,OWRD,2016-04-11,2016-04-11,30,0.0,PLACED USING 2014 IMAGERY,1437.65417,35294.34007,"POLYGON ((-123.04147 42.44192, -123.04133 42.4..."
2,Inchoate: T 4213 CF (REG) * IR,T 4213,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,17008,7696,,0,,0,0,,0,East Mud Creek,T 4213,SW,RENCKEN,DONALD,,IR,3,IRRIGATION,1892-12-31,0,10.0,MIGRT,OWRD,1997-12-01,1997-12-01,0,0.0,0 CD 31,2426.48477,379563.11666,"POLYGON ((-118.43940 45.99213, -118.44205 45.9..."


In [34]:
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)

# Site ID is "POU" + integer pou_use_id; blank or missing ids fall back to 0.
pouUseID = dfPoUshapetemp['pou_use_id'].replace("", 0).fillna(0)
dfPoUshape['in_SiteNativeID'] = "POU" + pouUseID.astype(int).astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']

# Keep the first occurrence of each identical (site ID, geometry) row.
dfPoUshape = dfPoUshape.drop_duplicates()
dfPoUshape.head(3)

Unnamed: 0,in_SiteNativeID,geometry
0,POU4124,"POLYGON ((-118.36564 43.73765, -118.36428 43.7..."
1,POU5886,"POLYGON ((-123.04147 42.44192, -123.04133 42.4..."
2,POU7696,"POLYGON ((-118.43940 45.99213, -118.44205 45.9..."


## The Output

In [35]:
# Summarise the final frame: column names, non-null counts and dtypes.
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447651 entries, 0 to 447650
Data columns (total 63 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   WaDEUUID                                      447651 non-null  object        
 1   in_MethodUUID                                 447651 non-null  object        
 2   in_VariableSpecificUUID                       447651 non-null  object        
 3   in_OrganizationUUID                           447651 non-null  object        
 4   in_Geometry                                   447651 non-null  object        
 5   in_GNISFeatureNameCV                          447651 non-null  object        
 6   in_WaterQualityIndicatorCV                    447651 non-null  object        
 7   in_WaterSourceName                            447651 non-null  object        
 8   in_WaterSourceNativeID                        447651 n

In [36]:
# Display the final frame for a visual check (the rendered output is truncated to the first and last rows).
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,orD0,ORwr_M1,ORwr_V1,ORwr_O1,,,,FORMOSA 1 ADIT ...,wadeID1,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,42.85581,-123.38288,,,POD,21755_1,POD6909,,well,OR,,,,,,,,0.00000,,0.04000,WaDE Unspecified,21755,Formosa Exploration Inc,1989-11-21,,,,,0.00000,MINING,,,,,,0,,0.00000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
1,orD1,ORwr_M1,ORwr_V1,ORwr_O1,,,,SILVER BUTTE 1 ADIT ...,wadeID2,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,42.85455,-123.38349,,,POD,21755_2,POD6910,,well,OR,,,,,,,,0.00000,,0.00500,WaDE Unspecified,21755,Formosa Exploration Inc,1989-11-21,,,,,0.00000,MINING,,,,,,0,,0.00000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
2,orD10,ORwr_M1,ORwr_V1,ORwr_O1,,,,A WELL ...,wadeID3,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,45.55087,-123.19385,,,POD,24155_1,POD10776,,well,OR,,,,,,,,0.00000,,0.10000,WaDE Unspecified,24155,Parkin Family Llc,1997-11-14,,,,,0.00000,MINING,,,,,,0,,0.00000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
3,orD100,ORwr_M1,ORwr_V1,ORwr_O1,,,,COVE CREEK ...,wadeID4,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,42.55030,-123.30492,,,POD,56343_1,POD53167,,stream,OR,,,,,,,,0.00000,,2.50000,WaDE Unspecified,56343,G A Baker,1922-07-01,,,,,0.00000,MINING,,,,,,0,,0.00000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
4,orD1000,ORwr_M1,ORwr_V1,ORwr_O1,,,,A SUMP,wadeID5,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,43.11618,-124.35876,,,POD,23642_4,POD9919,,sump,OR,,,,,,,,0.00000,,0.09510,WaDE Unspecified,23642,James Jackson Jackson Family Trust,1992-03-02,,,,,0.00000,CRANBERRY,,,,,,0,,0.00000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447646,orU99998,ORwr_M1,ORwr_V1,ORwr_O1,,,,A WELL,wadeID8,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,45.48123,-123.06158,,,POU,WaDE Unspecified,POU257027,,WaDE Unspecified,OR,,,,,,,,2.50000,,0.39000,WaDE Unspecified,198961,Donald Jesse,1977-06-27,,,,,0.00000,SUPPLEMENTAL IRRIGATION,,,,,,0,,13.60000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
447647,orU99999,ORwr_M1,ORwr_V1,ORwr_O1,,,,A RESERVOIR,wadeID16,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,45.48310,-123.06139,,,POU,WaDE Unspecified,POU257028,,WaDE Unspecified,OR,,,,,,,,2.50000,,0.00000,WaDE Unspecified,198962,Alan Jesse,1997-04-19,,,,,61.09380,IRRIGATION,,,,,,0,,166.50000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
447648,orU99999,ORwr_M1,ORwr_V1,ORwr_O1,,,,A RESERVOIR,wadeID16,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,45.48310,-123.06139,,,POU,WaDE Unspecified,POU257028,,WaDE Unspecified,OR,,,,,,,,2.50000,,0.00000,WaDE Unspecified,198962,Alan Jesse,1997-04-19,,,,,22.45610,SUPPLEMENTAL IRRIGATION,,,,,,0,,166.50000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...
447649,orU99999,ORwr_M1,ORwr_V1,ORwr_O1,,,,A RESERVOIR,wadeID16,Surface Water,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,45.48310,-123.06139,,,POU,WaDE Unspecified,POU257028,,WaDE Unspecified,OR,,,,,,,,2.50000,,0.00000,WaDE Unspecified,198962,Alan Jesse,1997-04-19,,,,,1.79790,IRRIGATION,,,,,,0,,166.50000,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...


In [37]:
# Export the output dataframe
# Name the CSV inside each archive explicitly; without archive_name pandas derives
# the member name from the .zip path, so the member would not end in .csv.
outdf.to_csv('Pwr_orMain.zip', compression=dict(method='zip', archive_name='Pwr_orMain.csv'), index=False)  # The output, save as a zip
dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.