# Pre-processing (state / organization Name) Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one master file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/Texas/WaterAllocation_TWDB" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Texas/WaterAllocation_TWDB


## Point of Diversion Data

In [3]:
# Input File
FI_PoD = "RawInputData/TWDB_Groundwater.csv"
dfinPOD = pd.read_csv(FI_PoD, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "txD" + dfinPOD.index.astype(str)
    dfinPOD.to_csv('RawInputData/TWDB_Groundwater.zip', compression=dict(method='zip', archive_name='TWDB_Groundwater.csv'), index=False)

print(len(dfinPOD))
dfinPOD.head()

141605


Unnamed: 0,StateWellN,OwnerName,PrimaryWat,Elevation,WellDepth,WaterLevel,WaterQuali,AquiferCod,CoordDDLat,CoordDDLon,CountyName,WellType,WaDEUUID
0,657116,1. B. Dugger Estate,Irrigation,3722,275.0,,N,121OGLL - Ogallala Formation,35.11361,-101.97611,Randall,Withdrawal of Water,txD0
1,657118,L. W. Smith,Irrigation,3679,274.0,Miscellaneous Measurements,N,121OGLL - Ogallala Formation,35.085,-101.98695,Randall,Withdrawal of Water,txD1
2,657207,Amarillo Municipal Water System,Public Supply,3705,305.0,Miscellaneous Measurements,Y,121OGLL - Ogallala Formation,35.11722,-101.94889,Randall,Withdrawal of Water,txD2
3,657209,Ervin Podzemny,Irrigation,3707,285.0,Miscellaneous Measurements,N,121OGLL - Ogallala Formation,35.10556,-101.94611,Randall,Withdrawal of Water,txD3
4,657211,Caroline Bush Emery,Irrigation,3685,258.0,Miscellaneous Measurements,N,121OGLL - Ogallala Formation,35.10195,-101.95278,Randall,Withdrawal of Water,txD4


In [4]:
def extractNativeIDFunc(A):
    A = str(A).strip()
    try:
        outString = A.split('-')[1]
    except:
        outString = A
    return outString
dfinPOD['nativeName'] = dfinPOD.apply(lambda row: extractNativeIDFunc(row['AquiferCod']), axis=1)
dfinPOD['nativeName'].unique()

array([' Ogallala Formation', ' Dockum Formation', ' Quaternary Alluvium',
       ' Alluvium', 'APPL ', ' Paluxy Sand',
       ' Travis Peak Formation and Paluxy Sand', ' Midway Group',
       ' Nacatoch Sand', ' Ogallala Formation and Dockum Formation',
       ' Ochoan Series', ' Wilcox Group', ' Seymour Formation',
       ' Blossom Sand', ' Taylor Marl', ' San Angleo Formation',
       ' Fredericksburg Group', ' Alluvium and Ogallala Formation',
       ' Purgatoire and Morrison Formations',
       ' Quartermaster Formation and Whitehorse Group',
       ' Terrace Deposits', ' Canyon Group', ' Whitehorse Group',
       ' Jurassic System',
       ' Ogallala Formation, Dakota Group and Purgatoire Formation',
       ' Ogallala and Morrison Formations',
       ' Dakota Group, Purgatoire Formation and Jurassic',
       ' Aquifer Not Able to be Determined',
       ' Dakota Group and Purgatoire Formation', ' Hosston Formation',
       ' Choza Formation', ' Wichita Formation or Group',
       

In [5]:
def extractNativeIDFunc(A):
    A = str(A).strip()
    try:
        outString = A.split('-')[0]
    except:
        outString = A
    return outString
dfinPOD['nativeID'] = dfinPOD.apply(lambda row: extractNativeIDFunc(row['AquiferCod']), axis=1)
dfinPOD['nativeID'].unique()

array(['121OGLL ', '231DCKM ', '110ALVM ', '100ALVM ', 'NOT', '218PLXY ',
       '218TPPX ', '125MDWY ', '211NCTC ', '121OGDK ', '312OCHO ',
       '124WLCX ', '112SYMR ', '211BLSM ', '211TYLR ', '318SAGL ',
       '218FKBG ', '110AVOG ', '217PGTM ', '310QRMW ', '110TRRC ',
       '321CNYN ', '313WTRS ', '220JRSC ', '121OGDP ', '121OGLM ',
       '211DKPJ ', 'UNKNOWN ', '211DKOP ', '217HSTN ', '318CHOZ ',
       '318WCHT ', '112TAOG ', '110AHTP ', '121OGFG ', '218ALRS ',
       '313BLIN ', '218ASDG ', '124CPRS ', '110AVAN ', '110QRNR ',
       '218FKBT ', '218GPSH ', '218TVPK ', '212WDBN ', '218GLRS ',
       '121OGLD ', '318CLFK ', '112TEDAS ', '318PRVR ', '218TRNT ',
       '124RKCZ ', '218TWMT ', '218PRSL ', '310PRMN ', '121OGFA ',
       '100PECS ', '211EGFD ', '124WXMW ', '124QNCT ', '112SCFX ',
       '110DUNE ', '324MLWL ', '110ALTO ', '121OGLW ', '313DCKB ',
       '218FWDK ', '300PLZC ', '211GOBR ', '110AVPW ', '318MRKL ',
       '110ACPO ', '110AVPS ', '371HCKR ', '321CSCO ',

In [6]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "TWDBwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "TWDBwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "TWDBwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOD['nativeName']
df['in_WaterSourceNativeID'] = dfinPOD['nativeID']
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = "Digitized"
df['in_County'] = dfinPOD["CountyName"]
df['in_EPSGCodeCV'] = ""
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD["CoordDDLat"]
df['in_Longitude'] = dfinPOD["CoordDDLon"]
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"  # "Point of Diversion"
df['in_SiteName'] = ""
df['in_SiteNativeID'] = "POD" + dfinPOD["StateWellN"].astype(int).astype(str).str.strip()
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfinPOD["WellType"]
df['in_StateCV'] = "TX"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = ""
df['in_AllocationLegalStatusCV'] =""
df['in_AllocationNativeID'] =  dfinPOD["StateWellN"].astype(int).astype(str).str.strip()
df['in_AllocationOwner'] = dfinPOD["OwnerName"]
df['in_AllocationPriorityDate'] = ""
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = "Rules of Capture"
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfinPOD["PrimaryWat"]
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "1" # either a 1 or 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "https://www3.twdb.texas.gov/apps/waterdatainteractive//GetReports.aspx?Num=" + dfinPOD["StateWellN"].astype(int).astype(str).str.strip() + "&Type=GWDB"

outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

141605


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,txD0,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121OGLL,Groundwater,,Digitized,Randall,,,,,35.11361,-101.97611,,,POD,,POD657116,,Withdrawal of Water,TX,,,,,,,,,,,,657116,1. B. Dugger Estate,,,,,Rules of Capture,,Irrigation,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
1,txD1,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121OGLL,Groundwater,,Digitized,Randall,,,,,35.085,-101.98695,,,POD,,POD657118,,Withdrawal of Water,TX,,,,,,,,,,,,657118,L. W. Smith,,,,,Rules of Capture,,Irrigation,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
2,txD2,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121OGLL,Groundwater,,Digitized,Randall,,,,,35.11722,-101.94889,,,POD,,POD657207,,Withdrawal of Water,TX,,,,,,,,,,,,657207,Amarillo Municipal Water System,,,,,Rules of Capture,,Public Supply,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
3,txD3,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121OGLL,Groundwater,,Digitized,Randall,,,,,35.10556,-101.94611,,,POD,,POD657209,,Withdrawal of Water,TX,,,,,,,,,,,,657209,Ervin Podzemny,,,,,Rules of Capture,,Irrigation,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
4,txD4,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121OGLL,Groundwater,,Digitized,Randall,,,,,35.10195,-101.95278,,,POD,,POD657211,,Withdrawal of Water,TX,,,,,,,,,,,,657211,Caroline Bush Emery,,,,,Rules of Capture,,Irrigation,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...


## Concatenate POD Data.  Make needed changes

In [7]:
outdf = outPOD
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")

print(len(outdf))

141605


## Clean Data / data types

In [8]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [9]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Ogallala Formation', 'Dockum Formation', 'Quaternary Alluvium',
       'Alluvium', 'Appl', 'Paluxy Sand',
       'Travis Peak Formation And Paluxy Sand', 'Midway Group',
       'Nacatoch Sand', 'Ogallala Formation And Dockum Formation',
       'Ochoan Series', 'Wilcox Group', 'Seymour Formation',
       'Blossom Sand', 'Taylor Marl', 'San Angleo Formation',
       'Fredericksburg Group', 'Alluvium And Ogallala Formation',
       'Purgatoire And Morrison Formations',
       'Quartermaster Formation And Whitehorse Group', 'Terrace Deposits',
       'Canyon Group', 'Whitehorse Group', 'Jurassic System',
       'Ogallala Formation, Dakota Group And Purgatoire Formation',
       'Ogallala And Morrison Formations',
       'Dakota Group, Purgatoire Formation And Jurassic',
       'Aquifer Not Able To Be Determined',
       'Dakota Group And Purgatoire Formation', 'Hosston Formation',
       'Choza Formation', 'Wichita Formation Or Group',
       'Tahoka And Ogallala Formations',
     

In [10]:
outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceNativeID']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['121Ogll', '231Dckm', '110Alvm', '100Alvm', 'Not', '218Plxy',
       '218Tppx', '125Mdwy', '211Nctc', '121Ogdk', '312Ocho', '124Wlcx',
       '112Symr', '211Blsm', '211Tylr', '318Sagl', '218Fkbg', '110Avog',
       '217Pgtm', '310Qrmw', '110Trrc', '321Cnyn', '313Wtrs', '220Jrsc',
       '121Ogdp', '121Oglm', '211Dkpj', 'Unknown', '211Dkop', '217Hstn',
       '318Choz', '318Wcht', '112Taog', '110Ahtp', '121Ogfg', '218Alrs',
       '313Blin', '218Asdg', '124Cprs', '110Avan', '110Qrnr', '218Fkbt',
       '218Gpsh', '218Tvpk', '212Wdbn', '218Glrs', '121Ogld', '318Clfk',
       '112Tedas', '318Prvr', '218Trnt', '124Rkcz', '218Twmt', '218Prsl',
       '310Prmn', '121Ogfa', '100Pecs', '211Egfd', '124Wxmw', '124Qnct',
       '112Scfx', '110Dune', '324Mlwl', '110Alto', '121Oglw', '313Dckb',
       '218Fwdk', '300Plzc', '211Gobr', '110Avpw', '318Mrkl', '110Acpo',
       '110Avps', '371Hckr', '321Csco', '110Avcz', '110Avvl', '319Publ',
       '124Sprt', '110Ahtw', '124Rklw', '112Sycz', '31

In [11]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [12]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['1 B Dugger Estate', 'L W Smith',
       'Amarillo Municipal Water System', ..., 'Rafael Cantu',
       'Daniel Garcia', 'Carlos Alexander'], dtype=object)

In [13]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [14]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Ogallala Formation', 'Dockum Formation', 'Quaternary Alluvium',
       'Alluvium', 'Appl', 'Paluxy Sand',
       'Travis Peak Formation And Paluxy Sand', 'Midway Group',
       'Nacatoch Sand', 'Ogallala Formation And Dockum Formation',
       'Ochoan Series', 'Wilcox Group', 'Seymour Formation',
       'Blossom Sand', 'Taylor Marl', 'San Angleo Formation',
       'Fredericksburg Group', 'Alluvium And Ogallala Formation',
       'Purgatoire And Morrison Formations',
       'Quartermaster Formation And Whitehorse Group', 'Terrace Deposits',
       'Canyon Group', 'Whitehorse Group', 'Jurassic System',
       'Ogallala Formation, Dakota Group And Purgatoire Formation',
       'Ogallala And Morrison Formations',
       'Dakota Group, Purgatoire Formation And Jurassic',
       'Aquifer Not Able To Be Determined',
       'Dakota Group And Purgatoire Formation', 'Hosston Formation',
       'Choza Formation', 'Wichita Formation Or Group',
       'Tahoka And Ogallala Formations',
     

In [15]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater'], dtype=object)

In [16]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Withdrawal of Water', 'Observation', 'Spring', 'Test Hole',
       'Drain', 'Oil or Gas', 'Surface Water (not a spring)', 'Seismic',
       'Waste Disposal', 'Mine', 'Recharge', 'Other (see remarks)',
       'Geothermal', '', 'Anode'], dtype=object)

In [17]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [18]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['1 B Dugger Estate', 'L W Smith',
       'Amarillo Municipal Water System', ..., 'Rafael Cantu',
       'Daniel Garcia', 'Carlos Alexander'], dtype=object)

In [19]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['Irrigation', 'Public Supply', 'Stock', 'Domestic', 'Unused',
       'Industrial', '', 'Plugged or Destroyed', 'Recreation',
       'Commercial', 'Power', 'Institution', 'Industrial (cooling)',
       'Unknown', 'Aquaculture', 'De-watering', 'Other', 'Medicinal',
       'Monitor', 'Mining', 'Air Conditioning', 'Rig Supply', 'Fire',
       'Withdrawal of Water', 'Bottling', 'Fracking Supply', 'Test Well',
       'Desalination', 'Extraction'], dtype=object)

In [20]:
# Unique values for 'WaterSourceTypeCV'
for x in outdf['in_BeneficialUseCategory'].sort_values().unique():
    print(f'"' + x + '",')

"",
"Air Conditioning",
"Aquaculture",
"Bottling",
"Commercial",
"De-watering",
"Desalination",
"Domestic",
"Extraction",
"Fire",
"Fracking Supply",
"Industrial",
"Industrial (cooling)",
"Institution",
"Irrigation",
"Medicinal",
"Mining",
"Monitor",
"Other",
"Plugged or Destroyed",
"Power",
"Public Supply",
"Recreation",
"Rig Supply",
"Stock",
"Test Well",
"Unknown",
"Unused",
"Withdrawal of Water",


In [21]:
# Ensure Latitude entry is either numireic or blank, no 0 entries
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([35.113612, 35.085   , 35.117222, ..., 27.433334, 27.406667,
       27.309723])

In [22]:
# Ensure Longitude entry is either numireic or blank, no 0 entries
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-101.976111 , -101.986945 , -101.948889 , ...,  -99.6227778,
        -99.475    , -103.9730556])

In [23]:
# # Changing datatype of Priority Date to date fields entry
# outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
# outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf["in_AllocationPriorityDate"].dt.strftime('%m/%d/%Y'))
# outdf['in_AllocationPriorityDate'].unique()

In [24]:
# Ensure Flow entry is either numireic or blank, no 0 entries
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array([''], dtype=object)

In [25]:
# Ensure Volume entry is either numireic or blank, no 0 entries
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array([''], dtype=object)

In [26]:
# missing WaterSourceNativeID in the natve it.  Swill put in a blank value for now on missing vields.

# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B, C):
    C = str(C).strip()
    if (C != ""):
        outList = C
    else:
        colrowValue = str(A).strip() + str(B).strip()
        try:
            outList = WaterSourceNativeIDdict[colrowValue]
        except:
            outList = C
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV'], row['in_WaterSourceNativeID']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['121Ogll', '231Dckm', '110Alvm', '100Alvm', 'Not', '218Plxy',
       '218Tppx', '125Mdwy', '211Nctc', '121Ogdk', '312Ocho', '124Wlcx',
       '112Symr', '211Blsm', '211Tylr', '318Sagl', '218Fkbg', '110Avog',
       '217Pgtm', '310Qrmw', '110Trrc', '321Cnyn', '313Wtrs', '220Jrsc',
       '121Ogdp', '121Oglm', '211Dkpj', 'Unknown', '211Dkop', '217Hstn',
       '318Choz', '318Wcht', '112Taog', '110Ahtp', '121Ogfg', '218Alrs',
       '313Blin', '218Asdg', '124Cprs', '110Avan', '110Qrnr', '218Fkbt',
       '218Gpsh', '218Tvpk', '212Wdbn', '218Glrs', '121Ogld', '318Clfk',
       '112Tedas', '318Prvr', '218Trnt', '124Rkcz', '218Twmt', '218Prsl',
       '310Prmn', '121Ogfa', '100Pecs', '211Egfd', '124Wxmw', '124Qnct',
       '112Scfx', '110Dune', '324Mlwl', '110Alto', '121Oglw', '313Dckb',
       '218Fwdk', '300Plzc', '211Gobr', '110Avpw', '318Mrkl', '110Acpo',
       '110Avps', '371Hckr', '321Csco', '110Avcz', '110Avvl', '319Publ',
       '124Sprt', '110Ahtw', '124Rklw', '112Sycz', '31

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this {state name / organization}, we don't want water rights that are considered: {enter string entries here}

## Shapefile Data
- For attaching geometry to POU csv inputs.

## Export Data

In [27]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141605 entries, 0 to 141604
Data columns (total 63 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   WaDEUUID                                      141605 non-null  object 
 1   in_MethodUUID                                 141605 non-null  object 
 2   in_VariableSpecificUUID                       141605 non-null  object 
 3   in_OrganizationUUID                           141605 non-null  object 
 4   in_Geometry                                   141605 non-null  object 
 5   in_GNISFeatureNameCV                          141605 non-null  object 
 6   in_WaterQualityIndicatorCV                    141605 non-null  object 
 7   in_WaterSourceName                            141605 non-null  object 
 8   in_WaterSourceNativeID                        141605 non-null  object 
 9   in_WaterSourceTypeCV                          14

In [28]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,txD0,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121Ogll,Groundwater,,Digitized,Randall,,,,,35.11361,-101.97611,,,POD,,POD657116,,Withdrawal of Water,TX,,,,,,,,,,,,657116,1 B Dugger Estate,,,,,Rules of Capture,,Irrigation,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
1,txD1,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121Ogll,Groundwater,,Digitized,Randall,,,,,35.08500,-101.98695,,,POD,,POD657118,,Withdrawal of Water,TX,,,,,,,,,,,,657118,L W Smith,,,,,Rules of Capture,,Irrigation,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
2,txD2,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121Ogll,Groundwater,,Digitized,Randall,,,,,35.11722,-101.94889,,,POD,,POD657207,,Withdrawal of Water,TX,,,,,,,,,,,,657207,Amarillo Municipal Water System,,,,,Rules of Capture,,Public Supply,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
3,txD3,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121Ogll,Groundwater,,Digitized,Randall,,,,,35.10556,-101.94611,,,POD,,POD657209,,Withdrawal of Water,TX,,,,,,,,,,,,657209,Ervin Podzemny,,,,,Rules of Capture,,Irrigation,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
4,txD4,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Ogallala Formation,121Ogll,Groundwater,,Digitized,Randall,,,,,35.10195,-101.95278,,,POD,,POD657211,,Withdrawal of Water,TX,,,,,,,,,,,,657211,Caroline Bush Emery,,,,,Rules of Capture,,Irrigation,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141600,txD141600,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Laredo Formation,124Lrdo,Groundwater,,Digitized,Webb,,,,,27.49056,-99.42361,,,POD,,POD8537207,,Withdrawal of Water,TX,,,,,,,,,,,,8537207,Gonzalez,,,,,Rules of Capture,,,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
141601,txD141601,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Laredo Formation,124Lrdo,Groundwater,,Digitized,Webb,,,,,27.43333,-99.48250,,,POD,,POD8537405,,Withdrawal of Water,TX,,,,,,,,,,,,8537405,Rafael Cantu,,,,,Rules of Capture,,Domestic,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
141602,txD141602,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Laredo Formation,124Lrdo,Groundwater,,Digitized,Webb,,,,,27.40667,-99.47861,,,POD,,POD8537702,,Withdrawal of Water,TX,,,,,,,,,,,,8537702,Gf Link,,,,,Rules of Capture,,Stock,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...
141603,txD141603,TWDBwr_M1,TWDBwr_V1,TWDBwr_O1,,,,Laredo Formation,124Lrdo,Groundwater,,Digitized,Webb,,,,,27.45722,-99.01639,,,POD,,POD8540601,,Withdrawal of Water,TX,,,,,,,,,,,,8540601,Daniel Garcia,,,,,Rules of Capture,,Stock,,,,,,1,,,,,,,,,,https://www3.twdb.texas.gov/apps/waterdatainte...


In [29]:
# Export the output dataframe
# change output name / abbreviation to match native state provdier and wade data type 
outdf.to_csv('RawInputData/Pwr_twdbMain.zip', compression=dict(method='zip', archive_name='Pwr_twdbMain.csv'), index=False)  # The output, save as a zip