# Pre-processing Kansas Allocation data for WaDEQA upload.
Date Updated: 01/05/2020
Purpose:  To pre-process the Kansas data into one master file for simple DataFrame creation and extraction

Data comes in water quantity data (qty) and location data (wimas).  Will need to combine qty data with the wimas data via wr_ID + pdiv_id key.  See 'KS_Allocation Schema Mapping to WaDE_QA.xlsx' for additional mapping information.

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Kansas/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Data

In [3]:
# POD Data - qty_input
fileInput = "qty_input.zip"
df_qty = pd.read_csv(fileInput, compression='zip')

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_qty:
    df_qty['WaDEUUID'] = "ksqID" + df_qty.index.astype(str)
    df_qty.to_csv('qty_input.zip', compression=dict(method='zip', archive_name='qty_input.csv'), index=False)

df_qty['KeyJoin'] = df_qty['wr_id'].astype(str) + "_" + df_qty['pdiv_id'].astype(str)
print(len(df_qty))
df_qty.head(1)

45666


Unnamed: 0,WaDEUUID,ReasonRemoved,IncompleteField,wr_id,right_type,vcnty_code,wr_num,wr_qual,umw_code,source,fo_num,basin,stream,gmd,county,wrf_status,pdiv_id,twp,twp_dir,rng,rng_dir,sect,dwr_id,feet_north,feet_west,qual4,qual3,qual2,qual1,nwb,quant_id,auth_quant,add_quant,quant_unit,qstor_ind,well_kid,wimas_date,KeyJoin
0,ksqID0,Unused WaterSource Record,,36,A,,36,0,IRR,G,3,18,,,OT,NK,22899,12,S,5,W,22,1,,,,SW,NW,NW,1,36,134.0,134.0,AF,1,1043567968,12/07/2020,36_22899


In [4]:
# POD Data - qty_input
fileInput = "wimas_input.zip"
df_wimas = pd.read_csv(fileInput, compression='zip')

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_wimas:
    df_wimas['WaDEUUID'] = "kswID" + df_wimas.index.astype(str)
    df_wimas.to_csv('wimas_input.zip', compression=dict(method='zip', archive_name='wimas_input.csv'), index=False)

df_wimas['KeyJoin'] = df_wimas['wr_id'].astype(str) + "_" + df_wimas['pdiv_id'].astype(str)
print(len(df_wimas))
df_wimas.head(1)

84495


  df_wimas = pd.read_csv(fileInput, compression='zip')


Unnamed: 0,wr_id,right_type,vested_county_code,wr_num,wr_qualifier,umw_code,wrfile_active_ind,source_of_supply,current_status_code,priority_date,pdiv_id,fpdiv_active_ind,township,township_dir,range_num,range_dir,section_num,dwr_id,qual1,qual2,qual3,qual4,longitude,latitude,fpdiv_comment,feet_north,feet_west,basin_num,gmd,fo_num,county_code,stream_num,num_wells,lot_number,lot_qualifier_one,lot_qualifier_two,well_kid,wimas_date,WaDEUUID,KeyJoin
0,1,A,,1,0,IRR,0,S,FO,06-MAY-1941,66333,1,20,S,17,E,2,1,SE,SE,SW,,-95.49291,38.33276,,336.0,934.0,5,,1,AN,1804.0,,,,,,12/07/2020,kswID0,1_66333


In [5]:
#Merging dataframes into one, using left-join.
dfinPOD = pd.merge(df_qty, df_wimas, left_on='KeyJoin', right_on='KeyJoin', how='inner')
dfinPOD = dfinPOD.drop_duplicates().reset_index(drop=True)
print(len(dfinPOD))
dfinPOD.head()

47313


Unnamed: 0,WaDEUUID_x,ReasonRemoved,IncompleteField,wr_id_x,right_type_x,vcnty_code,wr_num_x,wr_qual,umw_code_x,source,fo_num_x,basin,stream,gmd_x,county,wrf_status,pdiv_id_x,twp,twp_dir,rng,rng_dir,sect,dwr_id_x,feet_north_x,feet_west_x,qual4_x,qual3_x,qual2_x,qual1_x,nwb,quant_id,auth_quant,add_quant,quant_unit,qstor_ind,well_kid_x,wimas_date_x,KeyJoin,wr_id_y,right_type_y,vested_county_code,wr_num_y,wr_qualifier,umw_code_y,wrfile_active_ind,source_of_supply,current_status_code,priority_date,pdiv_id_y,fpdiv_active_ind,township,township_dir,range_num,range_dir,section_num,dwr_id_y,qual1_y,qual2_y,qual3_y,qual4_y,longitude,latitude,fpdiv_comment,feet_north_y,feet_west_y,basin_num,gmd_y,fo_num_y,county_code,stream_num,num_wells,lot_number,lot_qualifier_one,lot_qualifier_two,well_kid_y,wimas_date_y,WaDEUUID_y
0,ksqID0,Unused WaterSource Record,,36,A,,36,0,IRR,G,3,18,,,OT,NK,22899,12,S,5,W,22,1,,,,SW,NW,NW,1.0,36,134.0,134.0,AF,1,1043567968.0,12/07/2020,36_22899,36,A,,36,0,IRR,1,G,NK,07-JAN-1943,22899,1,12,S,5,W,22,1,NW,NW,SW,,-97.87295,38.9982,,,,18,,3,OT,,1.0,,,,1043567968.0,12/07/2020,kswID5
1,ksqID1,Unused WaterSource Record,,37,A,,37,0,IND,S,1,8,8.0,,JO,NK,41446,12,S,22,E,20,2,2900.0,2800.0,,,,,,101838,42522.503,42522.503,AF,1,,12/07/2020,37_41446,37,A,,37,0,IND,1,S,NK,09-JAN-1943,41446,1,12,S,22,E,20,2,,,,,-94.99305,38.99368,,2900.0,2800.0,8,,1,JO,8.0,,3.0,,,,12/07/2020,kswID6
2,ksqID10,Unused WaterSource Record,,73,A,,72,0,IRR,S,3,26,26.0,,OB,NK,7810,6,S,11,W,28,1,60.0,5200.0,,SW,SW,SW,,73,200.0,200.0,AF,1,,12/07/2020,73_7810,73,A,,72,0,IRR,1,S,NK,11-JAN-1946,7810,1,6,S,11,W,28,1,SW,SW,SW,,-98.56293,39.49494,,60.0,5200.0,26,,3,OB,26.0,,,,,,12/07/2020,kswID43
3,ksqID100,,,228,A,,226,0,IRR,G,4,33,,,HM,NK,63758,24,S,41,W,3,8,3380.0,5025.0,,SW,SW,NW,1.0,230,216.0,216.0,AF,1,1042095136.0,12/07/2020,228_63758,228,A,,226,0,IRR,1,G,NK,13-JUN-1947,63758,1,24,S,41,W,3,8,NW,SW,SW,,-101.816,37.99599,,3380.0,5025.0,33,,4,HM,,1.0,,,,1042095136.0,12/07/2020,kswID198
4,ksqID1000,,,1911,A,,1868,0,IRR,S,2,33,810.0,,FO,NK,31840,28,S,22,W,6,4,,,,SW,NW,SW,,1949,150.0,150.0,AF,1,,12/07/2020,1911_31840,1911,A,,1868,0,IRR,1,S,NK,01-OCT-1953,31840,1,28,S,22,W,6,4,SW,NW,SW,,-99.77663,37.63564,,,,33,,2,FO,810.0,,,,,,12/07/2020,kswID2080


In [6]:
#Right Type Code
rightTypeDict = {
"A" : "Appropriation",
"B" : "Basin Term",
"D" : "Domestic",
"P" : "Temporary",
"T" : "Term",
"V" : "Vested"
}

def retrieveRightType(colrowValue):
    if colrowValue == "" or pd.isnull(colrowValue):
        outList = "WaDE Blank"
    else:
        String1 = str(colrowValue).strip()
        try:
            outList = rightTypeDict[String1]
        except:
            outList = "WaDE Blank"
    return outList

dfinPOD['in_AllocationTypeCV'] = dfinPOD.apply(lambda row: retrieveRightType(row['right_type_x']), axis=1)
dfinPOD['in_AllocationTypeCV'].unique()

array(['Appropriation', 'Vested'], dtype=object)

In [7]:
#BenUse Code
useTypeDict = {
"ART" : "Artificial Recharge",
"CON" : "Contamination Remediation",
"DEW" : "Dewatering",
"DOM" : "Domestic",
"FPR" : "Fire Protection",
"HYD" : "Hydraulic Dredging",
"IND" : "Industrial",
"IRR" : "Irrigation",
"MUN" : "Municipal",
"REC" : "Recreation",
"SED" : "Sediment Storage",
"STK" : "Stockwater",
"THX" : "Thermal Exchange",
"WTR" : "Water Power"
}

def retrieveUseType(colrowValue):
    if colrowValue == "" or pd.isnull(colrowValue):
        outList = "WaDE Blank"
    else:
        String1 = str(colrowValue).strip()
        try:
            outList = useTypeDict[String1]
        except:
            outList = "WaDE Blank"
    return outList

dfinPOD['in_BeneficialUseCategory'] = dfinPOD.apply(lambda row: retrieveUseType(row['umw_code_x']), axis=1)
dfinPOD['in_BeneficialUseCategory'].unique()

array(['Irrigation', 'Industrial', 'Municipal', 'Stockwater',
       'Recreation', 'Domestic'], dtype=object)

In [8]:
#Watersource Type Code
wsTypeDict = {
"S" : "Surface Water",
"G" : "Groundwater"}

def retrieveWSType(colrowValue):
    if colrowValue == "" or pd.isnull(colrowValue):
        outList = "WaDE Blank"
    else:
        String1 = str(colrowValue).strip()
        try:
            outList = wsTypeDict[String1]
        except:
            outList = "WaDE Blank"
    return outList

dfinPOD['in_WatersourceType'] = dfinPOD.apply(lambda row: retrieveWSType(row['source']), axis=1)
dfinPOD['in_WatersourceType'].unique()

array(['Groundwater', 'Surface Water'], dtype=object)

In [9]:
#Status Type Code
statusTypeDict = {
"AA" : "Vested Active",
"AM" : "Dismissed After Vested",
"AY" : "Pending Initial Review",
"FO" : "Dismissed Prior to Approval",
"GA" : "Denied Prior to approval",
"GM" : "Reinstated Prior to Approval",
"GY" : "Approved Pending Completion",
"HK" : "Extended Time to Complete",
"HW" : "Dismissed Pending Completion",
"II" : "Reinstated Pending Completion",
"IU" : "Partial Completion",
"JG" : "Partial Completion Extended Time to Complete",
"JM" : "Inspected Prior to Completion",
"KE" : "Completed Pending Inspection",
"KK" : "Completed Extended Time to Perfect",
"KQ" : "Dismissed Pending Inspection",
"LC" : "Reinstated Pending inspection",
"LG" : "Completed Partial inspection",
"LK" : "Partial Inspection Extended Time to Perfect",
"LO" : "Inspected Pending Perfection",
"LR" : "Inspected Pending Perfection Extended Time to Perfect",
"LU" : "Dismissed Pending Perfection",
"LZ" : "Reinstated Pending Perfection",
"MM" : "Proposed Certificate",
"MR" : "Proposed Certificate Extended Time to Perfect",
"NK" : "Certificated Issued",
"NQ" : "Dismissed After Certificated Issued",
"NT" : "Reinstated After Certificate Issued",
"NV" : "Reinstated After Vested"
}

def retrieveStatusType(colrowValue):
    if colrowValue == "" or pd.isnull(colrowValue):
        outList = "WaDE Unspecified"
    else:
        String1 = str(colrowValue).strip()
        try:
            outList = statusTypeDict[String1]
        except:
            outList = "WaDE Unspecified"
    return outList

dfinPOD['in_AllocationLegalStatusCV'] = dfinPOD.apply(lambda row: retrieveStatusType(row['wrf_status']), axis=1)
dfinPOD['in_AllocationLegalStatusCV'].unique()

array(['Certificated Issued', 'Reinstated After Certificate Issued',
       'Inspected Pending Perfection Extended Time to Perfect',
       'Proposed Certificate', 'Completed Extended Time to Perfect',
       'Inspected Pending Perfection',
       'Proposed Certificate Extended Time to Perfect', 'Vested Active',
       'Reinstated After Vested', 'Completed Pending Inspection',
       'Reinstated Pending Perfection', 'Extended Time to Complete',
       'Approved Pending Completion',
       'Partial Inspection Extended Time to Perfect',
       'Completed Partial inspection', 'WaDE Unspecified',
       'Partial Completion Extended Time to Complete',
       'Inspected Prior to Completion'], dtype=object)

In [10]:
#Basin Code
basinDict = {
"1" : "Missouri River",
"2" : "S F Big Nemaha River",
"3" : "Marais Des Cygnes River",
"4" : "Sugar Creek",
"5" : "Pottawatomie Creek",
"6" : "Little Osage River",
"7" : "Marmaton River",
"8" : "Kansas River",
"9" : "Stranger Creek",
"10" : "Wakarusa River",
"11" : "Delaware River",
"12" : "Vermillion Creek",
"13" : "Big Blue River",
"14" : "Black Vermillion River",
"15" : "Little Blue River",
"16" : "Mill Creek",
"17" : "Smoky Hill River",
"18" : "Saline River",
"19" : "Big Creek",
"20" : "Hackberry Creek",
"21" : "Ladder Creek",
"22" : "N F Smoky Hill River",
"23" : "Solomon River",
"24" : "Salt Creek",
"25" : "S F Solomon River",
"26" : "N F Solomon River",
"27" : "Republican River",
"28" : "Prairie Dog Creek",
"29" : "Sappa Creek",
"30" : "Beaver Creek",
"31" : "S F Republican River",
"32" : "Arikaree River",
"33" : "Arkansas River",
"34" : "Neosho River",
"35" : "Spring River",
"36" : "Cottonwood River",
"37" : "Verdigris River",
"38" : "Caney River",
"39" : "Elk River",
"40" : "Fall River",
"41" : "Cimarron River",
"42" : "Bluff Creek (cimarron)",
"43" : "Crooked Creek",
"44" : "N F Cimarron River",
"45" : "Bear Creek",
"46" : "Salt Fork Arkansas River",
"47" : "Medicine Lodge River",
"48" : "Chikaskia River",
"49" : "Bluff Creek (chikaskia)",
"50" : "Sandy Creek",
"51" : "Walnut River",
"52" : "Ninnescah River",
"53" : "N F Ninnescah River",
"54" : "S F Ninnescah River",
"55" : "Little Arkansas River",
"56" : "Cow Creek",
"57" : "Rattlesnake Creek",
"58" : "Walnut Creek",
"59" : "Pawnee River",
"60" : "Buckner Creek",
"61" : "Whitewoman Creek",
"62" : "Driftwood Creek"
}

def retrieveBasin(colrowValue):
    if colrowValue == "" or pd.isnull(colrowValue):
        outList = "WaDE Unspecified"
    else:
        String1 = str(colrowValue).strip()
        try:
            outList = basinDict[String1]
        except:
            outList = "WaDE Unspecified"
    return outList

dfinPOD['in_WaterSourceName'] = dfinPOD.apply(lambda row: retrieveBasin(row['basin']), axis=1)
dfinPOD['in_WaterSourceName'].unique()

array(['Saline River', 'Kansas River', 'N F Solomon River',
       'Arkansas River', 'Cow Creek', 'Ladder Creek', 'Whitewoman Creek',
       'S F Republican River', 'Cimarron River', 'Bear Creek',
       'Crooked Creek', 'Pawnee River', 'Republican River',
       'Buckner Creek', 'Walnut Creek', 'Rattlesnake Creek',
       'Little Arkansas River', 'Little Blue River', 'Smoky Hill River',
       'S F Big Nemaha River', 'S F Solomon River', 'N F Cimarron River',
       'Beaver Creek', 'S F Ninnescah River', 'Hackberry Creek',
       'Prairie Dog Creek', 'Neosho River', 'Sappa Creek',
       'N F Smoky Hill River', 'N F Ninnescah River', 'Big Creek',
       'Medicine Lodge River', 'Vermillion Creek', 'Salt Creek',
       'Solomon River', 'Marais Des Cygnes River',
       'Bluff Creek (cimarron)', 'Caney River', 'Delaware River',
       'Salt Fork Arkansas River', 'Chikaskia River', 'Ninnescah River',
       'Verdigris River', 'Cottonwood River', 'Pottawatomie Creek',
       'Walnut River'

In [11]:
#County Code
countyDict = {
"AL" : "Allen",
"AN" : "Anderson",
"AT" : "Atchison",
"BA" : "Barber",
"BT" : "Barton",
"BB" : "Bourbon",
"BR" : "Brown",
"BU" : "Butler",
"CS" : "Chase",
"CQ" : "Chautauqua",
"CK" : "Cherokee",
"CN" : "Cheyenne",
"CA" : "Clark",
"CY" : "Clay",
"CD" : "Cloud",
"CF" : "Coffey",
"CM" : "Comanche",
"CL" : "Cowley",
"CR" : "Crawford",
"DC" : "Decatur",
"DK" : "Dickinson",
"DP" : "Doniphan",
"DG" : "Douglas",
"ED" : "Edwards",
"EK" : "Elk",
"EL" : "Ellis",
"EW" : "Ellsworth",
"FI" : "Finney",
"FO" : "Ford",
"FR" : "Franklin",
"GE" : "Geary",
"GO" : "Gove",
"GH" : "Graham",
"GT" : "Grant",
"GY" : "Gray",
"GL" : "Greeley",
"GW" : "Greenwood",
"HM" : "Hamilton",
"HP" : "Harper",
"HV" : "Harvey",
"HS" : "Haskell",
"HG" : "Hodgeman",
"JA" : "Jackson",
"JF" : "Jefferson",
"JW" : "Jewell",
"JO" : "Johnson",
"KE" : "Kearny",
"KM" : "Kingman",
"KW" : "Kiowa",
"LB" : "Labette",
"LE" : "Lane",
"LV" : "Leavenworth",
"LC" : "Lincoln",
"LN" : "Linn",
"LG" : "Logan",
"LY" : "Lyon",
"MN" : "Marion",
"MS" : "Marshall",
"MP" : "McPherson",
"ME" : "Meade",
"MI" : "Miami",
"MC" : "Mitchell",
"MG" : "Montgomery",
"MR" : "Morris",
"MT" : "Morton",
"NM" : "Nemaha",
"NO" : "Neosho",
"NS" : "Ness",
"NT" : "Norton",
"OS" : "Osage",
"OB" : "Osborne",
"OT" : "Ottawa",
"PN" : "Pawnee",
"PL" : "Phillips",
"PT" : "Pottawatomie",
"PR" : "Pratt",
"RA" : "Rawlins",
"RN" : "Reno",
"RP" : "Republic",
"RC" : "Rice",
"RL" : "Riley",
"RO" : "Rooks",
"RH" : "Rush",
"RS" : "Russell",
"SA" : "Saline",
"SC" : "Scott",
"SG" : "Sedgwick",
"SW" : "Seward",
"SN" : "Shawnee",
"SD" : "Sheridan",
"SH" : "Sherman",
"SM" : "Smith",
"SF" : "Stafford",
"ST" : "Stanton",
"SV" : "Stevens",
"SU" : "Sumner",
"TH" : "Thomas",
"TR" : "Trego",
"WB" : "Wabaunsee",
"WA" : "Wallace",
"WS" : "Washington",
"WH" : "Wichita",
"WL" : "Wilson",
"WO" : "Woodson",
"WY" : "Wyandotte"}

def retrieveCounty(colrowValue):
    if colrowValue == "" or pd.isnull(colrowValue):
        outList = "WaDE Unspecified"
    else:
        String1 = str(colrowValue).strip()
        try:
            outList = countyDict[String1]
        except:
            outList = "WaDE Unspecified"
    return outList

dfinPOD['in_County'] = dfinPOD.apply(lambda row: retrieveCounty(row['county']), axis=1)
dfinPOD['in_County'].unique()

array(['Ottawa', 'Johnson', 'Osborne', 'Hamilton', 'Ford', 'Rice',
       'Wichita', 'Wallace', 'Cheyenne', 'Seward', 'Haskell', 'Finney',
       'Morton', 'Shawnee', 'Cloud', 'Scott', 'Gray', 'Lane', 'Sedgwick',
       'Stevens', 'Edwards', 'Republic', 'Greeley', 'Kearny', 'Graham',
       'Nemaha', 'Thomas', 'Saline', 'Grant', 'Stanton', 'Rush', 'Pratt',
       'Meade', 'Phillips', 'Sherman', 'Sheridan', 'Kiowa', 'McPherson',
       'Morris', 'Reno', 'Stafford', 'Kingman', 'Ellis', 'Rawlins',
       'Clark', 'Pawnee', 'Logan', 'Barton', 'Hodgeman', 'Gove', 'Barber',
       'Smith', 'Geary', 'Ellsworth', 'Norton', 'Marshall', 'Jewell',
       'Ness', 'Clay', 'Russell', 'Pottawatomie', 'Douglas', 'Mitchell',
       'Lincoln', 'Lyon', 'Osage', 'Comanche', 'Trego', 'Coffey',
       'Decatur', 'Cowley', 'Jefferson', 'Harvey', 'Dickinson', 'Harper',
       'Washington', 'Linn', 'Wilson', 'Marion', 'Allen', 'Franklin',
       'Greenwood', 'Butler', 'Leavenworth', 'Sumner', 'Neosho', 'Riley'

In [12]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "KSwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "KSwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "KSwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOD['in_WaterSourceName']
df['in_WaterSourceNativeID'] = dfinPOD['basin'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_WaterSourceTypeCV'] = dfinPOD['in_WatersourceType']

# Site Info
df['in_CoordinateAccuracy'] = "WaDE Unspecified"
df['in_CoordinateMethodCV'] = "WaDE Unspecified"
df['in_County'] = dfinPOD['in_County']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD['latitude']
df['in_Longitude'] = dfinPOD['longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = "WaDE Unspecified"
df['in_SiteNativeID'] = "POD" + dfinPOD['pdiv_id_y'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "WaDE Unspecified"
df['in_StateCV'] = "KS"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = ""
df['in_AllocationLegalStatusCV'] = dfinPOD['in_AllocationLegalStatusCV']
df['in_AllocationNativeID'] =  dfinPOD['wr_id_x'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = "WaDE Unspecified"
df['in_AllocationPriorityDate'] = dfinPOD['priority_date']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = dfinPOD['in_AllocationTypeCV']
df['in_AllocationVolume_AF'] = dfinPOD['auth_quant']
df['in_BeneficialUseCategory'] = dfinPOD['in_BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "http://geohydro.kgs.ku.edu/geohydro/wimas/water_right_list_direct.cfm?wr_id=" + dfinPOD['wr_id_x'].replace("", 0).fillna(0).astype(int).astype(str)

outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

45666


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,ksqID0,KSwr_M1,KSwr_V1,KSwr_O1,,,,Saline River,18,Groundwater,WaDE Unspecified,WaDE Unspecified,Ottawa,4326,,,,38.9982,-97.87295,,,POD,WaDE Unspecified,POD22899,,WaDE Unspecified,KS,,,,,,,,,,,Certificated Issued,36,WaDE Unspecified,07-JAN-1943,,,,Appropriation,134.0,Irrigation,,,,,,0,,,,,,,,,,http://geohydro.kgs.ku.edu/geohydro/wimas/wate...
1,ksqID1,KSwr_M1,KSwr_V1,KSwr_O1,,,,Kansas River,8,Surface Water,WaDE Unspecified,WaDE Unspecified,Johnson,4326,,,,38.99368,-94.99305,,,POD,WaDE Unspecified,POD41446,,WaDE Unspecified,KS,,,,,,,,,,,Certificated Issued,37,WaDE Unspecified,09-JAN-1943,,,,Appropriation,42522.503,Industrial,,,,,,0,,,,,,,,,,http://geohydro.kgs.ku.edu/geohydro/wimas/wate...
2,ksqID10,KSwr_M1,KSwr_V1,KSwr_O1,,,,N F Solomon River,26,Surface Water,WaDE Unspecified,WaDE Unspecified,Osborne,4326,,,,39.49494,-98.56293,,,POD,WaDE Unspecified,POD7810,,WaDE Unspecified,KS,,,,,,,,,,,Certificated Issued,73,WaDE Unspecified,11-JAN-1946,,,,Appropriation,200.0,Irrigation,,,,,,0,,,,,,,,,,http://geohydro.kgs.ku.edu/geohydro/wimas/wate...
3,ksqID100,KSwr_M1,KSwr_V1,KSwr_O1,,,,Arkansas River,33,Groundwater,WaDE Unspecified,WaDE Unspecified,Hamilton,4326,,,,37.99599,-101.816,,,POD,WaDE Unspecified,POD63758,,WaDE Unspecified,KS,,,,,,,,,,,Certificated Issued,228,WaDE Unspecified,13-JUN-1947,,,,Appropriation,216.0,Irrigation,,,,,,0,,,,,,,,,,http://geohydro.kgs.ku.edu/geohydro/wimas/wate...
4,ksqID1000,KSwr_M1,KSwr_V1,KSwr_O1,,,,Arkansas River,33,Surface Water,WaDE Unspecified,WaDE Unspecified,Ford,4326,,,,37.63564,-99.77663,,,POD,WaDE Unspecified,POD31840,,WaDE Unspecified,KS,,,,,,,,,,,Certificated Issued,1911,WaDE Unspecified,01-OCT-1953,,,,Appropriation,150.0,Irrigation,,,,,,0,,,,,,,,,,http://geohydro.kgs.ku.edu/geohydro/wimas/wate...


## Concatenate and Clean Data

In [13]:
# Concatenate dataframes
frames = [outPOD]
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

45666


In [14]:
# Fixing empty string names

def fixEmptyString(val):
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = "WaDE Unspecified"
    else:
        outString = val
    return outString

In [15]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: fixEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Saline River', 'Kansas River', 'N F Solomon River',
       'Arkansas River', 'Cow Creek', 'Ladder Creek', 'Whitewoman Creek',
       'S F Republican River', 'Cimarron River', 'Bear Creek',
       'Crooked Creek', 'Pawnee River', 'Republican River',
       'Buckner Creek', 'Walnut Creek', 'Rattlesnake Creek',
       'Little Arkansas River', 'Little Blue River', 'Smoky Hill River',
       'S F Big Nemaha River', 'S F Solomon River', 'N F Cimarron River',
       'Beaver Creek', 'S F Ninnescah River', 'Hackberry Creek',
       'Prairie Dog Creek', 'Neosho River', 'Sappa Creek',
       'N F Smoky Hill River', 'N F Ninnescah River', 'Big Creek',
       'Medicine Lodge River', 'Vermillion Creek', 'Salt Creek',
       'Solomon River', 'Marais Des Cygnes River',
       'Bluff Creek (cimarron)', 'Caney River', 'Delaware River',
       'Salt Fork Arkansas River', 'Chikaskia River', 'Ninnescah River',
       'Verdigris River', 'Cottonwood River', 'Pottawatomie Creek',
       'Walnut River'

In [16]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water'], dtype=object)

In [17]:
outdf['in_County'] = outdf.apply(lambda row: fixEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Ottawa', 'Johnson', 'Osborne', 'Hamilton', 'Ford', 'Rice',
       'Wichita', 'Wallace', 'Cheyenne', 'Seward', 'Haskell', 'Finney',
       'Morton', 'Shawnee', 'Cloud', 'Scott', 'Gray', 'Lane', 'Sedgwick',
       'Stevens', 'Edwards', 'Republic', 'Greeley', 'Kearny', 'Graham',
       'Nemaha', 'Thomas', 'Saline', 'Grant', 'Stanton', 'Rush', 'Pratt',
       'Meade', 'Phillips', 'Sherman', 'Sheridan', 'Kiowa', 'McPherson',
       'Morris', 'Reno', 'Stafford', 'Kingman', 'Ellis', 'Rawlins',
       'Clark', 'Pawnee', 'Logan', 'Barton', 'Hodgeman', 'Gove', 'Barber',
       'Smith', 'Geary', 'Ellsworth', 'Norton', 'Marshall', 'Jewell',
       'Ness', 'Clay', 'Russell', 'Pottawatomie', 'Douglas', 'Mitchell',
       'Lincoln', 'Lyon', 'Osage', 'Comanche', 'Trego', 'Coffey',
       'Decatur', 'Cowley', 'Jefferson', 'Harvey', 'Dickinson', 'Harper',
       'Washington', 'Linn', 'Wilson', 'Marion', 'Allen', 'Franklin',
       'Greenwood', 'Butler', 'Leavenworth', 'Sumner', 'Neosho', 'Riley'

In [18]:
outdf['in_AllocationLegalStatusCV'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
outdf['in_AllocationLegalStatusCV'].unique()

array(['Certificated Issued', 'Reinstated After Certificate Issued',
       'Inspected Pending Perfection Extended Time to Perfect',
       'Proposed Certificate', 'Completed Extended Time to Perfect',
       'Inspected Pending Perfection',
       'Proposed Certificate Extended Time to Perfect', 'Vested Active',
       'Reinstated After Vested', 'Completed Pending Inspection',
       'Reinstated Pending Perfection', 'Extended Time to Complete',
       'Approved Pending Completion',
       'Partial Inspection Extended Time to Perfect',
       'Completed Partial inspection', 'WaDE Unspecified',
       'Partial Completion Extended Time to Complete',
       'Inspected Prior to Completion'], dtype=object)

In [19]:
outdf['in_AllocationTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationTypeCV']), axis=1)
outdf['in_AllocationTypeCV'].unique()

array(['Appropriation', 'Vested'], dtype=object)

In [20]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: fixEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['Irrigation', 'Industrial', 'Municipal', 'Stockwater',
       'Recreation', 'Domestic'], dtype=object)

In [21]:
# in_Latitude & in_Longitude
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').fillna(0)
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').fillna(0)
outdf.head(1)

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,ksqID0,KSwr_M1,KSwr_V1,KSwr_O1,,,,Saline River,18,Groundwater,WaDE Unspecified,WaDE Unspecified,Ottawa,4326,,,,38.9982,-97.87295,,,POD,WaDE Unspecified,POD22899,,WaDE Unspecified,KS,,,,,,,,,,,Certificated Issued,36,WaDE Unspecified,07-JAN-1943,,,,Appropriation,134.0,Irrigation,,,,,,0,,,,,,,,,,http://geohydro.kgs.ku.edu/geohydro/wimas/wate...


In [22]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

array(['1943-01-07T00:00:00.000000000', '1943-01-09T00:00:00.000000000',
       '1946-01-11T00:00:00.000000000', ...,
       '1967-07-19T00:00:00.000000000', '1967-07-20T00:00:00.000000000',
       '1967-07-21T00:00:00.000000000'], dtype='datetime64[ns]')

In [23]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

array([0.])

In [24]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').fillna(0)
outdf['in_AllocationVolume_AF'].unique()

array([1.3400000e+02, 4.2522503e+04, 2.0000000e+02, ..., 2.6700000e-01,
       2.5198600e+02, 1.2719900e+02])

## Review and Export

In [25]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outdf.dtypes)

WaDEUUID                                                object
in_MethodUUID                                           object
in_VariableSpecificUUID                                 object
in_OrganizationUUID                                     object
in_Geometry                                             object
in_GNISFeatureNameCV                                    object
in_WaterQualityIndicatorCV                              object
in_WaterSourceName                                      object
in_WaterSourceNativeID                                  object
in_WaterSourceTypeCV                                    object
in_CoordinateAccuracy                                   object
in_CoordinateMethodCV                                   object
in_County                                               object
in_EPSGCodeCV                                            int64
in_GNISCodeCV                                           object
in_HUC12                                               

In [26]:
# Export the output dataframe
outdf.to_csv('Pwr_ksMain.zip', index=False, compression="zip")  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.