# Pre-processing North Dakota for Site-Specific Division & Withdrawl Site data for WaDE upload
- Purpose:  To pre-process the data into one master file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# Working Directory

# set working directory, if need be
workingDir = "RawInputData"
os.chdir(workingDir)
print(os.getcwd())

C:\Users\rjame\Documents\WSWC Documents\MappingStatesDataToWaDE2.0\NorthDakota\SS_DiversionsWithdrawalsWaterUse\RawInputData


## Inputs and Dataframe Creation
- Water_Use timeseries data
- POD site data
- Permit header data (to bridge timeseries and pod site)

In [3]:
# Input File - Timeseries water use data
fileInput = "Water_Use.zip"
dfin1 = pd.read_csv(fileInput).replace(np.nan, "")

dfin1['Permit_Index'] = dfin1['Permit_Index'].replace("", 0).astype('Int64').astype('str')
dfin1['POD_Index'] = dfin1['POD_Index'].replace("", 0).astype('Int64').astype('str')
dfin1['Use_Year'] = dfin1['Use_Year'].replace("", 0).astype('Int64').astype('str')

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv('Water_Use.zip', compression=dict(method='zip', archive_name='Water_Use.csv'), index=False)

print(len(dfin1))
dfin1.head()

  dfin1 = pd.read_csv(fileInput).replace(np.nan, "")


206166


Unnamed: 0,Permit_Index,POD_Index,Use_Year,Nature_Of_Data,Reported_AcFt,Reported_Acres,Reported_Rate,KWHrs,KWH_Demand,Pump_HP,Begin_Meter,End_Meter,Meter_Units,Comments,NonConsumptive_Use,Crop_type1,Crop_Type2,Reported_Inches,Water_Use_Index,Use_Type,WaDEUUID
0,344,484,1991,,51.9,0.0,900.0,0.0,0.0,70.0,0.0,0.0,Hours,"16,902,000 gallons of water reported used.",0.0,Corn,,,0.0,Irrigation,ndWU0
1,4413,7576,1991,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,,,1.0,Fish and Wildlife,ndWU1
2,4413,7576,1992,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,,,2.0,Fish and Wildlife,ndWU2
3,344,484,1992,,20.3,106.0,1000.0,0.0,0.0,82.0,1915.0,2025.0,Hours,,0.0,,,,3.0,Irrigation,ndWU3
4,2,2,1976,No Form Received,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,,,4.0,Irrigation,ndWU4


In [4]:
# Input File - POD site data
fileInput = "POD.zip"
dfin2 = pd.read_csv(fileInput).replace(np.nan, "")

dfin2['Permit_Index'] = dfin2['Permit_Index'].replace("", 0).astype('Int64').astype('str')
dfin2['POD_Index'] = dfin2['POD_Index'].replace("", 0).astype('Int64').astype('str')

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin2:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv('POD.zip', compression=dict(method='zip', archive_name='POD.csv'), index=False)

print(len(dfin2))
dfin2.head()

5475


Unnamed: 0,Permit_Index,POD_Index,POD_Status,POD,Longitude,Latitude,X_Coord,Y_Coord,Beneficial_Use,County,Column1,_1,Aquifer,SubAquifer,Req_AcFt,Req_Acre,Req_Rate,Req_Storage,App_AcFt,App_Acre,App_Rate,App_Storage,Source,Irrigation_Type,Source_Name,MainStem_Name,Impound_Location,Impound_Name,Return_Dest,Discharge_Locat,Prop_Owner,Dest_Prop_Owner,Period_Start,Period_End,Return_Quantity,Held_AcFt,Held_Acre,Held_Rate,Held_Storage,Longitude.1,Latitude.1,HU_Basin,HU_Sub_Basin,HU_Watershed,HU_Sub_Watershed,Civil_Township,NonConsumptive_ReqAcFt,NonConsumptive_AppAcFt,NonConsumptive_HeldAcFt,WaDEUUID
0,967,1578,Active,12910124,-103.25506,-103.25506,1268199,127061,07/01/75,Bowman,,,,,3000.0,0.0,0.0,26000.0,3000.0,0.0,0.0,26000.0,Surface Water,,North Fork Grand River and Spring Creek,,,,,,,,January 1,December 31,0,0,0,0,0,-103.25506,45.9814,Grand-Moreau,North Fork Grand,Headwaters North Fork Grand River,Bowman-Haley Dam-North Fork Grand River,Minnehaha,0,0,0,ndPD1577
1,258,360,Active,12910628,-103.94025,-103.94025,1093899,128645,01/01/52,Bowman,,,,,36.1,18.1,200.0,0.0,36.1,18.1,200.0,0.0,Surface Water,Sprinkler,Little Missouri River,,,,,,,,-,-,0,0,0,0,0,-103.94025,45.9669,Little Missouri,Upper Little Missouri,Slick Creek-Little Missouri River,Dogie Creek-Little Missouri River,Unorganized Territory,0,0,0,ndPD359
2,1554,2550,Active,12910633,-103.94037,-103.94037,1093637,123352,07/01/76,Bowman,,,,,184.5,145.0,1400.0,0.0,184.5,145.0,1400.0,0.0,Surface Water,Sprinkler,Little Missouri River,,,,,,,,,,0,0,0,0,0,-103.94037,45.95237,Little Missouri,Upper Little Missouri,Slick Creek-Little Missouri River,Dogie Creek-Little Missouri River,Unorganized Territory,0,0,0,ndPD2549
3,3349,6043,Active,13005836,-97.8939,-97.8939,2630377,143455,07/01/85,Sargent,,,Sand Sediments,,204.0,136.0,900.0,0.0,204.0,136.0,900.0,0.0,Ground Water,Sprinkler,,,,,,,,,,,0,0,0,0,0,-97.8939,46.02992,Upper Red,Western Wild Rice,Ditch No 11,Ditch No 11,Jackson,0,0,0,ndPD6041
4,4554,7840,Active,13007909,-100.57653,-100.57653,1949089,156403,12/06/19,Emmons,,,Winona,,1024.0,502.4,3584.0,0.0,525.0,350.0,2400.0,0.0,Ground Water,Sprinkler,,,,,,,,,,,0,0,0,0,0,-100.57653,46.09551,Lake Oahe,Upper Lake Oahe,Town of Fort Yates,Saint Lukes Cemetery-Lake Oahe,Unorganized Territory,0,0,0,ndPD7838


In [5]:
# Input File - Permit_Header
fileInput = "Permit_Header.zip"
dfin3 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin3:
    dfin3['WaDEUUID'] = "in3" + dfin3.index.astype(str)
    dfin3.to_csv('Permit_Header.zip', compression=dict(method='zip', archive_name='Permit_Header.csv'), index=False)

print(len(dfin3))
dfin3.head()

9050


Unnamed: 0,Permit_Index,Permit_Number,Permit_Holder_Name,Address1,Address2,City,State,Zip,Priority_Date,Use_Type,Status,Date_Issued,Date_Cancelled,Req_AcFt,Req_Acre,Req_Rate,App_AcFt,App_Acre,App_Rate,Beneficial_Use,Reservation,Project_Name,Hearing_Date,Hearing_Time,Const_Perm_No,Start_Date,Comp_Date,No_Notify,Use_Description,NonConsumptive_ReqAcFt,NonConsumptive_AppAcFt,Date_Perfected,Remarks,Req_Storage,App_Storage,Held_AcFt,Held_Acre,Held_Rate,Held_Storage,Comment_Deadline,NonConsumptive_HeldAcFt,Last_Inspected,Depot_ID,WaDEUUID
0,4,2D,"GUDMUNSEN, ROBERT AND LOWRAINE",1952 134TH AVE NW,,ARNEGARD,ND,58835-9162,01/26/1906,Irrigation,Perfected,04/30/37,00/00/00,291.0,291.0,1615.6,291.0,291.0,1615.6,00/00/00,,,00/00/00,00:00:00,,00/00/00,00/00/00,0.0,,0.0,0.0,02/17/93,,0.0,0.0,0.0,0.0,0.0,0.0,00/00/00,0.0,00/00/00,0.0,ndPH0
1,20,13D,"KIRSCH, DONALD",,,,,,06/26/34,Irrigation,Cancelled,00/00/00,00/00/00,105.0,35.0,897.6,0.0,0.0,0.0,00/00/00,,,00/00/00,00:00:00,,00/00/00,00/00/00,0.0,,0.0,0.0,00/00/00,,0.0,0.0,0.0,0.0,0.0,0.0,00/00/00,0.0,00/00/00,0.0,ndPH1
2,5,3C,"HARTEL, LEMOINE",12042 HWY 23,,WATFORD CITY,ND,58854,02/03/1906,Irrigation,Cancelled,01/10/90,06/07/17,240.0,240.0,1350.0,0.0,0.0,0.0,00/00/00,,,00/00/00,00:00:00,,00/00/00,00/00/00,0.0,,0.0,0.0,10/10/94,Letter requesting cancellation received 01 Sep...,0.0,0.0,0.0,0.0,0.0,0.0,00/00/00,0.0,00/00/00,0.0,ndPH2
3,12,9B,"VANCE, E. WALLACE and C. WAYNE",11230 54TH ST NW,,RAY,ND,58849-9241,06/20/1901,Irrigation,Perfected,00/00/00,00/00/00,56.0,56.0,450.0,56.0,56.0,0.0,00/00/00,,,00/00/00,00:00:00,,00/00/00,00/00/00,0.0,,0.0,0.0,10/25/66,,0.0,0.0,0.0,0.0,0.0,0.0,00/00/00,0.0,00/00/00,0.0,ndPH3
4,8,7E,"LASSEY, JERRY AND RODNEY",BOX 27,,CARTWRIGHT,ND,58838,09/21/37,Irrigation,Perfected,00/00/00,00/00/00,80.0,40.0,448.8,51.4,25.7,600.0,00/00/00,,,00/00/00,00:00:00,,00/00/00,00/00/00,0.0,,0.0,0.0,04/22/94,,0.0,0.0,0.0,0.0,0.0,0.0,00/00/00,0.0,00/00/00,0.0,ndPH4


In [6]:
# Left-Join data
dfin = pd.merge(dfin1, dfin2, on='POD_Index', how='left')
dfin = dfin.merge(dfin3, left_on='Permit_Index_x', right_on='Permit_Index', how='left')
dfin = dfin.replace(np.nan, "").reset_index(drop=True)

print(len(dfin))
dfin.head(2)

206166


Unnamed: 0,Permit_Index_x,POD_Index,Use_Year,Nature_Of_Data,Reported_AcFt,Reported_Acres,Reported_Rate,KWHrs,KWH_Demand,Pump_HP,Begin_Meter,End_Meter,Meter_Units,Comments,NonConsumptive_Use,Crop_type1,Crop_Type2,Reported_Inches,Water_Use_Index,Use_Type_x,WaDEUUID_x,Permit_Index_y,POD_Status,POD,Longitude,Latitude,X_Coord,Y_Coord,Beneficial_Use_x,County,Column1,_1,Aquifer,SubAquifer,Req_AcFt_x,Req_Acre_x,Req_Rate_x,Req_Storage_x,App_AcFt_x,App_Acre_x,App_Rate_x,App_Storage_x,Source,Irrigation_Type,Source_Name,MainStem_Name,Impound_Location,Impound_Name,Return_Dest,Discharge_Locat,Prop_Owner,Dest_Prop_Owner,Period_Start,Period_End,Return_Quantity,Held_AcFt_x,Held_Acre_x,Held_Rate_x,Held_Storage_x,Longitude.1,Latitude.1,HU_Basin,HU_Sub_Basin,HU_Watershed,HU_Sub_Watershed,Civil_Township,NonConsumptive_ReqAcFt_x,NonConsumptive_AppAcFt_x,NonConsumptive_HeldAcFt_x,WaDEUUID_y,Permit_Index,Permit_Number,Permit_Holder_Name,Address1,Address2,City,State,Zip,Priority_Date,Use_Type_y,Status,Date_Issued,Date_Cancelled,Req_AcFt_y,Req_Acre_y,Req_Rate_y,App_AcFt_y,App_Acre_y,App_Rate_y,Beneficial_Use_y,Reservation,Project_Name,Hearing_Date,Hearing_Time,Const_Perm_No,Start_Date,Comp_Date,No_Notify,Use_Description,NonConsumptive_ReqAcFt_y,NonConsumptive_AppAcFt_y,Date_Perfected,Remarks,Req_Storage_y,App_Storage_y,Held_AcFt_y,Held_Acre_y,Held_Rate_y,Held_Storage_y,Comment_Deadline,NonConsumptive_HeldAcFt_y,Last_Inspected,Depot_ID,WaDEUUID
0,344,484,1991,,51.9,0.0,900.0,0.0,0.0,70.0,0.0,0.0,Hours,"16,902,000 gallons of water reported used.",0.0,Corn,,,0.0,Irrigation,ndWU0,344.0,Active,13010629DA,-103.95333,-103.95333,1091942.0,159818.0,06/25/63,Bowman,,,,,39.0,44.2,225.0,0.0,39.0,44.2,225.0,0.0,Surface Water,Sprinkler,Little Missouri River,,,,,,,,-,-,0.0,0.0,0.0,0.0,0.0,-103.95333,46.05206,Little Missouri,Upper Little Missouri,Horse Creek-Little Missouri River,Big Gumbo Creek-Little Missouri River,Unorganized Territory,0.0,0.0,0.0,ndPD483,344,465,"NAGLE, THOMAS",1240 ELM STREET,,FARGO,ND,58102,10/02/52,Irrigation,Perfected,12/04/52,00/00/00,78.0,88.4,450.0,78.0,88.4,450.0,00/00/00,,,00/00/00,00:00:00,,00/00/00,00/00/00,0.0,,0.0,0.0,00/00/00,This permit and Permit #572B have a common POD...,0.0,0.0,0.0,0.0,0.0,0.0,00/00/00,0.0,00/00/00,0.0,ndPH2464
1,4413,7576,1991,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,,,1.0,Fish and Wildlife,ndWU1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4413,4696,U.S. FISH AND WILDLIFE SERVICE,PO BOX 25486,DENVER FED CTR,DENVER,CO,80225,04/26/93,Fish and Wildlife,Denied,00/00/00,11/23/93,44.0,0.0,0.0,0.0,0.0,0.0,00/00/00,,,00/00/00,00:00:00,,00/00/00,00/00/00,0.0,,0.0,0.0,00/00/00,KELLY SLOUGH - POOL #4. DENIED DUE TO INACCURA...,0.0,0.0,0.0,0.0,0.0,0.0,00/00/00,0.0,00/00/00,0.0,ndPH259


## DataFrame 1

In [7]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "NDssdw_M1"

# Variable Info
df['in_VariableCV'] = "Withdrawal"
df['in_VariableSpecificUUID'] = "" # auto fill in below

# Organization Info
df['in_OrganizationUUID'] = "NDssdw_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = dfin['Source']

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = dfin['County']
df['in_EPSGCodeCV'] = "4326"
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin['Latitude.1']
df['in_Longitude'] = dfin['Longitude.1']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = "WaDE Blank"
df['in_SiteNativeID'] = dfin['POD_Index'].astype('str')
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "WaDE Blank"
df['in_StateCV'] = "ND"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
df['in_Amount'] = dfin['Reported_AcFt']
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = dfin['Permit_Number'].astype(str)
df['in_BeneficialUseCategory'] = dfin['Use_Type_x']
df['in_CommunityWaterSupplySystem'] = dfin['Civil_Township']
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] = dfin['Use_Year']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin['Use_Year'] + "/01/01"
df['in_TimeframeStart'] = dfin['Use_Year']  + "/12/31"

outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

206166


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableCV,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,ndWU0,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,,Surface Water,,,Bowman,4326,,,,46.05206,-103.95333,,,POD,WaDE Blank,13010629DA,,WaDE Blank,ND,,51.9,,465,Irrigation,Unorganized Territory,,,,,,,,,,,1991,,1991/01/01,1991/12/31
1,ndWU1,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,,,,,,4326,,,,,,,,POD,WaDE Blank,,,WaDE Blank,ND,,0.0,,4696,Fish and Wildlife,,,,,,,,,,,,1991,,1991/01/01,1991/12/31
2,ndWU2,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,,,,,,4326,,,,,,,,POD,WaDE Blank,,,WaDE Blank,ND,,0.0,,4696,Fish and Wildlife,,,,,,,,,,,,1992,,1992/01/01,1992/12/31
3,ndWU3,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,,Surface Water,,,Bowman,4326,,,,46.05206,-103.95333,,,POD,WaDE Blank,13010629DA,,WaDE Blank,ND,,20.3,,465,Irrigation,Unorganized Territory,,,,,,,,,,,1992,,1992/01/01,1992/12/31
4,ndWU4,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,,,,,,4326,,,,,,,,POD,WaDE Blank,,,WaDE Blank,ND,,0.0,,1E,Irrigation,,,,,,,,,,,,1976,,1976/01/01,1976/12/31


## Concatenate DataFrames together

In [8]:
# Concatenate dataframes
frames = [outdf1] # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

206166


## Clean Data / data types

In [9]:
# updating in_WaterSourceTypeCV to be more machine readable / WaDE specific
# ----------------------------------------------------------------------------------------------------

def createWaterSourceTypeCV(inWST):
    inWST = str(inWST).strip()
    
    if inWST == "":
        outString = "WaDE Blank"
    elif inWST == "Ground Water":
        outString = "Groundwater"
    else:
        outString =  inWST
      
    return outString

outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: createWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'WaDE Blank', 'Groundwater'], dtype=object)

In [10]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [11]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [12]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Wade Blank'], dtype=object)

In [13]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Bowman', '', 'Mckenzie', 'Mchenry', 'Williams', 'Mountrail',
       'Stark', 'Hettinger', 'Ward', 'Burleigh', 'Dunn', 'Grant', 'Cass',
       'Billings', 'Ramsey', 'Adams', 'Benson', 'Griggs', 'Mercer',
       'Mclean', 'Ransom', 'Richland', 'Grand Forks', 'Morton',
       'Golden Valley', 'Pembina', 'Divide', 'Emmons', 'Pierce', 'Dickey',
       'Wells', 'Barnes', 'Lamoure', 'Stutsman', 'Slope', 'Bottineau',
       'Foster', 'Sioux', 'Sargent', 'Rolette', 'Oliver', 'Walsh',
       'Traill', 'Kidder', 'Steele', 'Towner', 'Burke', 'Logan', 'Eddy',
       'Renville', 'Cavalier', 'Nelson', 'Mcintosh', 'Sheridan'],
      dtype=object)

In [14]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [15]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [16]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'WaDE Blank', 'Groundwater'], dtype=object)

In [17]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['WaDE Blank'], dtype=object)

In [18]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Wade Blank'], dtype=object)

In [19]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Bowman', '', 'Mckenzie', 'Mchenry', 'Williams', 'Mountrail',
       'Stark', 'Hettinger', 'Ward', 'Burleigh', 'Dunn', 'Grant', 'Cass',
       'Billings', 'Ramsey', 'Adams', 'Benson', 'Griggs', 'Mercer',
       'Mclean', 'Ransom', 'Richland', 'Grand Forks', 'Morton',
       'Golden Valley', 'Pembina', 'Divide', 'Emmons', 'Pierce', 'Dickey',
       'Wells', 'Barnes', 'Lamoure', 'Stutsman', 'Slope', 'Bottineau',
       'Foster', 'Sioux', 'Sargent', 'Rolette', 'Oliver', 'Walsh',
       'Traill', 'Kidder', 'Steele', 'Towner', 'Burke', 'Logan', 'Eddy',
       'Renville', 'Cavalier', 'Nelson', 'Mcintosh', 'Sheridan'],
      dtype=object)

In [20]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['Irrigation', 'Fish and Wildlife', 'Industrial',
       'Power Generation', 'Stock', 'Municipal', 'Recreation',
       'Multiple Use', 'Rural Water', '', 'Flood Control', 'Domestic',
       'Commercial'], dtype=object)

In [21]:
# Ensure Latitude entry is either numireic or blank, no 0 entries
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("").replace(" ", "")
outdf['in_Latitude'].unique()

array([46.05206, '', 47.70176, ..., 48.23680874, 48.27101875, 48.26378804],
      dtype=object)

In [22]:
# Ensure Longitude entry is either numireic or blank, no 0 entries
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("").replace(" ", "")
outdf['in_Longitude'].unique()

array([-103.95333, '', -103.44126, ..., -102.1757952, -97.83329622,
       -97.83335933], dtype=object)

In [23]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').replace(0,"").fillna("")
outdf['in_Amount'].unique()

array([51.9, '', 20.3, ..., 30.31, 116.53, 105.22], dtype=object)

In [24]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

array(['1991-01-01T00:00:00.000000000', '1992-01-01T00:00:00.000000000',
       '1976-01-01T00:00:00.000000000', '1977-01-01T00:00:00.000000000',
       '1978-01-01T00:00:00.000000000', '1979-01-01T00:00:00.000000000',
       '1980-01-01T00:00:00.000000000', '1981-01-01T00:00:00.000000000',
       '1982-01-01T00:00:00.000000000', '1983-01-01T00:00:00.000000000',
       '1984-01-01T00:00:00.000000000', '1985-01-01T00:00:00.000000000',
       '1986-01-01T00:00:00.000000000', '1987-01-01T00:00:00.000000000',
       '1988-01-01T00:00:00.000000000', '1989-01-01T00:00:00.000000000',
       '1990-01-01T00:00:00.000000000', '1993-01-01T00:00:00.000000000',
       '1994-01-01T00:00:00.000000000', '1995-01-01T00:00:00.000000000',
       '1996-01-01T00:00:00.000000000', '1997-01-01T00:00:00.000000000',
       '1967-01-01T00:00:00.000000000', '1968-01-01T00:00:00.000000000',
       '1969-01-01T00:00:00.000000000', '1970-01-01T00:00:00.000000000',
       '1971-01-01T00:00:00.000000000', '1974-01-01

In [25]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

array(['1991-12-31T00:00:00.000000000', '1992-12-31T00:00:00.000000000',
       '1976-12-31T00:00:00.000000000', '1977-12-31T00:00:00.000000000',
       '1978-12-31T00:00:00.000000000', '1979-12-31T00:00:00.000000000',
       '1980-12-31T00:00:00.000000000', '1981-12-31T00:00:00.000000000',
       '1982-12-31T00:00:00.000000000', '1983-12-31T00:00:00.000000000',
       '1984-12-31T00:00:00.000000000', '1985-12-31T00:00:00.000000000',
       '1986-12-31T00:00:00.000000000', '1987-12-31T00:00:00.000000000',
       '1988-12-31T00:00:00.000000000', '1989-12-31T00:00:00.000000000',
       '1990-12-31T00:00:00.000000000', '1993-12-31T00:00:00.000000000',
       '1994-12-31T00:00:00.000000000', '1995-12-31T00:00:00.000000000',
       '1996-12-31T00:00:00.000000000', '1997-12-31T00:00:00.000000000',
       '1967-12-31T00:00:00.000000000', '1968-12-31T00:00:00.000000000',
       '1969-12-31T00:00:00.000000000', '1970-12-31T00:00:00.000000000',
       '1971-12-31T00:00:00.000000000', '1974-12-31

In [26]:
# extract year out
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True)
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
# outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].dt.year
outdf['in_ReportYearCV'].unique()

array(['1991', '1992', '1976', '1977', '1978', '1979', '1980', '1981',
       '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989',
       '1990', '1993', '1994', '1995', '1996', '1997', '1967', '1968',
       '1969', '1970', '1971', '1974', '1975', '1966', '1972', '1973',
       '1998', '2001', '2000', '1999', '2017', '1965', '2016', '2014',
       '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006',
       '2005', '2004', '2003', '2002', '2018', '0', '2015', '2019',
       '2020', '2021'], dtype=object)

## Create missing WaDE Custom Elements

In [27]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------

def createVariableSpecificCV(inV, inBU, inWST):
    inV = str(inV).strip()
    inBU = str(inBU).strip().title()
    inWST = str(inWST).strip()
    
    outString = inV + "_Annual_" +  inBU + "_" + inWST
    
    return outString

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                     row['in_BeneficialUseCategory'],
                                                                                     row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

array(['Withdrawal_Annual_Irrigation_Surface Water',
       'Withdrawal_Annual_Fish And Wildlife_WaDE Blank',
       'Withdrawal_Annual_Irrigation_WaDE Blank',
       'Withdrawal_Annual_Industrial_WaDE Blank',
       'Withdrawal_Annual_Industrial_Surface Water',
       'Withdrawal_Annual_Power Generation_WaDE Blank',
       'Withdrawal_Annual_Fish And Wildlife_Surface Water',
       'Withdrawal_Annual_Stock_Surface Water',
       'Withdrawal_Annual_Irrigation_Groundwater',
       'Withdrawal_Annual_Municipal_Surface Water',
       'Withdrawal_Annual_Recreation_Surface Water',
       'Withdrawal_Annual_Municipal_WaDE Blank',
       'Withdrawal_Annual_Multiple Use_Surface Water',
       'Withdrawal_Annual_Recreation_WaDE Blank',
       'Withdrawal_Annual_Industrial_Groundwater',
       'Withdrawal_Annual_Municipal_Groundwater',
       'Withdrawal_Annual_Power Generation_Surface Water',
       'Withdrawal_Annual_Rural Water_Groundwater',
       'Withdrawal_Annual_Multiple Use_WaDE Blank',

In [28]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        colrowValue = str(A).strip() + str(B).strip()
        try:
            outList = WaterSourceNativeIDdict[colrowValue]
        except:
            outList = ''
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeID1', 'wadeID2', 'wadeID3'], dtype=object)

## Export Outputs

In [29]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206166 entries, 0 to 206165
Data columns (total 49 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   WaDEUUID                          206166 non-null  object        
 1   in_MethodUUID                     206166 non-null  object        
 2   in_VariableCV                     206166 non-null  object        
 3   in_VariableSpecificUUID           206166 non-null  object        
 4   in_OrganizationUUID               206166 non-null  object        
 5   in_Geometry                       206166 non-null  object        
 6   in_GNISFeatureNameCV              206166 non-null  object        
 7   in_WaterQualityIndicatorCV        206166 non-null  object        
 8   in_WaterSourceName                206166 non-null  object        
 9   in_WaterSourceNativeID            206166 non-null  object        
 10  in_WaterSourceTypeCV            

In [30]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableCV,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,ndWU0,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID1,Surface Water,,,Bowman,4326,,,,46.05206,-103.95333,,,POD,Wade Blank,13010629DA,,WaDE Blank,ND,,51.90000,,465,Irrigation,Unorganized Territory,,,,,,,,,,,1991,,1991-01-01,1991-12-31,Withdrawal_Annual_Irrigation_Surface Water
1,ndWU1,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID2,WaDE Blank,,,,4326,,,,,,,,POD,Wade Blank,,,WaDE Blank,ND,,,,4696,Fish and Wildlife,,,,,,,,,,,,1991,,1991-01-01,1991-12-31,Withdrawal_Annual_Fish And Wildlife_WaDE Blank
2,ndWU2,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID2,WaDE Blank,,,,4326,,,,,,,,POD,Wade Blank,,,WaDE Blank,ND,,,,4696,Fish and Wildlife,,,,,,,,,,,,1992,,1992-01-01,1992-12-31,Withdrawal_Annual_Fish And Wildlife_WaDE Blank
3,ndWU3,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID1,Surface Water,,,Bowman,4326,,,,46.05206,-103.95333,,,POD,Wade Blank,13010629DA,,WaDE Blank,ND,,20.30000,,465,Irrigation,Unorganized Territory,,,,,,,,,,,1992,,1992-01-01,1992-12-31,Withdrawal_Annual_Irrigation_Surface Water
4,ndWU4,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID2,WaDE Blank,,,,4326,,,,,,,,POD,Wade Blank,,,WaDE Blank,ND,,,,1E,Irrigation,,,,,,,,,,,,1976,,1976-01-01,1976-12-31,Withdrawal_Annual_Irrigation_WaDE Blank
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206161,ndWU206161,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID1,Surface Water,,,Emmons,4326,,,,45.95999,-100.48496,,,POD,Wade Blank,12907830DD,,WaDE Blank,ND,,,,2792,Irrigation,Unorganized Territory,,,,,,,,,,,2021,,2021-01-01,2021-12-31,Withdrawal_Annual_Irrigation_Surface Water
206162,ndWU206162,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID1,Surface Water,,,Emmons,4326,,,,45.95631,-100.47977,,,POD,Wade Blank,12907832BB,,WaDE Blank,ND,,,,2792,Irrigation,Unorganized Territory,,,,,,,,,,,2021,,2021-01-01,2021-12-31,Withdrawal_Annual_Irrigation_Surface Water
206163,ndWU206163,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID1,Surface Water,,,Mckenzie,4326,,,,47.86634,-103.97437,,,POD,Wade Blank,15110427D,,WaDE Blank,ND,,,,3301A,Irrigation,Yellowstone,,,,,,,,,,,2021,,2021-01-01,2021-12-31,Withdrawal_Annual_Irrigation_Surface Water
206164,ndWU206164,NDssdw_M1,Withdrawal,,NDssdw_O1,,,,,wadeID1,Surface Water,,,Grant,4326,,,,46.20287,-101.45086,,,POD,Wade Blank,13108604BA,,WaDE Blank,ND,,62.90000,,1240,Irrigation,Unorganized Territory,,,,,,,,,,,2021,,2021-01-01,2021-12-31,Withdrawal_Annual_Irrigation_Surface Water


In [31]:
# Export the output dataframe
outdf.to_csv('Pssdw_Main.zip', compression=dict(method='zip', archive_name='Pssro_neMain.csv'), index=False)  # The output, save as a zip