# Pre-processing Montana Water Right data for WaDEQA upload.

Date Updated: 11/30/2020

Purpose:  To pre-process the Montana data into one master file for simple DataFrame creation and extraction.

In [None]:
# Needed Libraries
import os
import numpy as np
import pandas as pd
import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # Let pandas display up to 999 columns when a DataFrame is shown in the notebook

# Setting the working directory so the relative input and output file names used below resolve against it.
# NOTE(review): this is an absolute, machine-specific path -- it has to be edited before running elsewhere.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Montana/WaterAllocation/RawInputData"
os.chdir(workingDir)

In [None]:
# Column names of the combined output DataFrame, in output order.
columnsList = [
    # Water source
    "in_GNISFeatureNameCV",
    "in_WaterQualityIndicatorCV",
    "in_WaterSourceName",
    "in_WaterSourceNativeID",
    "in_WaterSourceTypeCV",

    # Site
    "in_CoordinateAccuracy",
    "in_CoordinateMethodCV",
    "in_County",
    "in_EPSGCodeCV",
    "in_Geometry",
    "in_GNISCodeCV",
    "in_HUC12",
    "in_HUC8",
    "in_Latitude",
    "in_Longitude",
    "in_NHDNetworkStatusCV",
    "in_NHDProductCV",
    "in_PODorPOUSite",
    "in_SiteName",
    "in_SiteNativeID",
    "in_SitePoint",
    "in_SiteTypeCV",
    "in_StateCV",
    "in_USGSSiteID",

    # Allocation
    "in_AllocationApplicationDate",
    "in_AllocationAssociatedConsumptiveUseSiteIDs",
    "in_AllocationAssociatedWithdrawalSiteIDs",
    "in_AllocationBasisCV",
    "in_AllocationChangeApplicationIndicator",
    "in_AllocationCommunityWaterSupplySystem",
    "in_AllocationCropDutyAmount",
    "in_AllocationExpirationDate",
    "in_AllocationFlow_CFS",
    "in_AllocationLegalStatusCV",
    "in_AllocationNativeID",
    "in_AllocationOwner",
    "in_AllocationPriorityDate",
    "in_AllocationSDWISIdentifierCV",
    "in_AllocationTimeframeEnd",
    "in_AllocationTimeframeStart",
    "in_AllocationTypeCV",
    "in_AllocationVolume_AF",
    "in_BeneficialUseCategory",
    "in_CommunityWaterSupplySystem",
    "in_CropTypeCV",
    "in_CustomerTypeCV",
    "in_DataPublicationDate",
    "in_DataPublicationDOI",
    "in_ExemptOfVolumeFlowPriority",
    "in_GeneratedPowerCapacityMW",
    "in_IrrigatedAcreage",
    "in_IrrigationMethodCV",
    "in_LegacyAllocationIDs",
    "in_OrganizationUUID",
    "in_PopulationServed",
    "in_PowerType",
    "in_PrimaryUseCategory",
    "in_VariableSpecificUUID",
    "in_WaterAllocationNativeURL",
]

## POD Water Budget Data

In [None]:
# CSV input file with the POD records (read relative to the working directory).
fileInput = "WaDE_PODs_input.csv"
df = pd.read_csv(fileInput)
# Row count and first rows, as a quick check that the file was read.
print(len(df))
df.head(3)

In [None]:
# WaterSourceTypeCV
# Lookup from the raw source type text to the water source type label used in the output.
waterSourceTypeDict = {
    "SURFACE": "Surface Water",
    "GROUNDWATER": "Groundwater",
    "ALL NATURALLY OCCURING WATER": "Surface Water",
}


def retrieveWaterSourceTypeCV(colrowValue):
    """Return the water source type label for one raw source type value.

    Parameters:
        colrowValue: raw cell value; surrounding whitespace is ignored.

    Returns:
        The label from ``waterSourceTypeDict``, or "Unspecified" when the value
        is missing, blank, or not one of the known keys.
    """
    if pd.isnull(colrowValue):
        return "Unspecified"
    # str() keeps a non-text cell (e.g. a number) from raising on .strip();
    # it simply falls through to "Unspecified" like any other unknown value.
    return waterSourceTypeDict.get(str(colrowValue).strip(), "Unspecified")

# Derive the water source type of every record from its raw SOURCE_TYPE value.
# The function only needs that one column, so it is applied to the column itself
# rather than row by row over the whole frame; this also yields a (possibly empty)
# Series in every case, so the assignment works for an empty input as well.
df['WaterSourceTypeCV'] = df['SOURCE_TYPE'].apply(retrieveWaterSourceTypeCV)
df.head(3)

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure

# Parse the raw values, render them as month/day/year text, then parse that text
# again so the column ends up holding datetimes built from the date part only.
df['ENF_PRIORITY_DATE'] = pd.to_datetime(df['ENF_PRIORITY_DATE'])
priorityDateText = df["ENF_PRIORITY_DATE"].dt.strftime('%m/%d/%Y')
df['ENF_PRIORITY_DATE'] = pd.to_datetime(priorityDateText)
df.head(3)

In [None]:
# Creating easy MethodTypeCV retrieval for AllocationsAmounts_fact sheet.

# Cut-off date: earlier priority dates are labelled "Adjudication", all others "Appropriations".
x = datetime.datetime(1973, 7, 1)
x


def createMethodTypeCV(colrowValue):
    """Return the method type label for one priority date.

    Returns '' for a blank or missing value, "Adjudication" when the date is
    before the cut-off ``x``, and "Appropriations" otherwise.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''
    return "Adjudication" if colrowValue < x else "Appropriations"

# Label every record from its priority date.
# Only the one column is needed, so the function is applied to that column directly
# instead of row by row over the whole frame (also safe for an empty input).
df['MethodTypeCV'] = df['ENF_PRIORITY_DATE'].apply(createMethodTypeCV)
df.head(3)

In [None]:
# Creating TimeframeStart.
# Splitting string, returning WaDE friendly format.

# Three-letter month abbreviation -> two-digit month number.
MonthNumbDict = {
    "Jan": "01",
    "Feb": "02",
    "Mar": "03",
    "Apr": "04",
    "May": "05",
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12",
}


def createTimeframeStart(ColRowVal):
    """Turn a '<day>-<month abbreviation>' value into '<MM>/<day>'.

    Parameters:
        ColRowVal: raw cell value; it is converted to text and split on '-'.
            Presumably formatted like '15-Apr' -- confirm against the input file.

    Returns:
        The month number, a slash and the day text (e.g. '04/15'), or '' when
        the value has no '-' separated month part or the month is not a key of
        ``MonthNumbDict``.
    """
    parts = str(ColRowVal).split('-')
    if len(parts) < 2:
        # No month part at all (e.g. a missing value rendered as 'nan').
        return ""
    month = MonthNumbDict.get(parts[1])
    if month is None:
        return ""
    return month + "/" + parts[0]

# Build the start-of-period text for every record from PER_DIV_BGN_DT.
# Applied to the single column rather than row by row over the whole frame
# (same values, no per-row Series construction, and safe for an empty input).
df['TimeframeStart'] = df['PER_DIV_BGN_DT'].apply(createTimeframeStart)
df.head(3)

In [None]:
# Creating TimeframeEnd.
# Splitting string, returning WaDE friendly format.

# Three-letter month abbreviation -> two-digit month number.
MonthNumbDict = {
    "Jan": "01",
    "Feb": "02",
    "Mar": "03",
    "Apr": "04",
    "May": "05",
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12",
}


def createTimeframeEnd(ColRowVal):
    """Turn a '<day>-<month abbreviation>' value into '<MM>/<day>'.

    Parameters:
        ColRowVal: raw cell value; it is converted to text and split on '-'.
            Presumably formatted like '15-Oct' -- confirm against the input file.

    Returns:
        The month number, a slash and the day text (e.g. '10/15'), or '' when
        the value has no '-' separated month part or the month is not a key of
        ``MonthNumbDict``.
    """
    parts = str(ColRowVal).split('-')
    if len(parts) < 2:
        # No month part at all (e.g. a missing value rendered as 'nan').
        return ""
    month = MonthNumbDict.get(parts[1])
    if month is None:
        return ""
    return month + "/" + parts[0]

# Build the end-of-period text for every record from PER_DIV_END_DT.
# Applied to the single column rather than row by row over the whole frame
# (same values, no per-row Series construction, and safe for an empty input).
df['TimeframeEnd'] = df['PER_DIV_END_DT'].apply(createTimeframeEnd)
df.head(3)

In [None]:
# Creating the output Dataframe for PODs.

dfPOD = pd.DataFrame(columns=columnsList)

# (output column, value) pairs, assigned in exactly this order.
# The first value is a column of the input table, which gives the empty frame its rows;
# the constant text values further down are then repeated on every one of those rows.
podAssignments = [
    # Method
    ("in_MethodTypeCV", df['MethodTypeCV']),

    # Water Source
    ("in_WaterSourceName", df['SOURCE_NAME']),
    ("in_WaterSourceTypeCV", df['WaterSourceTypeCV']),

    # Site
    ("in_CoordinateAccuracy", "Unspecified"),
    ("in_CoordinateMethodCV", "Unspecified"),
    ("in_County", df['LLDS_COUNTY_NAME']),
    ("in_HUC12", df['HUC_12']),
    ("in_Latitude", df['Y']),
    ("in_Longitude", df['X']),
    ("in_PODorPOUSite", "POD"),
    ("in_SiteName", df['DITCH_NAME']),
    ("in_SiteNativeID", df['PODV_ID_SEQ']),
    ("in_SiteTypeCV", df['MEANS_OF_DIV']),

    # Allocation
    ("in_AllocationFlow_CFS", df['FLW_RT_CFS']),
    ("in_AllocationLegalStatusCV", df['WR_STATUS']),
    ("in_AllocationNativeID", df['WR_NUMBER']),
    ("in_AllocationOwner", df['ALL_OWNERS']),
    ("in_AllocationPriorityDate", df['ENF_PRIORITY_DATE']),
    ("in_AllocationTimeframeEnd", df['TimeframeEnd']),
    ("in_AllocationTimeframeStart", df['TimeframeStart']),
    ("in_AllocationTypeCV", df['WR_TYPE']),
    ("in_AllocationVolume_AF", df['VOLUME']),
    ("in_BeneficialUseCategory", df['PURPOSES']),
    ("in_DataPublicationDOI", df['ABST_LINK']),
    ("in_IrrigatedAcreage", df['MAX_ACRES']),
]
for outColumn, columnValue in podAssignments:
    dfPOD[outColumn] = columnValue

print(len(dfPOD))

In [None]:
# POD Shapefile Data
# Read the POD shapefile (path relative to the working directory) and wrap the result
# in a plain pandas DataFrame for the column handling that follows.
ShapeFileInput = gpd.read_file('PODShp/MT_POD.shp')
dfPODshapetemp = pd.DataFrame(ShapeFileInput)
dfPODshapetemp.head(3)

In [None]:
# Keep only the identifying fields and the geometry of each POD shape, without repeated rows.
# A dedicated name is used for this short column list: `columnsList` holds the output column
# names and is read again further down when the POU output frame is created, so it must not
# be overwritten here.
podShapeColumns = ['WR_NUMBER', 'POD_NO', 'DITCH_NAME', 'geometry']
dfPODshape = pd.DataFrame(columns=podShapeColumns)
for shapeColumn in podShapeColumns:
    dfPODshape[shapeColumn] = dfPODshapetemp[shapeColumn]
# Default arguments: compare all columns, keep the first of each set of identical rows.
dfPODshape = dfPODshape.drop_duplicates()
dfPODshape.head(3)

## PoU Water Budget Data

In [None]:
# CSV input file with the PoU records (read relative to the working directory).
# From here on `df` refers to the PoU table, replacing the POD table read earlier.
fileInput = "WaDE_PoUs_input.csv"
df = pd.read_csv(fileInput)
# Row count and first rows, as a quick check that the file was read.
print(len(df))
df.head(3)

In [None]:
# Fixing datatypes of inputs.
# The priority date column is parsed to datetimes so it can be compared with a date later on.
df['ENF_PRIORI'] = pd.to_datetime(df['ENF_PRIORI'])

In [None]:
# WaterSourceTypeCV

# Lookup from the raw source type text to the water source type label used in the output.
waterSourceTypeDict = {
    "GROUNDWATER": "Groundwater",
    "SURFACE": "Surface Water",
}


def retrieveWaterSourceTypeCV(colrowValue):
    """Return the water source type label for one raw source type value.

    Parameters:
        colrowValue: raw cell value; surrounding whitespace is ignored.

    Returns:
        The label from ``waterSourceTypeDict``, or "Unspecified" when the value
        is missing, blank, or not one of the known keys.
    """
    if pd.isnull(colrowValue):
        return "Unspecified"
    # str() keeps a non-text cell (e.g. a number) from raising on .strip();
    # it simply falls through to "Unspecified" like any other unknown value.
    return waterSourceTypeDict.get(str(colrowValue).strip(), "Unspecified")

# Derive the water source type of every record from its raw SRCTYPE value.
# The function only needs that one column, so it is applied to the column itself
# rather than row by row over the whole frame (also safe for an empty input).
df['WaterSourceTypeCV'] = df['SRCTYPE'].apply(retrieveWaterSourceTypeCV)
df.head(3)

In [None]:
# Creating easy MethodTypeCV retrieval for AllocationsAmounts_fact sheet.

# Cut-off date used by createMethodTypeCV below.
x = datetime.datetime(1973, 7, 1)
x


def createMethodTypeCV(colrowValue):
    """Classify one priority date against the cut-off ``x``.

    A date before ``x`` gives "Adjudication", any other date gives
    "Appropriations", and a blank or missing value gives ''.
    """
    if not (colrowValue == '' or pd.isnull(colrowValue)):
        if colrowValue < x:
            return "Adjudication"
        return "Appropriations"
    return ''

# Label every record from its priority date.
# Only the one column is needed, so the function is applied to that column directly
# instead of row by row over the whole frame (also safe for an empty input).
df['MethodTypeCV'] = df['ENF_PRIORI'].apply(createMethodTypeCV)
df.head(3)

In [None]:
# Creating TimeframeStart.
# Splitting string, returning WaDE friendly format.

# Three-letter month abbreviation -> two-digit month number.
MonthNumbDict = {
    "Jan": "01",
    "Feb": "02",
    "Mar": "03",
    "Apr": "04",
    "May": "05",
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12",
}


def createTimeframeStart(ColRowVal):
    """Turn a '<day>-<month abbreviation>' value into '<MM>/<day>'.

    Parameters:
        ColRowVal: raw cell value; it is converted to text and split on '-'.
            Presumably formatted like '15-Apr' -- confirm against the input file.

    Returns:
        The month number, a slash and the day text (e.g. '04/15'), or '' when
        the value has no '-' separated month part or the month is not a key of
        ``MonthNumbDict``.
    """
    parts = str(ColRowVal).split('-')
    if len(parts) < 2:
        # No month part at all (e.g. a missing value rendered as 'nan').
        return ""
    month = MonthNumbDict.get(parts[1])
    if month is None:
        return ""
    return month + "/" + parts[0]

# Build the start-of-period text for every record from PER_USE_BG.
# Applied to the single column rather than row by row over the whole frame
# (same values, no per-row Series construction, and safe for an empty input).
df['TimeframeStart'] = df['PER_USE_BG'].apply(createTimeframeStart)
df.head(3)

In [None]:
# Creating TimeframeEnd.
# Splitting string, returning WaDE friendly format.

# Three-letter month abbreviation -> two-digit month number.
MonthNumbDict = {
    "Jan": "01",
    "Feb": "02",
    "Mar": "03",
    "Apr": "04",
    "May": "05",
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12",
}


def createTimeframeEnd(ColRowVal):
    """Turn a '<day>-<month abbreviation>' value into '<MM>/<day>'.

    Parameters:
        ColRowVal: raw cell value; it is converted to text and split on '-'.
            Presumably formatted like '15-Oct' -- confirm against the input file.

    Returns:
        The month number, a slash and the day text (e.g. '10/15'), or '' when
        the value has no '-' separated month part or the month is not a key of
        ``MonthNumbDict``.
    """
    parts = str(ColRowVal).split('-')
    if len(parts) < 2:
        # No month part at all (e.g. a missing value rendered as 'nan').
        return ""
    month = MonthNumbDict.get(parts[1])
    if month is None:
        return ""
    return month + "/" + parts[0]

# Build the end-of-period text for every record from PER_USE_EN.
# Applied to the single column rather than row by row over the whole frame
# (same values, no per-row Series construction, and safe for an empty input).
df['TimeframeEnd'] = df['PER_USE_EN'].apply(createTimeframeEnd)
df.head(3)

In [None]:
# Creating the output Dataframe for POUs.

# NOTE(review): this relies on `columnsList` holding the output column names at this point.
dfPOU = pd.DataFrame(columns=columnsList)

# (output column, value) pairs, assigned in exactly this order.
# The first value is a column of the input table, which gives the empty frame its rows;
# the constant text values further down are then repeated on every one of those rows.
pouAssignments = [
    # Method
    ("in_MethodTypeCV", df['MethodTypeCV']),

    # Water Source
    ("in_WaterSourceName", df['SOURC_NAME']),
    ("in_WaterSourceTypeCV", df['WaterSourceTypeCV']),

    # Site
    ("in_CoordinateAccuracy", "Unspecified"),
    ("in_CoordinateMethodCV", "Centroid"),
    ("in_County", df['COUNTY']),
    # NOTE(review): 'Latitdue' / 'Longitdue' are spelled this way in the code; presumably
    # they match the header of the input file -- confirm.
    ("in_Latitude", df['Latitdue']),
    ("in_Longitude", df['Longitdue']),
    ("in_PODorPOUSite", "POU"),
    ("in_SiteName", "Unspecified"),
    ("in_SiteTypeCV", "Unspecified"),

    # Allocation
    ("in_AllocationFlow_CFS", df['FLW_RT_CFS']),
    ("in_AllocationLegalStatusCV", df['STATUS']),
    ("in_AllocationNativeID", df['WRNUMBER']),
    ("in_AllocationOwner", df['ALL_OWNERS']),
    ("in_AllocationPriorityDate", df['ENF_PRIORI']),
    ("in_AllocationTimeframeEnd", df['TimeframeEnd']),
    ("in_AllocationTimeframeStart", df['TimeframeStart']),
    ("in_AllocationTypeCV", df['WRTYPE']),
    ("in_AllocationVolume_AF", df['VOLUME']),
    ("in_BeneficialUseCategory", df['PURPOSE']),
    ("in_DataPublicationDOI", df['NRIS_LINK']),
    ("in_IrrigatedAcreage", df['MAX_ACRES']),
]
for outColumn, columnValue in pouAssignments:
    dfPOU[outColumn] = columnValue

print(len(dfPOU))

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the custom site native ID: 'WaDEMT_S' followed by the value as text."""
    return f"WaDEMT_S{colrowValue!s}"

# One row per distinct combination of the four fields that identify a site.
dfSiteNativeID = pd.DataFrame()
for siteKeyColumn in ['in_Latitude', 'in_Longitude', 'in_SiteTypeCV', 'in_SiteName']:
    dfSiteNativeID[siteKeyColumn] = dfPOU[siteKeyColumn]
dfSiteNativeID = dfSiteNativeID.drop_duplicates()

# Number the distinct sites 1..N (same row labels as dfSiteNativeID) and turn each number into an ID.
dftemp = pd.DataFrame(index=dfSiteNativeID.index)
siteCount = len(dftemp.index)
dftemp["Count"] = range(1, siteCount + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda countRow: assignSiteUUID(countRow['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveSiteNativeID(A, B, C, D):
    """Look up the custom site native ID for one site in ``dfSiteNativeID``.

    Parameters:
        A: latitude, B: longitude, C: site type, D: site name -- compared with
        the 'in_Latitude', 'in_Longitude', 'in_SiteTypeCV' and 'in_SiteName'
        columns respectively.

    Returns:
        The 'in_SiteNativeID' of the first row matching all four values, or ''
        when A and B are both blank, both missing, or nothing matches.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    sameSite = (
        (dfSiteNativeID['in_Latitude'] == A)
        & (dfSiteNativeID['in_Longitude'] == B)
        & (dfSiteNativeID['in_SiteTypeCV'] == C)
        & (dfSiteNativeID['in_SiteName'] == D)
    )
    matches = dfSiteNativeID.loc[sameSite, 'in_SiteNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom site native ID to every POU row, looked up from the row's
# latitude, longitude, site type and site name.
dfPOU['in_SiteNativeID'] = dfPOU.apply(
    lambda siteRow: retrieveSiteNativeID(
        siteRow['in_Latitude'],
        siteRow['in_Longitude'],
        siteRow['in_SiteTypeCV'],
        siteRow['in_SiteName'],
    ),
    axis=1,
)
dfPOU

In [None]:
# PoU Shapefile Data
# Shapefile input: read the PoU shapefile (path relative to the working directory) and wrap
# the result in a plain pandas DataFrame for the column handling that follows.
ShapeFileInput = gpd.read_file('PoUShp/MT_PoU.shp')
dfPoUshapetemp = pd.DataFrame(ShapeFileInput)
dfPoUshapetemp.head(3)

In [None]:
# Keep only the identifying fields and the geometry of each PoU shape, without repeated rows.
# The frame uses the same four column names as the POD shape frame (the PoU number is stored
# under 'POD_NO') so the two line up when they are concatenated; declaring 'POU_NO' as well
# would only add a column that is never filled.
# A dedicated name is used for the column list so `columnsList` is not overwritten.
pouShapeColumns = ['WR_NUMBER', 'POD_NO', 'DITCH_NAME', 'geometry']
dfPoUshape = pd.DataFrame(columns=pouShapeColumns)
dfPoUshape['WR_NUMBER'] = dfPoUshapetemp['WRNUMBER']
dfPoUshape['POD_NO'] = dfPoUshapetemp['POU_NO']
# Constant text, repeated on every row created by the assignments above.
dfPoUshape['DITCH_NAME'] = "Unspecified"
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# Default arguments: compare all columns, keep the first of each set of identical rows.
dfPoUshape = dfPoUshape.drop_duplicates()
dfPoUshape.head(3)

### Concatenate and Export

In [None]:
# Merge dataframes
# Stack the POD and POU output frames into one table and drop fully identical rows.
# ignore_index=True gives the combined table fresh row labels; without it the labels of
# both inputs are kept, so the same label can appear twice and later label-aligned
# column assignments would depend on duplicated labels.
frames = [dfPOD, dfPOU]
outdf = pd.concat(frames, ignore_index=True)
outdf = outdf.drop_duplicates()
print(len(outdf))

In [None]:
# Concatenate dataframes
# Stack the POD and PoU shape frames into one geometry table and report its row count.
frames = [dfPODshape, dfPoUshape]
dfshape = pd.concat(frames)
print(dfshape.shape[0])

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID: 'WaDEMT_WS' followed by the value as text."""
    return f"WaDEMT_WS{colrowValue!s}"

# One row per distinct (water source name, water source type) pair.
dfWaterSourceNativeID = pd.DataFrame()
for sourceKeyColumn in ['in_WaterSourceName', 'in_WaterSourceTypeCV']:
    dfWaterSourceNativeID[sourceKeyColumn] = outdf[sourceKeyColumn]
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the distinct water sources 1..N (same row labels as dfWaterSourceNativeID)
# and turn each number into an ID.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
sourceCount = len(dftemp.index)
dftemp["Count"] = range(1, sourceCount + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda countRow: assignWaterSourceNativeID(countRow['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source native ID in ``dfWaterSourceNativeID``.

    Parameters:
        A: water source name, B: water source type -- compared with the
        'in_WaterSourceName' and 'in_WaterSourceTypeCV' columns respectively.

    Returns:
        The 'in_WaterSourceNativeID' of the first row matching both values, or
        '' when A and B are both blank, both missing, or nothing matches.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    sameSource = (
        (dfWaterSourceNativeID['in_WaterSourceName'] == A)
        & (dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B)
    )
    matches = dfWaterSourceNativeID.loc[sameSource, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom water source native ID to every output row, looked up from the
# row's water source name and type.
outdf['in_WaterSourceNativeID'] = outdf.apply(
    lambda sourceRow: retrieveWaterSourceNativeID(
        sourceRow['in_WaterSourceName'],
        sourceRow['in_WaterSourceTypeCV'],
    ),
    axis=1,
)
outdf

In [None]:
# Export out to CSV.
# Both files are written to the current working directory, without the row-label column.
outdf.to_csv('P_MontanaMaster.csv', index=False) # The combined POD + POU output table.
dfshape.to_csv('P_MontanaGeometry.csv', index=False) # The combined POD + PoU geometry table.

In [None]:
# Show the first 10 rows of the combined output as a visual check.
outdf.head(10)

In [None]:
# Show the last 5 rows of the combined output as a visual check.
outdf.tail(5)