# Preprocessing North Dakota Specific data for WaDEQA upload.
- Date Updated: 05/25/2022

Notes:
- Water use data and POD data.
- Available data...
    - Permit_Header.csv
    - POD.csv
    - Water_Use.csv
- Match time series (ts) water use data -> POD data via POD_Index -> permit data via Permit_Index.

In [None]:
# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd # the library that lets us read in shapefiles

# visualization
import matplotlib.pyplot as plot
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory and Input File
# Every input file read below is given as a bare file name, so the process is
# moved into the raw-input folder first and those names resolve against it.
workingDir = "G:/Shared drives/WaDE Data/NorthDakota/SiteSpecificAmounts/RawInputData"
os.chdir(workingDir)

## Inputs and Dataframe Creation

In [None]:
# Timeseries water use data
fileInput = "Water_Use.xlsx"
df_wu = pd.read_excel(fileInput)

# Store the permit index and the use year as text. Going through the nullable
# 'Int64' dtype first drops any decimal part picked up on read (e.g. 2020.0 -> "2020").
for keyCol in ('Permit_Index', 'Use_Year'):
    df_wu[keyCol] = df_wu[keyCol].astype('Int64').astype('str')

print(df_wu.shape[0])
df_wu.head(n=1)

In [None]:
# POD data
# Load the POD workbook as-is; no column types are adjusted here.
fileInput = "POD.xlsx"
df_pod = pd.read_excel(fileInput)

print(df_pod.shape[0])  # row count
df_pod.head(n=1)

In [None]:
# Permit_Header data
# Load the permit header workbook as-is; no column types are adjusted here.
fileInput = "Permit_Header.xlsx"
df_ph = pd.read_excel(fileInput)

print(df_ph.shape[0])  # row count
df_ph.head(n=1)

In [None]:
# Left-Join data
# Start from the water use records, attach POD columns by POD_Index, then attach
# permit header columns by permit index. Column names shared by the water use
# and POD tables receive pandas' default _x / _y suffixes, hence 'Permit_Index_x'.
# NOTE(review): df_wu['Permit_Index'] was cast to str on load; confirm that
# df_ph['Permit_Index'] has a matching dtype so the second join finds its keys.
df = (
    df_wu
    .merge(df_pod, on='POD_Index', how='left')
    .merge(df_ph, left_on='Permit_Index_x', right_on='Permit_Index', how='left')
    .replace(np.nan, "")          # blank out missing values
    .reset_index(drop=True)
)

print(df.shape[0])
df.head(n=1)

## Time Series Data
- Exporting annual timeseries data (each record spans 01/01 - 12/31 of its use year).

In [None]:
# Return Data
# Build the temporary main dataframe in a single step. It shares df's index, and
# the key order of the dict below is the column order of the result.
dfout = pd.DataFrame({
    # Variable Info
    'in_VariableCV': "Withdrawal",
    'in_VariableSpecificCV': "",  # Timeseries specific.

    # Water Source Info
    'in_WaterSourceTypeCV': df['Source'],

    # Site Info
    'in_County': df['County'],
    'in_Latitude': df['Latitude'],
    'in_Longitude': df['Longitude'],
    'in_SiteNativeID': df['POD'].astype('str'),

    # Site Variable Amount Info
    'in_Amount': df['Reported_AcFt'],  # will convert from AcFt to MG
    'in_AssociatedNativeAllocationIDs': df['Permit_Number'].astype(str),
    'in_BeneficialUseCategory': df['Use_Type_x'],
    'in_CommunityWaterSupplySystem': df['Civil_Township'],
    # 'in_CropTypeCV': df['Crop_type1'],
    # 'in_IrrigatedAcreage': df['Reported_Acres'],
    # 'in_IrrigationMethodCV': df['Irrigation_Type'],
    # 'in_PowerGeneratedGWh': df['KWHrs'],
    'in_ReportYearCV': df['Use_Year'],
    # Each record covers its whole use year.
    'in_TimeframeStart': df['Use_Year'] + "/01/01",
    'in_TimeframeEnd': df['Use_Year'] + "/12/31",
}, index=df.index)

print(dfout.shape[0])
dfout.head(n=5)

## WaDE Custom Elements (due to missing info)

In [None]:
# updating in_WaterSourceTypeCV to be more machine readable / WaDE specific
# ----------------------------------------------------------------------------------------------------

def createWaterSourceTypeCV(inWST):
    """Return the water source type label used in the output.

    The input is converted to text and stripped of surrounding whitespace.
    A blank value becomes "Unspecified", "Ground Water" becomes "Groundwater",
    and any other value is returned as the stripped text.
    """
    cleaned = str(inWST).strip()
    relabel = {
        "": "Unspecified",
        "Ground Water": "Groundwater",
    }
    # Values without an entry in the table pass through unchanged.
    return relabel.get(cleaned, cleaned)

# Normalize every water source type value. Only this one column is read, so the
# helper is mapped over the column directly instead of building a full row
# Series for each record with DataFrame.apply(axis=1).
dfout['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV'].map(createWaterSourceTypeCV)
dfout['in_WaterSourceTypeCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID for the given value.

    The ID is the fixed prefix "WaDEND_WS" followed by the text form of
    ``colrowValue``.
    """
    # !s forces str() so the result matches plain string concatenation.
    return f"WaDEND_WS{colrowValue!s}"

# One row per distinct water source type; each row keeps the index label of the
# first record in dfout that carries that type.
dfWaterSourceNativeID = dfout[['in_WaterSourceTypeCV']].drop_duplicates()

# Number the distinct types 1..N in that order, on the same index, and turn each
# number into a native ID.
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(
    lambda countRow: assignWaterSourceNativeID(countRow['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID assigned to water source type ``A``.

    Reads the module-level ``dfWaterSourceNativeID`` table. Returns the
    'in_WaterSourceNativeID' value of the first row whose
    'in_WaterSourceTypeCV' equals ``A``. Returns '' when ``A`` is blank or
    null, or when no row matches.
    """
    # Blank / missing types never get an ID.
    if (A == '') or pd.isnull(A):
        return ''

    isMatch = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matchedIDs = dfWaterSourceNativeID.loc[isMatch, 'in_WaterSourceNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Attach the custom native ID that belongs to each record's water source type.
# Only one column is read, so the lookup is mapped over that column directly
# instead of building a full row Series per record with DataFrame.apply(axis=1).
dfout['in_WaterSourceNativeID'] = dfout['in_WaterSourceTypeCV'].map(retrieveWaterSourceNativeID)
dfout['in_WaterSourceNativeID'].unique()

In [None]:
# title format for beneficial use
# ----------------------------------------------------------------------------------------------------

def formatTitle(valA):
    """Return ``valA`` as stripped, title-cased text, or "Unspecified".

    "Unspecified" is returned when ``valA`` is null (None, NaN, pd.NA, ...)
    or when it is blank after stripping whitespace.
    """
    # The null test has to run on the raw value: once converted with str(),
    # None and NaN become the ordinary strings "None" / "nan" and can no
    # longer be detected.
    if pd.isnull(valA):
        return "Unspecified"

    titled = str(valA).strip().title()
    return titled if titled else "Unspecified"

# Title-case every beneficial use value. Only one column is read, so the helper
# is mapped over that column directly instead of building a full row Series per
# record with DataFrame.apply(axis=1).
dfout['in_BeneficialUseCategory'] = dfout['in_BeneficialUseCategory'].map(formatTitle)
dfout['in_BeneficialUseCategory'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------

def createVariableSpecificCV(inV, inBU, inWST):
    """Compose the variable-specific label for one record.

    The result has the form ``<variable>_Annual_<beneficial use>_<water source
    type>``. Every part is converted to text and stripped; the beneficial use
    is additionally title-cased.
    """
    variable = str(inV).strip()
    beneficialUse = str(inBU).strip().title()
    sourceType = str(inWST).strip()

    return "_".join([variable, "Annual", beneficialUse, sourceType])

# Build the variable-specific label of every record from its variable,
# beneficial use and water source type. The three columns are walked in lockstep
# instead of building a full row Series per record with DataFrame.apply(axis=1).
dfout['in_VariableSpecificCV'] = [
    createVariableSpecificCV(variable, beneficialUse, sourceType)
    for variable, beneficialUse, sourceType in zip(dfout['in_VariableCV'],
                                                   dfout['in_BeneficialUseCategory'],
                                                   dfout['in_WaterSourceTypeCV'])
]
dfout['in_VariableSpecificCV'].unique()

## Groupby and Sum
- Issue of multiple withdrawal values from the same site by permit.  Will cheat for now and aggregate all values at the single site using our WaDE site-specific (ss) aggregation rules.

In [None]:
# Continue on an independent (deep) copy so dfout itself stays untouched.
dfout2 = dfout.copy(deep=True)
print(dfout2.shape[0])
dfout2.head(n=1)

In [None]:
# Collapse the records to one row per site / variable / timeframe. Every other
# column becomes a comma separated string of its distinct non-blank values.
groupbyList = ['in_SiteNativeID', 'in_VariableSpecificCV', 'in_TimeframeStart', 'in_TimeframeEnd']

# dict.fromkeys de-duplicates while keeping first-seen order. A set would also
# de-duplicate, but its iteration order for strings can change from one Python
# process to the next, which would make the joined text differ between runs.
# NOTE(review): identical in_Amount values within one group are also collapsed to
# a single entry here, before the amounts are summed - confirm that is intended.
dfout2 = dfout2.groupby(groupbyList).agg(lambda x: ','.join([str(elem) for elem in dict.fromkeys(x) if elem!=''])).replace(np.nan, "").reset_index()
print(len(dfout2))
dfout2.head()

## Cleaning Output
- checking & changing data type & format.

In [None]:
# Turn the timeframe text into pandas datetimes.
# Step 1 parses the text; anything unparseable becomes NaT (errors='coerce').
# Step 2 renders the result as month/day/year text and parses that again.
for timeframeCol in ('in_TimeframeEnd', 'in_TimeframeStart'):
    parsedDates = pd.to_datetime(dfout2[timeframeCol], errors = 'coerce')
    dfout2[timeframeCol] = parsedDates
    dfout2[timeframeCol] = pd.to_datetime(dfout2[timeframeCol].dt.strftime('%m/%d/%Y'))

dfout2.head(n=1)

In [None]:
# summing up the comma separated list of Amounts to one value.
def sumAmountsFunc(valA):
    """Add up the numbers in a comma separated string.

    Parameters:
        valA: text such as "1.5,2.5" (any other value is converted with str()).

    Returns:
        The float sum of every entry that parses as a number. Blank and
        non-numeric entries are skipped. When no entry is numeric the empty
        string "" is returned, which the later pd.to_numeric(errors='coerce')
        step turns into NaN.
    """
    total = 0.0          # explicit start value for the running sum
    foundNumber = False  # distinguishes "sum is 0" from "nothing to sum"

    for token in str(valA).split(","):
        token = token.strip()
        if not token:
            continue
        try:
            total += float(token)
        except ValueError:
            # Not a number: leave it out instead of discarding the running total.
            continue
        foundNumber = True

    return total if foundNumber else ""

# Reduce each comma separated amount list to a single value. Only one column is
# read, so the helper is mapped over that column directly instead of building a
# full row Series per record with DataFrame.apply(axis=1).
dfout2['in_Amount'] = dfout2['in_Amount'].map(sumAmountsFunc)
dfout2.head(1)

In [None]:
# Converting numbers that are in string to float.

# in_Latitude & in_Longitude
# in_Latitude, in_Longitude, in_Amount and in_ReportYearCV all get the same
# treatment: parse as numbers, with unparseable values becoming NaN.
# # in_PowerGeneratedGWh
# dfout2['in_PowerGeneratedGWh'] = pd.to_numeric(dfout2['in_PowerGeneratedGWh'], errors='coerce')
for numericCol in ('in_Latitude', 'in_Longitude', 'in_Amount', 'in_ReportYearCV'):
    dfout2[numericCol] = pd.to_numeric(dfout2[numericCol], errors='coerce')

# in_ReportYearCV
# having some issues converting this to an int
# A missing year is stored as 0 so the column can be held as a plain int64.
dfout2['in_ReportYearCV'] = dfout2['in_ReportYearCV'].fillna(0).astype('int64')

dfout2.head(n=1)

In [None]:
# convert Amount AcFt to MG
def convertAmountFunc(valA):
    """Convert a volume from acre-feet (AcFt) to million gallons (MG).

    Parameters:
        valA: volume in acre-feet.

    Returns:
        The same volume in millions of US liquid gallons.
    """
    # 1 acre-foot = 43,560 ft^3 = 75,271,680 in^3, and a US liquid gallon is
    # 231 in^3, so 1 acre-foot = 325,851.43 gallons = 0.3258514... MG.
    # The factor used previously (0.28002596920264) is the acre-foot to
    # US *dry* gallon figure, which understates the result by about 14%.
    outVal = valA * 0.3258514285714286
    return outVal

# Convert every amount. Only one column is read, so the helper is mapped over
# that column directly instead of building a full row Series per record with
# DataFrame.apply(axis=1).
dfout2['in_Amount'] = dfout2['in_Amount'].map(convertAmountFunc)
dfout2.head(1)

## Export Outputfile(s)

In [None]:
# Print the column dtypes and non-null counts as a last check before exporting.
dfout2.info()

In [None]:
# Exporting output files.
# The file name is relative, so it is written to the current working directory;
# index=False keeps the dataframe's row index out of the CSV.
dfout2.to_csv('P_ndSSMaster.csv', index=False)  # The master output.

#### bonus:
- checking the processed sitespecificamounts.csv for duplicates and identifying why they occur

In [None]:
# dftest = pd.read_csv('G:/Shared drives/WaDE Data/NorthDakota/SiteSpecificAmounts/ProcessedInputData/sitespecificamounts.csv')
# print(len(dftest))
# dftest.head()

In [None]:
# print(dftest['VariableSpecificUUID'].unique()

In [None]:
# dfdp = dftest.copy()
# duplicateCheckList = ['OrganizationUUID', 'SiteUUID', 'VariableSpecificUUID', 'BeneficialUseCategory', 'ReportYearCV', 'TimeframeEnd', 'TimeframeStart']
# dfdp = dfdp.drop_duplicates(subset=duplicateCheckList)
# print(len(dfdp))
# dfdp.head()

In [None]:
# #duplicate groupby test
# dfgbt = dftest.copy()
# groupbyList = ['OrganizationUUID', 'SiteUUID', 'VariableSpecificUUID', 'BeneficialUseCategory', 'ReportYearCV', 'TimeframeEnd', 'TimeframeStart']
# dfgbt = dfgbt.groupby(groupbyList).agg(lambda x: ','.join([str(elem) for elem in (list(set(x))) if elem!=''])).replace(np.nan, "").reset_index()
# print(len(dfgbt))
# dfgbt.head()

In [None]:
# dfgbt.to_excel('duplicate groupby test.xlsx', index=False)

In [None]:
# Scratch (deep) copy of the output frame, so the experiments below leave dfout2 untouched.
dftest = dfout2.copy(deep=True)
dftest

In [None]:
# Creating a test column on the copied output frame
# ----------------------------------------------------------------------------------------------------

def createTestColumn(inV):
    """Return the stripped text form of ``inV`` followed by "_test Yo"."""
    return f"{str(inV).strip()}_test Yo"

# Fill the test column from in_VariableCV. Only one column is read, so the
# helper is mapped over that column directly instead of building a full row
# Series per record with DataFrame.apply(axis=1).
dftest['testColumn'] = dftest['in_VariableCV'].map(createTestColumn)
dftest['testColumn']