# Pre-processing Oklahoma Allocation data for WaDEQA upload.
Date Updated: 04/07/2020
Purpose:  To pre-process the Oklahoma data into one master file for simple DataFrame creation and extraction, and to validate datatypes and other data-related information.

In [None]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', 999)  # Let pandas show up to 999 columns of a DataFrame before truncating the notebook output

#Working Directory and Input File
# NOTE(review): absolute, machine-specific path - adjust it before running on another machine.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Oklahoma/WaterAllocation/RawInputData"
os.chdir(workingDir)  # relative file names used by later cells resolve against this folder

## Division Data

In [None]:
# Raw input exports: permitted groundwater wells and permitted surface water diversion points.
PGW_Input, PSWDP_Input = (
    "Permitted_Groundwater_Wells_input.csv",
    "Permitted_Surface_Water_Diversion_Points_input.csv",
)

# Load both exports with read_csv's default encoding.
# Alternative kept for reference: pd.read_csv(<file>, encoding = "ISO-8859-1")
df_PGW, df_PSWDP = (pd.read_csv(csvName) for csvName in (PGW_Input, PSWDP_Input))

# Stack the two exports (they are expected to share the same columns).
# ignore_index=True already gives the result a fresh 0..n-1 index.
dfPOD = pd.concat([df_PGW, df_PSWDP], ignore_index=True)

print(len(dfPOD))
dfPOD.head(1)

In [None]:
# PODorPOUSite insert
# Tag every row of this frame as "POD" (presumably "point of diversion" - confirm).
# The tag is one of the three fields later used to de-duplicate sites and to look up
# the custom site native ID.
dfPOD['in_PODorPOUSite'] = "POD"
dfPOD.head(3)

In [None]:
# Convert the date fields in use to datetime.
# Step 1 parses the raw values, turning anything unparseable into NaT.
# Step 2 round-trips through an MM/DD/YYYY string, so only the calendar date survives.
for dateCol in ('DATE_FILED', 'DATE_ISSUED'):
    parsedDates = pd.to_datetime(dfPOD[dateCol], errors = 'coerce')
    dfPOD[dateCol] = pd.to_datetime(parsedDates.dt.strftime('%m/%d/%Y'))

## Area of Use Data

In [None]:
# Input Files
# Relative file name: resolved against the current working directory.
AOU_Input = "OK_AreasofUse_input.csv"

# Dataframe creation
# Read with read_csv's default encoding; the commented-out call below is the same read
# with an explicit ISO-8859-1 encoding.
# dfPOU = pd.read_csv(AOU_Input, encoding = "ISO-8859-1")
dfPOU = pd.read_csv(AOU_Input)

# Row count, then a preview of the first three rows (notebook display).
print(len(dfPOU))
dfPOU.head(3)

In [None]:
# PODorPOUSite insert
# Tag every row of this frame as "POU" (presumably "place of use" - confirm).
# The tag is one of the three fields later used to de-duplicate sites and to look up
# the custom site native ID.
dfPOU['in_PODorPOUSite'] = "POU"
dfPOU.head(3)

In [None]:
# Convert the date fields in use to datetime.
# Step 1 parses the raw values, turning anything unparseable into NaT.
# Step 2 round-trips through an MM/DD/YYYY string, so only the calendar date survives.
for dateCol in ('DATE_FILED', 'DATE_ISSUED'):
    parsedDates = pd.to_datetime(dfPOU[dateCol], errors = 'coerce')
    dfPOU[dateCol] = pd.to_datetime(parsedDates.dt.strftime('%m/%d/%Y'))

## Concatenate POD and POU

In [None]:
# Stack the POD rows on top of the POU rows to form the master frame.
# ignore_index=True already gives the result a fresh 0..n-1 index.
# NOTE(review): the two frames are assumed to share the same columns - not verified here.
df = pd.concat([dfPOD, dfPOU], ignore_index=True)

print(len(df))
df

## Data Fix

In [None]:
# Print the dtype of every column in df. The option context lifts pandas' row and column
# display limits for this block only, so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

In [None]:
#Fixing Beneficial Uses PRIMARY_PURPOSE
def fixRecFishWild(colrowValue):
    """Return the beneficial-use value with its one comma-bearing label rewritten.

    'Recreation, Fish, Wildlife' becomes 'Recreation Fish Wildlife'; every
    other value (including missing ones) is handed back unchanged.
    """
    commaLabel = 'Recreation, Fish, Wildlife'
    return 'Recreation Fish Wildlife' if colrowValue == commaLabel else colrowValue

# Apply the fix column-wise: Series.map hands each PRIMARY_PURPOSE value straight to
# fixRecFishWild, rather than building a full row object per record just to read one field.
df['PRIMARY_PURPOSE'] = df['PRIMARY_PURPOSE'].map(fixRecFishWild)

In [None]:
# Build the owner name: remove the commas from ENTITY_NAME-style values.
# NOTE: the segments keep their original order - nothing is swapped.

def createOwnerName(val):
    """Return *val* as a comma-free owner name.

    Missing values (None / NaN / pd.NA) and the empty string give "".
    Anything else is converted to text and stripped. If it contains commas,
    every comma-separated segment is stripped and the non-empty segments are
    joined with single spaces, in their original order. All segments are
    kept, so "Smith, John, Jr." becomes "Smith John Jr.".
    """
    # Null check first: comparing pd.NA with "" cannot be used as a plain boolean.
    if pd.isnull(val) or val == "":
        return ""

    cleaned = str(val).strip()
    if "," not in cleaned:
        return cleaned

    # Skip empty segments so a trailing or doubled comma leaves no stray spaces.
    segments = (segment.strip() for segment in cleaned.split(","))
    return " ".join(segment for segment in segments if segment)

# Derive the owner column value-by-value from ENTITY_NAME. Series.map passes missing
# values through to createOwnerName as well, and avoids building a full row object per record.
df['in_AllocationOwner'] = df['ENTITY_NAME'].map(createOwnerName)
df

## WaDE Custom Elements (due to missing state site info)

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the custom site ID: the prefix "WaDEOK_S" followed by str(colrowValue)."""
    return f"WaDEOK_S{colrowValue!s}"

# One row per distinct (latitude, longitude, POD/POU tag) combination found in df.
# The de-duplicated frame keeps the index labels of the df rows it was taken from.
dfSiteNativeID = df[['LATITUDE', 'LONGITUDE', 'in_PODorPOUSite']].rename(
    columns={'LATITUDE': 'in_Latitude', 'LONGITUDE': 'in_Longitude'}).drop_duplicates()

# Sequential 1-based counter placed on that same index, so the generated IDs line up
# with their site rows when assigned below.
dftemp = pd.DataFrame({"Count": range(1, len(dfSiteNativeID) + 1)}, index=dfSiteNativeID.index)

# Turn each counter value into an ID string ("WaDEOK_S1", "WaDEOK_S2", ...), value by value.
dfSiteNativeID['in_SiteNativeID'] = dftemp['Count'].map(assignSiteUUID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
# Build the (latitude, longitude, POD/POU tag) -> site native ID table once, so each row
# costs a single dictionary lookup instead of a boolean scan of all of dfSiteNativeID.
siteNativeIDLookup = dict(zip(
    zip(dfSiteNativeID['in_Latitude'],
        dfSiteNativeID['in_Longitude'],
        dfSiteNativeID['in_PODorPOUSite']),
    dfSiteNativeID['in_SiteNativeID']))


def retrieveSiteNativeID(A, B, C):
    """Return the custom site native ID for one record, or '' when there is none.

    A -- latitude of the record
    B -- longitude of the record
    C -- the record's in_PODorPOUSite tag

    '' is returned when a coordinate is missing, when both coordinates are
    empty strings, or when the combination is not present in the lookup table.
    """
    # A missing coordinate can never equal a stored one, so answer '' directly.
    if pd.isnull(A) or pd.isnull(B):
        return ''
    if A == '' and B == '':
        return ''
    return siteNativeIDLookup.get((A, B, C), '')


df['in_SiteNativeID'] = [
    retrieveSiteNativeID(lat, lon, siteTag)
    for lat, lon, siteTag in zip(df['LATITUDE'], df['LONGITUDE'], df['in_PODorPOUSite'])
]
df

In [None]:
#Exporting to Finished File
# Relative path: the file is written into the current working directory (set with
# os.chdir(workingDir) near the top of the notebook). index=False leaves the DataFrame
# index out of the CSV.
df.to_csv('P_OklahomaMaster.csv', index=False)  # The output