# Pre-processing California Allocation data for WaDEQA upload.
Date Updated: 07/21/2021
Purpose:  To pre-process the California data into one master file for simple DataFrame creation and extraction

Notes:
- asdf

In [None]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory
# The input and output file names used in this notebook are bare (relative) names,
# so the current directory is switched to the raw-input folder before any file is touched.
workingDir = "G:/Shared drives/WaDE Data/California/WaterAllocation/RawInputData"
os.chdir(workingDir)

In [None]:
# Input Data & dataframe creation
# File names are resolved relative to the current working directory.
fileInput1 = "EWRIMS MASTER FLAT FILE DATA DICTIONARY DRAFT 1-17-20.xlsx" # date & beneficial use data
fileInput2 = "POD_Attributes_input.csv" # wr data
fileInput3 = "Points_of_Diversion_20210701_input.csv" # site data

# Each input is loaded into its own DataFrame and NaN is replaced with "" at load time.
# The Excel input is read from the "ewrims_flat_file" sheet, using its first row as the header.
# NOTE(review): the replacement only covers values present at load time; left-joins on these
# frames can still produce NaN for rows without a match.
df_emff = pd.read_excel(fileInput1, header=0, sheet_name="ewrims_flat_file", skiprows=0).replace(np.nan, "")
df_poda = pd.read_csv(fileInput2).replace(np.nan, "")
df_point = pd.read_csv(fileInput3).replace(np.nan, "")

In [None]:
# Build the master frame with two successive left-joins, so every row of df_poda is kept.

# 1) Attach the matching df_point record: CORE_POD_ID (left) == POD_ID (right).
dfPOD = df_poda.merge(df_point, how='left', left_on='CORE_POD_ID', right_on='POD_ID')

# 2) Attach only the date and use-code columns of df_emff, keyed on WR_WATER_RIGHT_ID.
dfPOD = dfPOD.merge(
    df_emff[[
        'WR_WATER_RIGHT_ID',
        'PRIORITY_DATE',
        'DIRECT_DIV_SEASON_END',
        'DIRECT_DIV_SEASON_START',
        'APPLICATION_ACCEPTANCE_DATE',
        'USE_CODE',
    ]],
    how='left',
    on='WR_WATER_RIGHT_ID',
)

# Row count after both joins, followed by a quick preview.
print(len(dfPOD))
dfPOD.head(3)

In [None]:
print("Allocation priority date...")

def formatDateString(inString1):
    """Format a date-like value as an 'MM/DD/YYYY' string.

    Args:
        inString1: Raw cell value (text, timestamp, NaN, None, ...). It is
            converted with str() and stripped before being parsed.

    Returns:
        The date formatted with '%m/%d/%Y', or '' when the value is blank,
        missing, or cannot be parsed as a date.
    """
    inString = str(inString1).strip()
    if inString == '':
        return ''
    try:
        valD = pd.to_datetime(inString)
    except (ValueError, TypeError, OverflowError):
        # Unparseable or out-of-range text means "no date"; other exceptions
        # (e.g. KeyboardInterrupt) are deliberately not swallowed.
        return ''
    # Missing values stringify to text such as 'nan', which parses to NaT;
    # NaT has no calendar date to format.
    if pd.isnull(valD):
        return ''
    return valD.date().strftime('%m/%d/%Y')

# Prefer PRIORITY_DATE. formatDateString returns '' when a value is blank, missing (NaN from
# the left-join), whitespace-only, or unparseable; in every such case fall back to
# APPLICATION_ACCEPTANCE_DATE instead of leaving the priority date empty.
dfPOD['in_AllocationPriorityDate'] = dfPOD.apply(
    lambda row: formatDateString(row['PRIORITY_DATE']) or formatDateString(row['APPLICATION_ACCEPTANCE_DATE']),
    axis=1,
)
dfPOD.head(3)

In [None]:
print("Timeframe start and time frame end...")

def formatDateString2(inString1):
    """Format a date-like value as an 'MM/DD' string (month and day only).

    Args:
        inString1: Raw cell value (text, timestamp, NaN, None, ...). It is
            converted with str() and stripped before being parsed.

    Returns:
        The date formatted with '%m/%d', or '' when the value is blank,
        missing, or cannot be parsed as a date.
    """
    inString = str(inString1).strip()
    if inString == '':
        return ''
    try:
        valD = pd.to_datetime(inString)
    except (ValueError, TypeError, OverflowError):
        # Unparseable or out-of-range text means "no date"; other exceptions
        # (e.g. KeyboardInterrupt) are deliberately not swallowed.
        return ''
    # Missing values stringify to text such as 'nan', which parses to NaT;
    # NaT has no calendar date to format.
    if pd.isnull(valD):
        return ''
    return valD.date().strftime('%m/%d')

# Output column -> raw season column; each raw value is reduced to 'MM/DD' by formatDateString2.
seasonColumns = {
    'in_AllocationTimeframeStart': 'DIRECT_DIV_SEASON_START',
    'in_AllocationTimeframeEnd': 'DIRECT_DIV_SEASON_END',
}
for outColumn, rawColumn in seasonColumns.items():
    # rawColumn is bound as a default argument so each lambda reads its own column.
    dfPOD[outColumn] = dfPOD.apply(lambda row, rawColumn=rawColumn: formatDateString2(row[rawColumn]), axis=1)
dfPOD.head(3)

In [None]:
#Creating WaDE Owner field.
#Built from the FIRST_NAME and LAST_NAME fields. If one is empty, the other is used alone; if both are empty, "Unspecified" is used.

def retrieveOwner(FN, LN):
    """Build the owner name from a first-name and a last-name value.

    Args:
        FN: First-name cell value; None/NaN is treated as empty.
        LN: Last-name cell value; None/NaN is treated as empty.

    Returns:
        'First Last' when both parts are present, the single non-empty part
        when only one is present, or 'Unspecified' when both are empty.
    """
    # Null must be tested BEFORE str(): str(NaN) is 'nan' and str(None) is
    # 'None', which would otherwise leak into the owner name as real text.
    first = "" if pd.isnull(FN) else str(FN).strip()
    last = "" if pd.isnull(LN) else str(LN).strip()

    # Join only the non-empty parts; no per-row printing, since this runs
    # once for every row of the DataFrame.
    nameParts = [part for part in (first, last) if part]
    return " ".join(nameParts) if nameParts else "Unspecified"

def _ownerForRow(row):
    # Row adapter: hand the two name cells of one dfPOD row to retrieveOwner.
    return retrieveOwner(row['FIRST_NAME'], row['LAST_NAME'])

dfPOD['in_WaDEOwner'] = dfPOD.apply(_ownerForRow, axis=1)
dfPOD.head(3)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: 'WaDECA_WS' followed by str(colrowValue)."""
    return f"WaDECA_WS{colrowValue!s}"

# One row per distinct SOURCE_NAME; the first occurrence is kept along with its dfPOD index.
dfWaterSourceNativeID = pd.DataFrame({'in_WaterSourceName': dfPOD['SOURCE_NAME']}).drop_duplicates()

# Number the distinct sources 1..N on that same index, then turn each number into an ID string.
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(
    lambda countRow: assignWaterSourceNativeID(countRow['Count']),
    axis=1,
)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the generated native ID for water source name A.

    Reads the module-level dfWaterSourceNativeID table and returns the
    'in_WaterSourceNativeID' of the first row whose 'in_WaterSourceName'
    equals A, or '' when no row matches.
    """
    nameMatches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    matchedIDs = dfWaterSourceNativeID.loc[nameMatches, 'in_WaterSourceNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

def _nativeIDForRow(row):
    # Row adapter: resolve one dfPOD row's SOURCE_NAME to its generated native ID.
    return retrieveWaterSourceNativeID(row['SOURCE_NAME'])

dfPOD['in_WaterSourceNativeID'] = dfPOD.apply(_nativeIDForRow, axis=1)
dfPOD.head(3)

## Export Outputs

In [None]:
# Temporarily lift pandas' row and column display limits (only inside this `with` block)
# so the dtype of every dfPOD column is printed without truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfPOD.dtypes)

In [None]:
#Exporting to Finished File
# The file name is relative, so the CSV is written to the current working directory.
# index=False leaves the DataFrame's row index out of the file.
dfPOD.to_csv('P_CaliforniaMaster.csv', index=False)  # The output