# Preprocessing New Jersey Specific data for WaDEQA upload.
- Date Updated: 05/13/2022

Notes:
- Working with estimated return and withdrawal monthly timeseries data in MG per site, with multiple sites for each municipal area.
- We will aggregate sites to the area.
- Available data...
    - return_MunDischarge.csv & withd_MunWithdrawal.csv have the ts amount info.
    - return_MunInfo.csv & withd_MunInfo.csv have the keys needed to attach shapefile site info to the ts amount info.
    - Municipal_Boundaries_of_NJ.csv contains the shapefile site info to attach to the ts amount info.
    - return_MunSiteInfo.csv & withd_MunSiteInfo.csv have some water source type info.
- Match ts amount data to info data via MCDCode field -> shapefile site info via GNISCode field. 
- Match siteinfo to the ts amount via SiteName field.

In [None]:
# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd # the library that lets us read in shapefiles

# visualization
import matplotlib.pyplot as plot
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # raise the display limit to 999 columns so wide DataFrames are shown in full

In [None]:
# Working Directory and Input File
# The process working directory is switched to this folder, so the relative
# file names used in the read/export calls below resolve against it.
workingDir = "G:/Shared drives/WaDE Data/NewJersey/SiteSpecificAmounts/RawInputData"
os.chdir(workingDir)

## Inputs and Dataframe Creation

In [None]:
# 1a) return_MunDischarge.xlsx
# Return (discharge) timeseries amounts. Missing cells are replaced with empty strings.
fileInput = "return_MunDischarge.xlsx"
dfr_md = pd.read_excel(fileInput).replace(np.nan, "")
# Quick sanity check: row count and first record.
print(len(dfr_md))
dfr_md.head(1)

In [None]:
# 1b) return_MunInfo.xlsx
# Key table for the return data (joined on MCDCode further down). Missing cells become "".
fileInput = "return_MunInfo.xlsx"
dfr_mi = pd.read_excel(fileInput).replace(np.nan, "")
# Quick sanity check: row count and first record.
print(len(dfr_mi))
dfr_mi.head(1)

In [None]:
# 1c) return_MunSiteInfo.xlsx
# Site info for the return data (joined on SiteName further down). Missing cells become "".
fileInput = "return_MunSiteInfo.xlsx"
dfr_msi = pd.read_excel(fileInput).replace(np.nan, "")
# Quick sanity check: row count and first record.
print(len(dfr_msi))
dfr_msi.head(1)

In [None]:
# 2a) withd_MunWithdrawal.xlsx
# Withdrawal timeseries amounts. Missing cells are replaced with empty strings.
fileInput = "withd_MunWithdrawal.xlsx"
dfw_mw = pd.read_excel(fileInput).replace(np.nan, "")
# Quick sanity check: row count and first record.
print(len(dfw_mw))
dfw_mw.head(1)

In [None]:
# 2b) withd_MunInfo.xlsx
# Key table for the withdrawal data (joined on MCDCode further down). Missing cells become "".
fileInput = "withd_MunInfo.xlsx"
dfw_mi = pd.read_excel(fileInput).replace(np.nan, "")
# Quick sanity check: row count and first record.
print(len(dfw_mi))
dfw_mi.head(1)

In [None]:
# 2c) withd_MunSiteInfo.xlsx
# Site info for the withdrawal data (joined on SiteName further down). Missing cells become "".
fileInput = "withd_MunSiteInfo.xlsx"
dfw_msi = pd.read_excel(fileInput).replace(np.nan, "")
# Quick sanity check: row count and first record.
print(len(dfw_msi))
dfw_msi.head(1)

In [None]:
# 3) Municipal_Boundaries_of_NJ.csv
# Municipal boundary attribute table (joined on GNIS further down). Missing cells become "".
fileInput = "Municipal_Boundaries_of_NJ.csv"
df_mb = pd.read_csv(fileInput).replace(np.nan, "")
# Quick sanity check: row count and first record.
print(len(df_mb))
df_mb.head(1)

In [None]:
# Left-join the return tables in sequence:
#   amounts -> key table (MCDCode) -> municipal boundaries (GNISCode = GNIS) -> site info (SiteName).
# NOTE(review): a left join on a non-unique key repeats rows; compare the printed
# length with len(dfr_md) to confirm no duplication was introduced.
dfr = (
    dfr_md
    .merge(dfr_mi, on='MCDCode', how='left')
    .merge(df_mb, left_on='GNISCode', right_on='GNIS', how='left')
    .merge(dfr_msi, on='SiteName', how='left')
)

print(len(dfr))
dfr.head(3)

In [None]:
# Left-join the withdrawal tables in sequence:
#   amounts -> key table (MCDCode) -> municipal boundaries (GNISCode = GNIS) -> site info (SiteName).
# NOTE(review): a left join on a non-unique key repeats rows; compare the printed
# length with len(dfw_mw) to confirm no duplication was introduced.
dfw = (
    dfw_mw
    .merge(dfw_mi, on='MCDCode', how='left')
    .merge(df_mb, left_on='GNISCode', right_on='GNIS', how='left')
    .merge(dfw_msi, on='SiteName', how='left')
)

print(len(dfw))
dfw.head(3)

## Time Series Data
- Exporting Monthly timeseries data.

In [None]:
# Return Data
# Shared "YYYY/M" prefix for the timeframe strings.
returnYearMonth = dfr['YearNumber'].astype(str) + "/" + dfr['MonthNumber'].astype(str)

# Ordered (output column, value) pairs; the order here is the column order of dfr_temp.
returnColumnSpec = [
    # Variable Info
    ('in_VariableCV', "Return"),
    ('in_VariableSpecificCV', ""),  # Timeseries specific; filled in later.

    # Water Source Info
    ('in_WaterSourceTypeCV', dfr['GWorSW']),

    # Site Info
    ('in_County', dfr['COUNTY']),
    ('in_GNISCodeCV', dfr['GNIS']),
    ('in_Latitude', dfr['Lat'].astype(float)),
    ('in_Longitude', dfr['Long'].astype(float)),
    ('in_SiteName', dfr['NAME']),
    ('in_SiteNativeID', dfr['GNIS'].astype('Int64').astype('str')),
    ('in_SiteTypeCV', dfr['MUN_TYPE'].astype(str)),

    # Site Variable Amount Info
    ('in_Amount', dfr['ReturnMG'].astype(float)),
    ('in_AssociatedNativeAllocationIDs', dfr['PermitNumber'].astype(str)),
    ('in_BeneficialUseCategory', dfr['UseGroup']),
    ('in_CommunityWaterSupplySystem', dfr['GNIS_NAME']),
    ('in_PopulationServed', dfr['POP2010']),
    ('in_ReportYearCV', dfr['YearNumber']),

    ('in_TimeframeStart', returnYearMonth + "/01"),
    ('in_TimeframeEnd', returnYearMonth + "/28"),  # cheat for now and use value of 28 for day
]

# Create temporary main dataframe on the same row labels as dfr and fill it column by column.
dfr_temp = pd.DataFrame(index=dfr.index)
for outColumn, columnValue in returnColumnSpec:
    dfr_temp[outColumn] = columnValue

print(len(dfr_temp))
dfr_temp.head(1)

In [None]:
# Withdrawal Data
# Shared "YYYY/M" prefix for the timeframe strings.
withdrawalYearMonth = dfw['YearNumber'].astype(str) + "/" + dfw['MonthNumber'].astype(str)

# Ordered (output column, value) pairs; the order here is the column order of dfw_temp.
withdrawalColumnSpec = [
    # Variable Info
    ('in_VariableCV', "Withdrawal"),
    ('in_VariableSpecificCV', ""),  # Timeseries specific; filled in later.

    # Water Source Info
    ('in_WaterSourceTypeCV', dfw['GWorSW']),

    # Site Info
    ('in_County', dfw['COUNTY']),
    ('in_GNISCodeCV', dfw['GNIS']),
    ('in_Latitude', dfw['Lat'].astype(float)),
    ('in_Longitude', dfw['Long'].astype(float)),
    ('in_SiteName', dfw['NAME']),
    ('in_SiteNativeID', dfw['GNIS'].astype('Int64').astype('str')),
    ('in_SiteTypeCV', dfw['MUN_TYPE'].astype(str)),

    # Site Variable Amount Info
    ('in_Amount', dfw['WithdrawalMG'].astype(float)),
    ('in_AssociatedNativeAllocationIDs', dfw['PermitNumber'].astype(str)),
    ('in_BeneficialUseCategory', dfw['UseGroup']),
    ('in_CommunityWaterSupplySystem', dfw['GNIS_NAME']),
    ('in_PopulationServed', dfw['POP2010']),
    ('in_ReportYearCV', dfw['YearNumber']),

    ('in_TimeframeStart', withdrawalYearMonth + "/01"),
    ('in_TimeframeEnd', withdrawalYearMonth + "/28"),  # cheat for now and use value of 28 for day
]

# Create temporary main dataframe on the same row labels as dfw and fill it column by column.
dfw_temp = pd.DataFrame(index=dfw.index)
for outColumn, columnValue in withdrawalColumnSpec:
    dfw_temp[outColumn] = columnValue

print(len(dfw_temp))
dfw_temp.head(1)

In [None]:
# Stack the return rows on top of the withdrawal rows and renumber the rows 0..N-1.
frames = [dfr_temp, dfw_temp]
dfout = pd.concat(frames, ignore_index=True)
print(len(dfout))
dfout.head(1)

## WaDE Custom Elements (due to missing info)

In [None]:
# updating in_WaterSourceTypeCV to be more machine readable / WaDE specific
# ----------------------------------------------------------------------------------------------------

def createWaterSourceTypeCV(inWST):
    """Translate a raw water source code into its WaDE label.

    Args:
        inWST: Raw code; it is converted with str() and stripped before lookup.

    Returns:
        "Surface Water" for "SW", "Groundwater" for "GW", "Surface Groundwater"
        for "SG", and "Unspecified" for "", "un", or any other value.
    """
    labelByCode = {
        "SW": "Surface Water",
        "GW": "Groundwater",
        "SG": "Surface Groundwater",
    }
    # Unrecognised codes (e.g. "nan" produced by str() on a missing value) fall
    # back to "Unspecified"; previously they raised UnboundLocalError because
    # no branch assigned the result.
    return labelByCode.get(str(inWST).strip(), "Unspecified")

# Element-wise transform of a single column: Series.map avoids building a full
# row object per record just to read one field.
dfout['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV'].map(createWaterSourceTypeCV)
dfout['in_WaterSourceTypeCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "WaDENJ_WS" followed by str(colrowValue)."""
    return f"WaDENJ_WS{colrowValue!s}"

# Lookup table: one row per distinct water source type (first occurrence kept).
dfWaterSourceNativeID = pd.DataFrame(
    {'in_WaterSourceTypeCV': dfout['in_WaterSourceTypeCV']}
).drop_duplicates()

# Number the distinct types 1..N on the same row labels, then turn each number into an ID.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID for water source type ``A``.

    Returns '' when ``A`` is an empty string or null, or when
    dfWaterSourceNativeID has no row whose 'in_WaterSourceTypeCV' equals ``A``;
    otherwise returns the first matching 'in_WaterSourceNativeID'.
    """
    if A == '' or pd.isnull(A):
        return ''

    isMatch = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matches = dfWaterSourceNativeID.loc[isMatch, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Element-wise lookup on a single column: Series.map avoids building a full
# row object per record just to read one field.
dfout['in_WaterSourceNativeID'] = dfout['in_WaterSourceTypeCV'].map(retrieveWaterSourceNativeID)
dfout['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------

def createVariableSpecificCV(inV, inBU, inWST):
    """Build the variable-specific name "<variable>_Monthly_<beneficial use>_<water source type>".

    Each argument is converted with str() and stripped; the beneficial use is
    additionally title-cased.
    """
    pieces = (
        str(inV).strip() + "_Monthly",
        str(inBU).strip().title(),
        str(inWST).strip(),
    )
    return "_".join(pieces)

# Walk the three source columns in lockstep instead of materialising a row
# object per record with DataFrame.apply(axis=1).
dfout['in_VariableSpecificCV'] = [
    createVariableSpecificCV(variableCV, beneficialUse, waterSourceType)
    for variableCV, beneficialUse, waterSourceType in zip(
        dfout['in_VariableCV'],
        dfout['in_BeneficialUseCategory'],
        dfout['in_WaterSourceTypeCV'],
    )
]
dfout['in_VariableSpecificCV'].unique()

## Groupby and Sum

In [None]:
# Work on a copy so dfout keeps the un-aggregated rows.
dfout2 = dfout.copy()
print(len(dfout2))
dfout2.head(1)

In [None]:
def _joinUniqueValues(values):
    """Join the distinct non-empty values of one group/column into a comma-separated string.

    Values are kept in first-appearance order. The earlier set()-based version
    iterated in hash order, which for strings can differ from one Python
    session to the next and made the exported text non-reproducible.
    """
    return ','.join(str(elem) for elem in dict.fromkeys(values) if elem != '')


# Collapse the rows to one per site / variable / month. Every non-key column
# becomes a comma-separated string of its distinct values.
# NOTE(review): in_Amount is de-duplicated as well, so two records in the same group
# that report the same amount contribute only once to the later sum — confirm this is
# intended (it may be compensating for duplicate rows created by the joins).
groupbyList = ['in_SiteNativeID', 'in_VariableSpecificCV', 'in_TimeframeStart', 'in_TimeframeEnd']
dfout2 = dfout2.groupby(groupbyList).agg(_joinUniqueValues).replace(np.nan, "").reset_index()
print(len(dfout2))
dfout2.head()

## Cleaning Output

In [None]:
# Parse the timeframe strings into datetime values; anything unparseable becomes NaT.
# The previous strftime('%m/%d/%Y') -> pd.to_datetime round trip re-parsed the same
# dates and has been dropped: it produced the same datetime column.
for timeframeColumn in ('in_TimeframeEnd', 'in_TimeframeStart'):
    dfout2[timeframeColumn] = pd.to_datetime(dfout2[timeframeColumn], errors='coerce')

dfout2.head(1)

In [None]:
# summing up the comma separated list of Amounts to one value.
def sumAmountsFunc(valA):
    """Add up a comma-separated list of amounts.

    Args:
        valA: Text such as "1.5,2.0". Non-string input is converted with str(),
            so a bare number is accepted too.

    Returns:
        The float total of every token that parses as a number, or "" when no
        token does.
    """
    total = 0.0
    foundNumber = False
    for token in str(valA).split(","):
        try:
            total += float(token)
        except ValueError:
            # Blank or non-numeric tokens are skipped. The earlier version
            # overwrote the running total with the offending token instead.
            continue
        foundNumber = True
    return total if foundNumber else ""

# Element-wise transform of a single column: Series.map avoids building a full
# row object per record just to read one field.
dfout2['in_Amount'] = dfout2['in_Amount'].map(sumAmountsFunc)
dfout2.head(1)

In [None]:
# Converting numbers that are in string to float.
# Values that cannot be parsed as numbers become NaN.
for numericColumn in ('in_Latitude', 'in_Longitude', 'in_Amount'):
    dfout2[numericColumn] = pd.to_numeric(dfout2[numericColumn], errors='coerce')

# in_PopulationServed: unparseable values count as 0 so the column can be stored as int.
populationServed = pd.to_numeric(dfout2['in_PopulationServed'], errors='coerce')
dfout2['in_PopulationServed'] = populationServed.fillna(0)
dfout2['in_PopulationServed'] = dfout2['in_PopulationServed'].astype(int)

# in_ReportYearCV
dfout2['in_ReportYearCV'] = pd.to_numeric(dfout2['in_ReportYearCV'], errors='coerce')

dfout2.head(1)

In [None]:
# title format for beneficial use
# ----------------------------------------------------------------------------------------------------

def formatTitle(A):
    """Return ``A`` stripped and title-cased, or "Unspecified" when it is blank.

    Null values, empty strings and whitespace-only strings all count as blank.
    Whitespace is stripped before the emptiness test (as createWaterSourceTypeCV
    does), so an input like "   " no longer comes back as "".
    """
    if pd.isnull(A):
        return "Unspecified"

    cleaned = str(A).strip()
    return cleaned.title() if cleaned else "Unspecified"

# Element-wise transform of a single column: Series.map avoids building a full
# row object per record just to read one field.
dfout2['in_BeneficialUseCategory'] = dfout2['in_BeneficialUseCategory'].map(formatTitle)
dfout2['in_BeneficialUseCategory'].unique()

## Shapefile Data
- For attaching geometry to POU areas and sites.

In [None]:
# PoU Shapefile Data
# Shapefile input
# Read the municipal boundary shapefile (path is relative to the working directory)
# and wrap the result in a plain pandas DataFrame.
ShapeFileInput = gpd.read_file('shapefiles/Municipal_Boundaries_of_NJ.shp')
dfPoUshapetemp = pd.DataFrame(ShapeFileInput)
dfPoUshapetemp.head(3)

In [None]:
# Build the geometry lookup: one site ID column plus the shape itself.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
# Same GNIS -> nullable integer -> string conversion that is used for in_SiteNativeID
# in the timeseries frames, so the IDs are formatted the same way in both outputs.
dfPoUshape['in_SiteNativeID'] = dfPoUshapetemp['GNIS'].astype('Int64').astype('str')
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# Drop fully identical rows, keeping the first occurrence (all arguments are the defaults).
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfPoUshape.head(3)

## Export Outputfile(s)

In [None]:
# Final check of column dtypes and non-null counts before export.
dfout2.info()

In [None]:
# Exporting output files.
# Both files are written without the row index, relative to the working directory.
dfout2.to_csv('P_njSSMaster.csv', index=False)  # The master output.
dfPoUshape.to_csv('P_njSSGeometry.csv', index=False) # The output geometry.

#### bonus:

In [None]:
# # Exporting output files.
# dfw.to_excel('withdrawalCombine.xlsx', index=False)

In [None]:
# print(len(dfw))

In [None]:
# dfw2 = dfw[dfw.duplicated()]
# print(len(dfw2))

In [None]:
# # Exporting output files.
# dfw2.to_excel('dfw2_withdrawalCombine.xlsx', index=False)

In [None]:
# dfout2 = dfout[dfout.duplicated()]
# print(len(dfout2))

In [None]:
# # Exporting output files.
# dfout2.to_excel('dfout2_withdrawalCombine.xlsx', index=False)

In [None]:
# Diagnostic: count output rows whose site ID is missing.
# in_SiteNativeID is built with .astype('Int64').astype('str'); a missing value in a
# nullable-integer column is rendered as '<NA>', not 'nan', so checking only for 'nan'
# reports 0 even when IDs are missing. 'nan' is still checked for backward compatibility,
# and '' covers blank IDs.
missingSiteIDMarkers = ['nan', '<NA>', '']
dfout3 = dfout2.copy()
dfout3 = dfout3[dfout3['in_SiteNativeID'].isin(missingSiteIDMarkers)]
print(len(dfout3))