In [1]:
# Needed Libraries / Modules
# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import geoplot as gplt  # for plotting maps and geo-data
import geoplot.crs as gcrs  #used to pull in webdata related to maps and geo-data
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go  # for subplot creation
from plotly.subplots import make_subplots  # for subplot creation
import matplotlib.pyplot as mplt  # use with gplt to save fig to pdf

# ---- API retrieval ----
import requests
import json

# ---- cleanup ----
import re # string regular expression manipulation
from datetime import datetime # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Setting work directory
workingDir = os.getcwd()
os.chdir(workingDir)
print(workingDir)

C:\Users\rjame\Documents\WSWC Documents\WaDE-Side-Projects\20230406 Bureau Contractors\Source2_ReclamationDams


## Input Data

In [3]:
# Water rights and Owner Names
fileInput = "rawdata/WaDE_WaterRightAllocationOwners.zip" # zip file
dfao_in = pd.read_csv(fileInput)
print(len(dfao_in))
dfao_in.head(1)

2577440


Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK


In [4]:
# Reclamation Contractor Infor
fileInput = "rawdata/ReclamationDamsPublic.zip"
dfrdp_in = gpd.read_file(fileInput).reset_index()
print(len(dfrdp_in))
dfrdp_in.head(1)

591


Unnamed: 0,index,assetclass,assetname,structureh,facilityna,assettype,associated,operatinge,usbrprojec,areaoffice,doiregion,usbrregion,state,nid_id,constructi,geometry
0,0,Dams,Agate Dam,High Hazard Potential,Agate,High Hazard Potential,Agate,Rogue River Valley Irrigation District,415,CPN-CCAO,9,CPN,OR,OR00422,1966-01-01,POINT Z (-122.77330 42.41530 0.00000)


In [5]:
# Unique values for 'ContractorName'
for x in dfrdp_in['operatinge'].sort_values().unique():
    print(f'"' + str(x) + '",')

"Ainsworth Irrigation District",
"Almena Irrigation District",
"Angostura Irrigation District ",
"Animas-La Plata Operations Maintenance and Replacement Association",
"Arbuckle Master Conservancy District",
"Baker Valley Irrigation District ",
"Belle Fourche Irrigation District",
"Bitter Root Irrigation District ",
"Boise Project Board Of Control",
"Bostwick Park Conservancy District",
"Bridger Valley Water Conservancy District",
"Bureau of Reclamation",
"Burnt River Irrigation District ",
"Cachuma O&M Board",
"California Department Water Resources",
"Canadian River Municipal Water Authority",
"Carbon Water Conservancy District",
"Carlsbad Irrigation District ",
"Casitas Municipal Water District",
"Central Arizona Water Conservation District",
"Central Oklahoma Master Conservancy District",
"Central Oregon Irrigation District",
"Central Utah Water Conservancy District",
"City of Corpus Christi, Texas",
"City of Rapid City, South Dakota",
"City of San Angelo, Texas",
"City of Wichita, K

In [6]:
# Drop 'Bureau of Reclamation' like inputs (we are already tracking those separately)
dfrdp_in = dfrdp_in[dfrdp_in['operatinge'] != "Bureau of Reclamation"].reset_index(drop=True)
print(len(dfrdp_in))                                                         

403


## Task #1 - Unique contractor names from Source

In [None]:
# make copy
dfrdp = dfrdp_in.copy()

In [None]:
# get count (converts to series)
unique_count_series = dfrdp.groupby('state')['operatinge'].nunique()

# create dataframe
df1 = unique_count_series.to_frame()
df1['State'] = df1.index
df1 = df1.reset_index(drop=True).rename(columns={"operatinge": "CountConName"})
print(len(df1))
df1.head()

In [None]:
# export results
df1.to_csv('results/result1.csv', index=False)

In [None]:
# ---- Figure #1 Barplot: Num of Unique Reclamation Contractor Names per State from source data ----
fig = px.bar(df1, x='State', y='CountConName')
fig.update_layout(bargap=0.2,
                  title="Barplot: Number of Unique Reclamation Contractor Names from Source Data",
                  xaxis_title="State",
                  yaxis_title="# of entries",
                    font=dict(
                        family="Arial Bold",
                        size=12,
                        color="Black")
                 )
fig.show()
fig.write_image('figures/result1.png', engine="kaleido")

## Task #2- Unique contractor names from Source that fit into WaDE
- use exact match on a matcing link field
- make a custom link based on name + state
- Left Join results, export, and inpsect by hand

In [None]:
# make copies
dfao = dfao_in.copy()
dfrdp = dfrdp_in.copy()
dfrdp = dfrdp[['index', 'operatinge', 'state']]
print(len(dfao))
print(len(dfrdp))

In [None]:
# Create link field
def cleanOwnerDataFunc(Val):
    Val = re.sub("[$@&.';,`/\)(-]", "", Val).lower().strip()
    return Val

dfao = dfao.assign(AllocationOwner=dfao['AllocationOwner'].str.split(',')).explode('AllocationOwner').reset_index(drop=True)
dfao['LinkFieldA'] = dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)
dfao['LinkFieldA'] = dfao.apply(lambda row: cleanOwnerDataFunc(row['LinkFieldA']), axis=1)

dfrdp['LinkFieldB'] = dfrdp['operatinge'].astype(str) + dfrdp['state'].astype(str)
dfrdp['LinkFieldB'] = dfrdp.apply(lambda row: cleanOwnerDataFunc(row['LinkFieldB']), axis=1)

In [None]:
df2 = pd.merge(dfrdp, dfao, left_on='LinkFieldB', right_on='LinkFieldA', how='left').reset_index(drop=True)
df2.head(1)

In [None]:
# check success
def checkSuccessFunc(A, B):
    if A == B:
        resultVal = 1
    else:
        resultVal = 0
    return resultVal
df2['success'] = df2.apply(lambda row: checkSuccessFunc(row['LinkFieldA'], row['LinkFieldB']), axis=1)
df2['success'].unique()

In [None]:
# get count (use agg and sum of state and contractor name)
df2 = df2[['state', 'operatinge', 'success']].drop_duplicates().reset_index(drop=True)
df2 = df2.groupby(['state']).agg({'success': 'sum'})
df2['State'] = df2.index
df2 = df2.reset_index(drop=True)
print(len(df2))
df2.head(1)

In [None]:
# export results
df2.to_csv('results/result2.csv', index=False)

In [None]:
# ---- Figure #2 Barplot: Number of Usable Unique Reclamation Contractor Names per from into WaDE  ----
fig = px.bar(df2, x='State', y='success')
fig.update_layout(bargap=0.2,
                  title="Barplot: Number of Usable Unique Reclamation Contractor Names that fit into WaDE",
                  xaxis_title="State",
                  yaxis_title="# of entries",
                    font=dict(
                        family="Arial Bold",
                        size=12,
                        color="Black")
                 )
fig.show()
fig.write_image('figures/result2.png', engine="kaleido")

## Task #3 - Number of WaDE Water Rights connec to contractor names
- use exact match on a matcing link field
- make a custom link based on name + state
- Left Join results, export, and inpsect by hand

In [7]:
# make copies
dfao = dfao_in.copy()
dfrdp = dfrdp_in.copy()
dfrdp = dfrdp[['index', 'operatinge', 'state']]
print(len(dfao))
print(len(dfrdp))

2577440
403


In [8]:
# Create link field
def cleanOwnerDataFunc(Val):
    Val = re.sub("[$@&.';,`/\)(-]", "", Val).lower().strip()
    return Val

dfao = dfao.assign(AllocationOwner=dfao['AllocationOwner'].str.split(',')).explode('AllocationOwner').reset_index(drop=True)
dfao['LinkFieldA'] = dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)
dfao['LinkFieldA'] = dfao.apply(lambda row: cleanOwnerDataFunc(row['LinkFieldA']), axis=1)

dfrdp['LinkFieldB'] = dfrdp['operatinge'].astype(str) + dfrdp['state'].astype(str)
dfrdp['LinkFieldB'] = dfrdp.apply(lambda row: cleanOwnerDataFunc(row['LinkFieldB']), axis=1)

In [9]:
# Left-join dfao to dfrdp to get # of records from reclmation sucessfully worked
df3 = pd.merge(dfao, dfrdp, left_on='LinkFieldA', right_on='LinkFieldB', how='left').reset_index(drop=True)
print(len(df3))
df3.head()

2762155


Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State,LinkFieldA,index,operatinge,state,LinkFieldB
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK,knutson james wak,,,,
1,AKwr_WR1100001,Bergstrom Daniel J,WaDE Blank,AK,bergstrom daniel jak,,,,
2,AKwr_WR1100002,Britton Charles Wayne,WaDE Blank,AK,britton charles wayneak,,,,
3,AKwr_WR1100003,Gustin Lee,WaDE Blank,AK,gustin leeak,,,,
4,AKwr_WR1100004,Thickstun Andrew Jonathan,WaDE Blank,AK,thickstun andrew jonathanak,,,,


In [10]:
# check success
def checkSuccessFunc(A, B):
    if A == B:
        resultVal = 1
    else:
        resultVal = 0
    return resultVal
df3['success'] = df3.apply(lambda row: checkSuccessFunc(row['LinkFieldA'], row['LinkFieldB']), axis=1)
df3['success'].unique()

array([0, 1], dtype=int64)

In [11]:
df3 = df3.drop_duplicates(subset=['AllocationUUID'])
print(len(df3))

2577440


In [12]:
# get count (use agg and sum of state and contractor name)
df3 = df3[['State', 'operatinge', 'success']].reset_index(drop=True)
df3 = df3.groupby(['State']).agg({'success': 'sum'})
df3['State'] = df3.index
df3 = df3.reset_index(drop=True)
print(len(df3))
df3.head()

18


Unnamed: 0,success,State
0,0,AK
1,46,AZ
2,71,CA
3,0,CO
4,34,ID


In [14]:
# export results
df3.to_csv('results/result3.csv', index=False)

In [15]:
# ---- Figure #3 Barplot: Number of Usable Unique Reclamation Contractor Names per from into WaDE  ----

fig = px.bar(df3, x='State', y='success')
fig.update_layout(bargap=0.2,
                  title="Barplot: Num of Potential Reclamation Contractor Water Right Records per State in WaDE",
                  xaxis_title="State",
                  yaxis_title="# of entries",
                    font=dict(
                        family="Arial Bold",
                        size=12,
                        color="Black")
                 )
fig.show()
fig.write_image('figures/result3.png', engine="kaleido")