In [1]:
# Needed Libraries / Modules
# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import geoplot as gplt  # for plotting maps and geo-data
import geoplot.crs as gcrs  #used to pull in webdata related to maps and geo-data
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go  # for subplot creation
from plotly.subplots import make_subplots  # for subplot creation
import matplotlib.pyplot as mplt  # use with gplt to save fig to pdf

# ---- API retrieval ----
import requests
import json

# ---- cleanup ----
import re # string regular expression manipulation
from datetime import datetime # date and time manipulation
# Notebook-wide display options: show every column of a frame and print floats
# with five fixed decimals instead of scientific notation.
pd.options.display.max_columns = 999
pd.options.display.float_format = lambda value: f'{value:.5f}'

In [2]:
# Show the current working directory (nothing is changed here); the relative
# rawdata/, results/ and figures/ paths used below resolve against it.
print(os.getcwd())

C:\Users\rjame\Documents\WSWC Documents\WaDE-Side-Projects\20230406 Bureau Contractors\Source1_WebScraping


## Input Data

In [3]:
# WaDE water rights and owner names.
# Kept as dfao_in; the task sections below work on copies of it.

fileInput = "rawdata/WaDE_WaterRightAllocationOwners.zip" # zipped CSV; read_csv infers the compression from the extension
dfao_in = pd.read_csv(fileInput)
print(len(dfao_in))
dfao_in.head(1)

2577440


Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK


In [4]:
# USBR contractor names together with the state each one is listed under.
# Kept as dfcs_in; the task sections below work on copies of it.

fileInput = "rawdata/USBR Contractor_State List.csv"
dfcs_in = pd.read_csv(fileInput, encoding = "ISO-8859-1")  # explicit single-byte encoding; presumably the file is not valid UTF-8 (TODO confirm)
print(len(dfcs_in))
dfcs_in.head(1)

238


Unnamed: 0,ContractorName,State
0,Area Office Manager,AZ


In [5]:
# List every distinct contractor name, sorted, as a quoted entry with a trailing
# comma so the output can be pasted straight into a Python list.
for name in dfcs_in['ContractorName'].sort_values().unique():
    print(f'"{name}",')

"Ainsworth Irrigation District",
"Albuquerque Area Office",
"Almena Irrigation District No 5",
"Angostura Irrigation District",
"Arbuckle Master Conservancy District",
"Arch Hurley Conservancy District",
"Area Office Manager",
"BIA Irrigation Department",
"Baker Valley Irrigation District",
"Beerline Irrigation Canal",
"Belle Fourche Irrigation District",
"Bend Field Office",
"Big Horn Basin Field Office",
"Bitter Root Project Irrigation District",
"Boise Project Board of Control",
"Bostwick Irrigation District in Nebraska",
"Bostwick Park Water Conservancy District",
"Boulder Canyon Operations Office",
"Bridger Valley Water Conservancy District",
"Browns Creek Irrigation District",
"Buffalo Rapids Irrigation District No. I",
"Buffalo Rapids Irrigation District No. II",
"Buford-Trenton Irrigation District",
"Bureau of Reclamation",
"Bureau of Reclamation, Four Corners Construction Office",
"Burnt River Irrigation District",
"Cachuma Operations and Maintenance Board",
"California Depart

In [6]:
# Remove the rows that name the Bureau of Reclamation itself
# (those are already being tracked separately).

dropList = ["Bureau of Reclamation", "Bureau of Reclamation, Four Corners Construction Office"]
dfcs_in = dfcs_in.loc[lambda frame: ~frame['ContractorName'].isin(dropList)].reset_index(drop=True)
print(len(dfcs_in))
dfcs_in.head(1)

236


Unnamed: 0,ContractorName,State
0,Area Office Manager,AZ


## Task #1 - Unique contractor names from Source

In [None]:
# Work on a copy so the filtered input frame dfcs_in stays available unchanged
dfcs = dfcs_in.copy()

In [None]:
# Number of distinct contractor names per state (a Series indexed by State)
unique_count_series = dfcs.groupby('State')['ContractorName'].nunique()

# Turn the Series into a two-column frame: the count first, then the state
df1 = unique_count_series.rename('CountConName').reset_index()[['CountConName', 'State']]
print(len(df1))
df1.head()

In [None]:
# Export the Task #1 table. to_csv does not create a missing output folder,
# so make sure results/ exists first (a no-op when it is already there).
os.makedirs('results', exist_ok=True)
df1.to_csv('results/result1.csv', index=False)

In [None]:
# ---- Figure #1 Barplot: Num of Unique Reclamation Contractor Names per State from source data ----

fig = px.bar(df1, x='State', y='CountConName')
fig.update_layout(
    bargap=0.2,
    title="Barplot: Number of Unique Reclamation Contractor Names from Source Data",
    xaxis_title="State",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Make sure the output folder exists before the static image is written.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/result1.png', engine="kaleido")

## Task #2 - Unique contractor names from Source that fit into WaDE
- use exact match on a matching link field
- make a custom link based on name + state
- Left join results, export, and inspect by hand

In [None]:
# make copies so the Task #2 link fields are added to fresh frames, not to the loaded inputs

dfao = dfao_in.copy()
dfcs = dfcs_in.copy()

In [None]:
# Clean Owner info. Remove special characters
import re

def cleanOwnerDataFunc(Val):
    """Normalise a name string so it can be used as an exact-match link key.

    Removes the punctuation characters $ @ & . ' ; , ` / ) ( and -, lower-cases
    the result and strips leading/trailing whitespace. Whitespace inside the
    string is kept as is.

    Val must be a str; the cleaned str is returned.
    """
    # Raw string: the pattern contains "\)", which is an invalid escape sequence
    # in a normal string literal (SyntaxWarning on Python 3.12+).
    Val = re.sub(r"[$@&.';,`/\)(-]", "", Val).lower().strip()
    return Val

In [None]:
# One row per individual owner: comma-separated owner lists are split and exploded.
dfao = dfao.assign(AllocationOwner=dfao['AllocationOwner'].str.split(',')).explode('AllocationOwner').reset_index(drop=True)

# Link key = cleaned "<owner name><state>". Series.map runs the cleaner once per
# value, which gives the same result as a row-wise DataFrame.apply but far faster.
# NOTE(review): strip() runs after the state is appended, so whitespace between
# the name and the state code survives in the key.
dfao['LinkFieldA'] = (dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)).map(cleanOwnerDataFunc)
dfao.head(1)

In [None]:
# Link key = cleaned "<contractor name><state>", built the same way as LinkFieldA.
# Series.map replaces the row-wise DataFrame.apply; the resulting values are identical.
dfcs['LinkFieldB'] = (dfcs['ContractorName'].astype(str) + dfcs['State'].astype(str)).map(cleanOwnerDataFunc)
dfcs.head(1)

In [None]:
# Left join from the contractor list: every contractor row is kept, and the WaDE
# columns stay empty where no owner shares the link key.
df2 = dfcs.merge(dfao, how='left', left_on='LinkFieldB', right_on='LinkFieldA').reset_index(drop=True)
df2.head(1)

In [None]:
# check success

def checkSuccessFunc(A, B):
    """Return 1 when the two link values compare equal, otherwise 0."""
    return 1 if A == B else 0
# Flag each row 1 when both link fields agree and 0 otherwise. Rows the left join
# could not match carry NaN in one of the fields and therefore compare unequal.
# Vectorised comparison instead of one Python call per row; same 0/1 int64 result.
df2['success'] = (df2['LinkFieldA'] == df2['LinkFieldB']).astype('int64')
df2['success'].unique()

In [None]:
# Count matched contractor names per state: collapse to one row per
# (state, contractor, success) combination, then sum the success flags by state.
df2 = (
    df2[['State_x', 'ContractorName', 'success']]
    .drop_duplicates()
    .groupby('State_x')['success']
    .sum()
    .rename_axis('State')
    .reset_index()[['success', 'State']]
)
print(len(df2))
df2.head(1)

In [None]:
# Export the Task #2 table. to_csv does not create a missing output folder,
# so make sure results/ exists first (a no-op when it is already there).
os.makedirs('results', exist_ok=True)
df2.to_csv('results/result2.csv', index=False)

In [None]:
# ---- Figure #2 Barplot: Number of Usable Unique Reclamation Contractor Names per State that fit into WaDE ----

fig = px.bar(df2, x='State', y='success')
fig.update_layout(
    bargap=0.2,
    title="Barplot: Number of Usable Unique Reclamation Contractor Names that fit into WaDE",
    xaxis_title="State",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Make sure the output folder exists before the static image is written.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/result2.png', engine="kaleido")

## Task #3 - Number of WaDE Water Rights connected to contractor names
- use exact match on a matching link field
- make a custom link based on name + state
- Left join results, export, and inspect by hand

In [7]:
# make copies so the Task #3 link fields are added to fresh frames, not to the loaded inputs
dfao = dfao_in.copy()
dfcs = dfcs_in.copy()

In [8]:
# Clean Owner info. Remove special characters
import re

def cleanOwnerDataFunc(Val):
    """Normalise a name string so it can be used as an exact-match link key.

    Removes the punctuation characters $ @ & . ' ; , ` / ) ( and -, lower-cases
    the result and strips leading/trailing whitespace. Whitespace inside the
    string is kept as is.

    Val must be a str; the cleaned str is returned.
    """
    # Raw string: the pattern contains "\)", which is an invalid escape sequence
    # in a normal string literal (SyntaxWarning on Python 3.12+).
    Val = re.sub(r"[$@&.';,`/\)(-]", "", Val).lower().strip()
    return Val

In [9]:
# One row per individual owner: comma-separated owner lists are split and exploded.
dfao = dfao.assign(AllocationOwner=dfao['AllocationOwner'].str.split(',')).explode('AllocationOwner').reset_index(drop=True)

# Link key = cleaned "<owner name><state>". Series.map runs the cleaner once per
# value, which gives the same result as a row-wise DataFrame.apply but far faster.
# NOTE(review): strip() runs after the state is appended, so whitespace between
# the name and the state code survives in the key.
dfao['LinkFieldA'] = (dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)).map(cleanOwnerDataFunc)
dfao.head(1)

Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State,LinkFieldA
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK,knutson james wak


In [10]:
# Link key = cleaned "<contractor name><state>", built the same way as LinkFieldA.
# Series.map replaces the row-wise DataFrame.apply; the resulting values are identical.
dfcs['LinkFieldB'] = (dfcs['ContractorName'].astype(str) + dfcs['State'].astype(str)).map(cleanOwnerDataFunc)
dfcs.head(1)

Unnamed: 0,ContractorName,State,LinkFieldB
0,Area Office Manager,AZ,area office manageraz


In [11]:
# Left join from the WaDE owner rows: every owner row is kept, and the contractor
# columns stay empty where no contractor shares the link key. Both inputs have a
# 'State' column, so the result carries State_x (WaDE) and State_y (contractor).
df3 = dfao.merge(dfcs, how='left', left_on='LinkFieldA', right_on='LinkFieldB').reset_index(drop=True)
print(len(df3))
df3.head(1)

Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State_x,LinkFieldA,ContractorName,State_y,LinkFieldB
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK,knutson james wak,,,


In [12]:
# check success
def checkSuccessFunc(A, B):
    """Return 1 when the two link values compare equal, otherwise 0."""
    return 1 if A == B else 0
# Flag each row 1 when both link fields agree and 0 otherwise. Rows the left join
# could not match carry NaN in LinkFieldB and therefore compare unequal.
# Vectorised comparison instead of one Python call per row; same 0/1 int64 result.
df3['success'] = (df3['LinkFieldA'] == df3['LinkFieldB']).astype('int64')
df3['success'].unique()

array([0, 1], dtype=int64)

In [14]:
# Reduce to one row per water right. The owner explode above can leave several
# rows per AllocationUUID, and drop_duplicates keeps the first row it meets, so
# a right whose matching owner is not listed first would be counted as unmatched.
# Stable-sort matched rows (success == 1) to the front, de-duplicate, then
# restore the original row order.
df3 = (
    df3.sort_values('success', ascending=False, kind='stable')
    .drop_duplicates(subset=['AllocationUUID'])
    .sort_index()
)
print(len(df3))

2577440


In [16]:
# Count matched water right rows per WaDE state: sum the success flags by
# State_x, then present the result as a (success, State) table.
df3 = (
    df3.groupby('State_x')['success']
    .sum()
    .rename_axis('State')
    .reset_index()[['success', 'State']]
)
print(len(df3))
df3.head()

18


Unnamed: 0,success,State
0,0,AK
1,109,AZ
2,167,CA
3,0,CO
4,38,ID


In [18]:
# Export the Task #3 table. to_csv does not create a missing output folder,
# so make sure results/ exists first (a no-op when it is already there).
os.makedirs('results', exist_ok=True)
df3.to_csv('results/result3.csv', index=False)

In [19]:
# ---- Figure #3 Barplot: Num of Potential Reclamation Contractor Water Right Records per State in WaDE ----

fig = px.bar(df3, x='State', y='success')
fig.update_layout(
    bargap=0.2,
    title="Barplot: Num of Potential Reclamation Contractor Water Right Records per State in WaDE",
    xaxis_title="State",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Make sure the output folder exists before the static image is written.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/result3.png', engine="kaleido")