In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import geoplot as gplt  # for plotting maps and geo-data
import geoplot.crs as gcrs  #used to pull in webdata related to maps and geo-data
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go  # for subplot creation
from plotly.subplots import make_subplots  # for subplot creation
import matplotlib.pyplot as mplt  # use with gplt to save fig to pdf

# ---- API retrieval ----
import requests
import json

# ---- cleanup ----
import re # string regular expression manipulation
from datetime import datetime # date and time manipulation
def _fixedPointFloat(value):
    """Render a float with five decimal places instead of scientific notation."""
    return '%.5f' % value

pd.options.display.max_columns = 999  # show all columns of a DataFrame in notebook output
pd.options.display.float_format = _fixedPointFloat  # suppress scientific notation in Pandas

In [2]:
# Report the current working directory (this cell only prints it, nothing is changed)

currentDir = os.getcwd()
print(currentDir)

C:\Users\rjame\Documents\WSWC Documents\WaDE Side Projects Local\20230406 Bureau Contractors\Source1_WebScraping


## Input Data

In [3]:
# Load the water right records together with their owner names.
# The input is a zipped CSV that pandas reads directly.

fileInput = "data/WaDE_WaterRightAllocationOwners.zip"
dfao_in = pd.read_csv(filepath_or_buffer=fileInput)
rowCount = dfao_in.shape[0]
print(rowCount)
dfao_in.head(n=1)

2577440


Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK


In [4]:
# Load the contractor name / state list (file is ISO-8859-1 encoded)
fileInput = "data/USBR Contractor_State List.csv"
dfcs_in = pd.read_csv(filepath_or_buffer=fileInput, encoding="ISO-8859-1")
rowCount = dfcs_in.shape[0]
print(rowCount)
dfcs_in.head(n=1)

238


Unnamed: 0,ContractorName,State
0,Area Office Manager,AZ


In [5]:
# Load the contractor list that also carries the source url and location (file is ISO-8859-1 encoded)
fileInput = "data/USBR Contractor_State List_url Location.csv"
dfcs_in2 = pd.read_csv(filepath_or_buffer=fileInput, encoding="ISO-8859-1")
rowCount = dfcs_in2.shape[0]
print(rowCount)
dfcs_in2.head(n=1)

1067


Unnamed: 0,Index,url,ContractorName,Location,State
0,1,https://www.usbr.gov/projects/index.php?id=3,Middle Rio Grande Conservancy District,"Albuquerque, NM 87103",NM


## Attempt #1 - Do exact match on a matching link field
- make a custom link based on name + state
- Left Join results, export, and inspect by hand.

Results
- was able to match 846 water right records with WaDE AllocationOwners names to the ReclamationDamsPublic names.

In [6]:
# Work on copies so the frames loaded above stay untouched
dfao, dfcs = dfao_in.copy(), dfcs_in.copy()

In [7]:
# Clean Owner info. Remove special characters
import re

# Punctuation stripped from names before matching. Written as a raw string so
# that "\)" reaches the regex engine as written instead of being an invalid
# Python string escape (which triggers a SyntaxWarning on recent Python versions).
_OWNER_PUNCTUATION = re.compile(r"[$@&.';,`/\)(-]")

def cleanOwnerDataFunc(Val):
    """Normalise a text value so it can be used as an exact-match link key.

    Removes the characters $ @ & . ' ; , ` / ) ( - and then lower-cases the
    result and strips leading/trailing whitespace.

    Parameters
    ----------
    Val : str
        Text to normalise.

    Returns
    -------
    str
        The cleaned, lower-cased text.

    Raises
    ------
    TypeError
        If ``Val`` is not a string (raised by the regex substitution).
    """
    return _OWNER_PUNCTUATION.sub("", Val).lower().strip()

In [8]:
# One row per owner: comma-separated owner lists are split and exploded
dfao = (
    dfao
    .assign(AllocationOwner=dfao['AllocationOwner'].str.split(','))
    .explode('AllocationOwner')
    .reset_index(drop=True)
)

# Link key = owner name + state, normalised with cleanOwnerDataFunc.
# Series.map calls the cleaner once per value; a row-wise DataFrame.apply(axis=1)
# would build a Series object for every one of the more than 2.5 million rows.
rawLinkA = dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)
dfao['LinkFieldA'] = rawLinkA.map(cleanOwnerDataFunc)
dfao.head(1)

Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State,LinkFieldA
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK,knutson james wak


In [9]:
# Link key = contractor name + state, normalised with cleanOwnerDataFunc.
# Series.map applies the cleaner per value, so no row-wise DataFrame.apply(axis=1) is needed.
rawLinkB = dfcs['ContractorName'].astype(str) + dfcs['State'].astype(str)
dfcs['LinkFieldB'] = rawLinkB.map(cleanOwnerDataFunc)
dfcs.head(1)

Unnamed: 0,ContractorName,State,LinkFieldB
0,Area Office Manager,AZ,area office manageraz


In [10]:
# Left join on the link keys: every water right row is kept, and the contractor
# columns are empty (NaN) where no key matches
dfout = dfao.merge(dfcs, how='left', left_on='LinkFieldA', right_on='LinkFieldB')
dfout.head(n=1)

Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State_x,LinkFieldA,ContractorName,State_y,LinkFieldB
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK,knutson james wak,,,


In [11]:
# check success

def checkSuccessFunc(A, B):
    """Return 1 when the two values compare equal, otherwise 0."""
    return 1 if A == B else 0
# Flag each merged row: 1 when the two link keys agree, 0 otherwise.
# Iterating the two key columns directly avoids DataFrame.apply(axis=1), which
# builds a Series object for every row of the merged frame.
dfout['success'] = [
    checkSuccessFunc(keyA, keyB)
    for keyA, keyB in zip(dfout['LinkFieldA'], dfout['LinkFieldB'])
]
dfout['success'].unique()

array([0, 1], dtype=int64)

In [12]:
# Keep only the rows whose link keys matched (success == 1)

matchedMask = dfout['success'] == 1
dfout = dfout.loc[matchedMask].reset_index(drop=True)
print(dfout.shape[0])
dfout.head(n=1)

846


Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State_x,LinkFieldA,ContractorName,State_y,LinkFieldB,success
0,AZwr_WR200316,Central Arizona Project,Private,AZ,central arizona projectaz,Central Arizona Project,AZ,central arizona projectaz,1


In [13]:
# Provenance columns: the same source label and web link on every matched record
dfout = dfout.assign(
    SourceNum="1 - Web Scraping",
    WebLink="https://www.usbr.gov/projects/",
)

In [14]:
# Group by AllocationUUID: collapse to one row per AllocationUUID

def _joinUniqueValues(values):
    """Comma-join the distinct, non-empty values of one column within a group.

    dict.fromkeys de-duplicates like set() but keeps first-seen order, so the
    joined text is identical on every run (the iteration order of a set of
    strings can change between Python sessions because of hash randomisation).
    """
    return ','.join(str(elem) for elem in dict.fromkeys(values) if elem != "")

dfout = (
    dfout
    .groupby('AllocationUUID')
    .agg(_joinUniqueValues)
    .replace(np.nan, "")
    .reset_index()
)
print(len(dfout))

846


In [15]:
# Make sure the output folder exists so the export also works on a fresh checkout.
# NOTE(review): a PermissionError here typically means the CSV is open in another
# program (e.g. a spreadsheet) - close it and re-run; confirm for this environment.
os.makedirs('JoinAttempts', exist_ok=True)
dfout.to_csv('JoinAttempts/Attempt_s1_1.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'JoinAttempts/Attempt_s1_1.csv'

## Figures

In [18]:
# Preview the first five records of the result
dfout.head(n=5)

Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State_x,LinkFieldA,ContractorName,State_y,LinkFieldB,success,SourceNum,WebLink
0,AZwr_WR200316,Central Arizona Project,Private,AZ,central arizona projectaz,Central Arizona Project,AZ,central arizona projectaz,1,1 - Web Scraping,https://www.usbr.gov/projects/
1,AZwr_WR202944,Central Arizona Project,Private,AZ,central arizona projectaz,Central Arizona Project,AZ,central arizona projectaz,1,1 - Web Scraping,https://www.usbr.gov/projects/
2,AZwr_WR202945,Central Arizona Project,Private,AZ,central arizona projectaz,Central Arizona Project,AZ,central arizona projectaz,1,1 - Web Scraping,https://www.usbr.gov/projects/
3,AZwr_WR206893,Central Arizona Project,Private,AZ,central arizona projectaz,Central Arizona Project,AZ,central arizona projectaz,1,1 - Web Scraping,https://www.usbr.gov/projects/
4,AZwr_WR206894,Central Arizona Project,Private,AZ,central arizona projectaz,Central Arizona Project,AZ,central arizona projectaz,1,1 - Web Scraping,https://www.usbr.gov/projects/


In [None]:
# ---- Histogram: Num of matched water right records per state (State_x) ----
print(dfout['State_x'].value_counts())

layoutFont = dict(family="Arial Bold", size=12, color="Black")

fig = px.histogram(dfout, x="State_x")
fig.update_layout(
    bargap=0.2,
    title="Histogram of Reclamation Contractor Water Right Records per State",
    xaxis_title="State Value",
    yaxis_title="# of entries",
    font=layoutFont,
)
fig.show()
fig.write_image('figures/s1_NumPerState.png', engine="kaleido")

In [19]:
# Sorted list of the distinct contractor names present in the result
sortedNames = dfout['ContractorName'].sort_values()
nameList = sortedNames.unique().tolist()
print(nameList)

['Ainsworth Irrigation District', 'Boise Project Board of Control', 'Bridger Valley Water Conservancy District', 'Browns Creek Irrigation District', 'Buford-Trenton Irrigation District', 'California Department of Water Resources', 'Canadian River Municipal Water Authority', 'Carbon Water Conservancy District', 'Carlsbad Irrigation District', 'Casitas Municipal Water District', 'Central Arizona Project', 'Central Irrigation District', 'Central Oregon Irrigation District', 'Central Utah Water Conservancy District', 'Chimney Rock Irrigation District', 'City of Corpus Christi', 'City of Rapid City', 'Coachella Valley Water District', 'Cottonwood Creek Consolidated Irrigation Company', 'Deaver Irrigation District', 'Eden Valley Irrigation and Drainage District', 'El Dorado Irrigation District', 'Emery Water Conservancy District', 'Farmers Irrigation District', 'Frenchman Valley Irrigation District', 'Gering Irrigation District', 'Goshen Irrigation District', 'Greenfields Irrigation District