In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import geoplot as gplt  # for plotting maps and geo-data
import geoplot.crs as gcrs  #used to pull in webdata related to maps and geo-data
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go  # for subplot creation
from plotly.subplots import make_subplots  # for subplot creation
import matplotlib.pyplot as mplt  # use with gplt to save fig to pdf

# ---- API retrieval ----
import requests
import json

# ---- cleanup ----
import re # string regular expression manipulation
from datetime import datetime # date and time manipulation
# Notebook display settings:
#   - show up to 999 columns so wide DataFrames are not truncated
#   - render floats as fixed-point with five decimals (no scientific notation)
pd.set_option('display.max_columns', 999)
pd.set_option('display.float_format', lambda value: '%.5f' % value)

In [2]:
# Setting work directory

# Capture the directory the kernel is currently running in and print it.
# The relative paths used in later cells ("data/...", "JoinAttempts/...",
# "figures/...") resolve against this directory.
# NOTE(review): os.chdir() is given the value os.getcwd() just returned, so it
# does not move the process anywhere - confirm whether a different target was
# intended.
workingDir = os.getcwd()
os.chdir(workingDir)
print(workingDir)

C:\Users\rjame\Documents\WSWC Documents\WaDE Side Projects Local\20230406 Bureau Contractors\Source2_ReclamationDams


## Input Data

In [3]:
# Water rights and Owner Names

# Zipped CSV of WaDE allocation owners; pandas infers the compression from the
# ".zip" file extension.
fileInput = "data/WaDE_WaterRightAllocationOwners.zip"
dfao_in = pd.read_csv(fileInput)
print(dfao_in.shape[0])  # number of rows loaded
dfao_in.head(1)

2577440


Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State
0,AKwr_WR1100000,Knutson James W,WaDE Blank,AK


In [4]:
# Reclamation Contractor Info

# reset_index() turns the row position into an 'index' column; later cells
# select that column and group the dam-side results by it.
fileInput = "data/ReclamationDamsPublic.zip"
dfrdp_in = gpd.read_file(fileInput)
dfrdp_in = dfrdp_in.reset_index()
print(dfrdp_in.shape[0])  # number of rows loaded
dfrdp_in.head()

591


Unnamed: 0,index,assetclass,assetname,structureh,facilityna,assettype,associated,operatinge,usbrprojec,areaoffice,doiregion,usbrregion,state,nid_id,constructi,geometry
0,0,Dams,Agate Dam,High Hazard Potential,Agate,High Hazard Potential,Agate,Rogue River Valley Irrigation District,415,CPN-CCAO,9,CPN,OR,OR00422,1966-01-01,POINT Z (-122.77330 42.41530 0.00000)
1,1,Dams,Agency Valley Dam,High Hazard Potential,Agency Valley,High Hazard Potential,Beulah,Vale Oregon Irrigation District,126,CPN-SRAO,9,CPN,OR,OR00589,1935-01-01,POINT Z (-118.15670 43.91130 0.00000)
2,2,Dams,Alcova Dam,High Hazard Potential,Alcova,High Hazard Potential,Alcova,Bureau of Reclamation,144,MB/ART-WYAO,7,MB/ART,WY,WY01290,1938-02-08,POINT Z (-106.72000 42.54790 0.00000)
3,3,Dams,Alfalfa Ditch Diversion Dam,Low Hazard Potential,Alfalfa Ditch Diversion,Low Hazard Potential,Alfalfa Ditch Diversion Dam Pool,Orchard City Irrigation District,270,UCB-WCAO,7,UCB,CO,,,POINT Z (-107.92362 38.89128 0.00000)
4,4,Dams,Almena Diversion,Low Hazard Potential,Almena Diversion,Low Hazard Potential,Almena Diversion Pool,Almena Irrigation District,492,MB/ART-NKAO,5,MB/ART,KS,,1967-01-01,POINT Z (-99.74774 39.86620 0.00000)


## Attempt #1 - Do an exact match on a matching link field
- make a custom link field based on name + state
- left-join the results, export, and inspect by hand.

Results
- We were able to match 649 water right records by pairing WaDE AllocationOwner names with ReclamationDamsPublic names.
- We were able to use 241 of the 591 Reclamation dam records.

In [5]:
# make copies

# Independent working frames, so the *_in frames keep the data as loaded.
dfao = dfao_in.copy()
# Only the row id, operating entity name and state are needed on the dam side.
dfrdp = dfrdp_in[['index', 'operatinge', 'state']].copy()
print(len(dfao))
print(len(dfrdp))

2577440
591


In [6]:
# Create link field

def cleanOwnerDataFunc(Val):
    """Normalise a link string so two sources can be joined on exact equality.

    Removes the punctuation characters $ @ & . ' ; , ` / ) ( and -, then
    lower-cases the text and trims leading and trailing whitespace. Whitespace
    inside the text is left as it is.

    Parameters
    ----------
    Val : str
        Raw link text (in this notebook: a name concatenated with a state code).

    Returns
    -------
    str
        The cleaned link key.
    """
    # Raw string literal: the previous non-raw pattern contained "\)", which is
    # an invalid string escape and is reported as a SyntaxWarning by newer
    # Python versions. The regular expression itself is unchanged.
    return re.sub(r"[$@&.';,`/\)(-]", "", Val).lower().strip()

# Owner names are split on commas and exploded so that every individual owner
# gets its own row and can be matched independently.
dfao = dfao.assign(AllocationOwner=dfao['AllocationOwner'].str.split(',')).explode('AllocationOwner').reset_index(drop=True)

# Link key = cleaned "<name><state>". Series.map runs the cleaner on the one
# column involved; DataFrame.apply(axis=1) would build a row object for every
# one of the millions of rows just to read that single value.
dfao['LinkFieldA'] = (dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)).map(cleanOwnerDataFunc)

dfrdp['LinkFieldB'] = (dfrdp['operatinge'].astype(str) + dfrdp['state'].astype(str)).map(cleanOwnerDataFunc)

In [7]:
# Two left joins on the cleaned link key:
#   dfoutao  - keeps every water right row; used to count the water rights we were able to pair
#   dfoutrdp - keeps every Reclamation row; used to count the Reclamation records successfully worked

dfoutao = dfao.merge(dfrdp, how='left', left_on='LinkFieldA', right_on='LinkFieldB')
dfoutrdp = dfrdp.merge(dfao, how='left', left_on='LinkFieldB', right_on='LinkFieldA')
print(len(dfoutao))
print(len(dfoutrdp))

2763056
4706


In [8]:
# check success

def checkSuccessFunc(A, B):
    """Return 1 when A and B compare equal, otherwise 0."""
    return 1 if A == B else 0
# Flag each joined row: 1 when the two link keys are equal, 0 otherwise.
# Comparing the two columns directly gives the same 0/1 values as calling
# checkSuccessFunc on every row (rows without a match hold NaN on the joined
# side, and NaN never compares equal) but avoids a Python-level call for each
# of the millions of joined rows.
dfoutao['success'] = (dfoutao['LinkFieldA'] == dfoutao['LinkFieldB']).astype('int64')
dfoutrdp['success'] = (dfoutrdp['LinkFieldB'] == dfoutrdp['LinkFieldA']).astype('int64')

In [9]:
# drop records that are not success
# only dfoutao is reduced to its matched rows here; dfoutrdp is left unfiltered

dfoutao = dfoutao.loc[dfoutao['success'].eq(1)].reset_index(drop=True)
print(len(dfoutao))

4356


In [10]:
# Add source #
# Tag both result tables with the source label and the dataset's web link.
dfoutao['SourceNum'] = "2 - ReclamationDams"
dfoutao['WebLink'] = "https://www.arcgis.com/apps/mapviewer/index.html?layers=1df76b35789f448094cec79c53c42555"
# Fixed: this column was previously created as dfoutrdp['dfoutrdp'] (named
# after the DataFrame variable) instead of 'SourceNum' as on dfoutao.
dfoutrdp['SourceNum'] = "2 - ReclamationDams"
dfoutrdp['WebLink'] = "https://www.arcgis.com/apps/mapviewer/index.html?layers=1df76b35789f448094cec79c53c42555"

In [11]:
# Group by AllocationUUID

def _joinUniqueFunc(values):
    """Join the distinct non-empty values of one group into a comma-separated string.

    Duplicates are dropped in first-seen order (dict.fromkeys), so the joined
    text is the same on every run. Iterating a set() of strings, as before,
    can yield a different order after each kernel restart.
    """
    return ','.join(str(elem) for elem in dict.fromkeys(values) if elem != "")

# One row per water right (dfoutao) and one row per Reclamation record
# (dfoutrdp); every other column collapses to its joined distinct values.
dfoutao = dfoutao.groupby('AllocationUUID').agg(_joinUniqueFunc).replace(np.nan, "").reset_index()
dfoutrdp = dfoutrdp.groupby('index').agg(_joinUniqueFunc).replace(np.nan, "").reset_index()
print(len(dfoutao))
print(len(dfoutrdp))

649
591


In [None]:
# export results

# Create the output folder if it is missing (e.g. on a fresh checkout);
# opening the workbook fails when the parent directory does not exist.
os.makedirs('JoinAttempts', exist_ok=True)

# One workbook with a sheet per result table, for inspection by hand.
with pd.ExcelWriter('JoinAttempts/Attempt_s2_1.xlsx') as writer:
    dfoutao.to_excel(writer, sheet_name='dfoutao')
    dfoutrdp.to_excel(writer, sheet_name='dfoutrdp')

## Figures

In [12]:
# Preview the first rows of dfoutao before plotting
dfoutao.head()

Unnamed: 0,AllocationUUID,AllocationOwner,OwnerClassificationCV,State,LinkFieldA,index,operatinge,state,LinkFieldB,success,SourceNum,WebLink
0,AZwr_WR208776,Bureau Of Reclamation,Bureau Reclamation (USBR),AZ,bureau of reclamationaz,199.0,Bureau of Reclamation,AZ,bureau of reclamationaz,1,2 - ReclamationDams,https://www.arcgis.com/apps/mapviewer/index.ht...
1,AZwr_WR208777,Bureau Of Reclamation,Bureau Reclamation (USBR),AZ,bureau of reclamationaz,199.0,Bureau of Reclamation,AZ,bureau of reclamationaz,1,2 - ReclamationDams,https://www.arcgis.com/apps/mapviewer/index.ht...
2,AZwr_WR218349,Bureau Of Reclamation,Bureau Reclamation (USBR),AZ,bureau of reclamationaz,199.0,Bureau of Reclamation,AZ,bureau of reclamationaz,1,2 - ReclamationDams,https://www.arcgis.com/apps/mapviewer/index.ht...
3,AZwr_WR218350,Bureau Of Reclamation,Bureau Reclamation (USBR),AZ,bureau of reclamationaz,199.0,Bureau of Reclamation,AZ,bureau of reclamationaz,1,2 - ReclamationDams,https://www.arcgis.com/apps/mapviewer/index.ht...
4,AZwr_WR218351,Bureau Of Reclamation,Bureau Reclamation (USBR),AZ,bureau of reclamationaz,199.0,Bureau of Reclamation,AZ,bureau of reclamationaz,1,2 - ReclamationDams,https://www.arcgis.com/apps/mapviewer/index.ht...


In [15]:
# ---- Histogram: Num of matched water right records per state ----
# (the earlier header mentioned WatersourceTypeCV, which this cell does not use)
print(dfoutao.State.value_counts())

fig = px.histogram(dfoutao, x="State")
fig.update_layout(
    bargap=0.2,
    title="Histogram of Reclamation Contractor Water Right Records per State",
    xaxis_title="State Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Create the output folder if it is missing before exporting the static image.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/s2_NumPerState.png', engine="kaleido")

UT    173
NE    135
AZ    102
CA     71
ID     43
TX     42
WY     29
NM     21
WA     13
NV     10
MT      5
OR      5
Name: State, dtype: int64
