In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Setting work directory

print(os.getcwd())

C:\Users\rjame\Documents\WSWC Documents\WaDE-Side-Projects\20230406 Bureau Contractors


## Input Data

In [3]:
fileInput = "data/WaDE_AllocationOwners.zip" # zip file
dfao_in = pd.read_csv(fileInput)
print(len(dfao_in))
dfao_in.head(1)

718201


Unnamed: 0,AllocationOwner,State
0,`Kellerman Fred P,AZ


In [4]:
fileInput = "data/USBR Contractor_State List.csv"
dfcs_in = pd.read_csv(fileInput, encoding = "ISO-8859-1")
print(len(dfcs_in))
dfcs_in.head(1)

238


Unnamed: 0,ContractorName,State
0,Ainsworth Irrigation District,NE


## Attempet #1 - Do exact match on a matcing link field
- make a custom link based on name + state
- Left Join results, export, and inpsect by hand.

Results
- was able to only match 63 records

In [5]:
dfao = dfao_in.copy()
dfcs = dfcs_in.copy()

In [6]:
# Clean Owner info. Remove special characters
import re

def cleanOwnerDataFunc(Val):
    Val = re.sub("[$@&.';,`/\)(-]", "", Val).lower().strip()
    return Val

In [7]:
dfao = dfao.assign(AllocationOwner=dfao['AllocationOwner'].str.split(',')).explode('AllocationOwner')
dfao = dfao.drop_duplicates().reset_index(drop=True)
dfao['LinkField'] = dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)
dfao['LinkField'] = dfao.apply(lambda row: cleanOwnerDataFunc(row['LinkField']), axis=1)
dfao.head(1)

Unnamed: 0,AllocationOwner,State,LinkField
0,`Kellerman Fred P,AZ,kellerman fred paz


In [8]:
dfcs['LinkField'] = dfcs['ContractorName'].astype(str) + dfcs['State'].astype(str)
dfcs['LinkField'] = dfcs.apply(lambda row: cleanOwnerDataFunc(row['LinkField']), axis=1)
dfcs.head(1)

Unnamed: 0,ContractorName,State,LinkField
0,Ainsworth Irrigation District,NE,ainsworth irrigation districtne


In [9]:
dfao = pd.merge(dfao, dfcs, left_on='LinkField', right_on='LinkField', how='left')
dfao.head(1)

Unnamed: 0,AllocationOwner,State_x,LinkField,ContractorName,State_y
0,`Kellerman Fred P,AZ,kellerman fred paz,,


In [10]:
dfao.to_csv('JoinAttempts/Attempt_1.csv', index=False)

## Attempet #2 - Do a partial match on a matcing link field, use str contains
- make a custom link based on name + state
- Left Join results, export, and inpsect by hand.

Results
- was able to only match 63 records
- issues with if the AllocationOwner was blank it was labled still with a contractor, which is wrong.

In [11]:
dfao = dfao_in.copy()
dfcs = dfcs_in.copy()

In [12]:
# Clean Owner info. Remove special characters
import re

def cleanOwnerDataFunc(Val):
    Val = re.sub("[$@&.';,`/\)(-]", "", Val).lower().strip()
    return Val

In [13]:
dfao = dfao.assign(AllocationOwner=dfao['AllocationOwner'].str.split(',')).explode('AllocationOwner')
dfao = dfao.drop_duplicates().reset_index(drop=True)
dfao['LinkField'] = dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)
dfao['LinkField'] = dfao.apply(lambda row: cleanOwnerDataFunc(row['LinkField']), axis=1)
dfao.head(1)

Unnamed: 0,AllocationOwner,State,LinkField
0,`Kellerman Fred P,AZ,kellerman fred paz


In [14]:
dfcs['LinkField'] = dfcs['ContractorName'].astype(str) + dfcs['State'].astype(str)
dfcs['LinkField'] = dfcs.apply(lambda row: cleanOwnerDataFunc(row['LinkField']), axis=1)
dfcs.head(1)

Unnamed: 0,ContractorName,State,LinkField
0,Ainsworth Irrigation District,NE,ainsworth irrigation districtne


In [15]:
keywords = dfcs['LinkField'].tolist() # create list of keywords to search for.
pattern = '|'.join(keywords) # create a regular expression pattern from the list
dfmatches = dfao[dfao['LinkField'].str.contains(pattern)] # search for rows where the name contains any of the strings in the list

dfmatches = pd.merge(dfmatches, dfcs, on='LinkField', how='left')
dfmatches.head()

Unnamed: 0,AllocationOwner,State_x,LinkField,ContractorName,State_y
0,Central Arizona Project,AZ,central arizona projectaz,Central Arizona Project,AZ
1,Maricopa Water District,AZ,maricopa water districtaz,Maricopa Water District,AZ
2,Us Bureau Of Reclamation Yuma Area Office,AZ,us bureau of reclamation yuma area officeaz,,
3,CALIFORNIA DEPARTMENT OF WATER RESOURCES,CA,california department of water resourcesca,California Department of Water Resources,CA
4,EL DORADO IRRIGATION DISTRICT,CA,el dorado irrigation districtca,El Dorado Irrigation District,CA


In [16]:
dfmatches.to_csv('JoinAttempts/Attempt_2.csv', index=False)

## Attempet #3 - Do a partial match on a matcing link field, use str contains, but do it both ways
- make a custom link based on name + state
- Left Join results, export, and inpsect by hand.
- lets try doing a match from owner -> contractor, then reverse with contractor -> owner, and combine the results.

Results
- only about 55 confirmed.

In [17]:
dfao = dfao_in.copy()
dfcs = dfcs_in.copy()

In [18]:
# Clean Owner info. Remove special characters
# matches any character that is not an uppercase or lowercase letter, a digit, or a whitespace character. 
# The ^ character inside the square brackets negates the match, so the pattern matches any character that is not in the specified set.

import re

def cleanOwnerDataFunc(Val):
    pattern = r"[^a-zA-Z0-9\s]" 
    Val = re.sub(pattern, "", Val).lower().strip()
    return Val

In [19]:
# Clean Owner info. Remove special characters
import regex as re

def cleanOwnerDataFunc(Val):
    Val = re.sub("[$@&.';,`/\)(-]", "", Val).lower().strip()
    return Val

In [20]:
dfao = dfao.assign(AllocationOwner=dfao['AllocationOwner'].str.split(',')).explode('AllocationOwner')
dfao = dfao.drop_duplicates().reset_index(drop=True)
dfao['LinkField'] = dfao['AllocationOwner'].astype(str) + dfao['State'].astype(str)
dfao['LinkField'] = dfao.apply(lambda row: cleanOwnerDataFunc(row['LinkField']), axis=1)
print(len(dfao))
dfao.head(1)

794433


Unnamed: 0,AllocationOwner,State,LinkField
0,`Kellerman Fred P,AZ,kellerman fred paz


In [21]:
dfcs['LinkField'] = dfcs['ContractorName'].astype(str) + dfcs['State'].astype(str)
dfcs['LinkField'] = dfcs.apply(lambda row: cleanOwnerDataFunc(row['LinkField']), axis=1)
print(len(dfcs))
dfcs.head(1)

238


Unnamed: 0,ContractorName,State,LinkField
0,Ainsworth Irrigation District,NE,ainsworth irrigation districtne


In [22]:
# check if contractor name is in AllocationOwner
keywords = dfcs['LinkField'].tolist() # create list of keywords to search for.
pattern = '|'.join(keywords) # create a regular expression pattern from the list
dfmatchesA = dfao[dfao['LinkField'].str.contains(pattern)] # search for rows where the name contains any of the strings in the list

dfmatchesA = pd.merge(dfmatchesA, dfcs, on='LinkField', how='left')
print(len(dfmatchesA))
dfmatchesA.head()

67


Unnamed: 0,AllocationOwner,State_x,LinkField,ContractorName,State_y
0,Central Arizona Project,AZ,central arizona projectaz,Central Arizona Project,AZ
1,Maricopa Water District,AZ,maricopa water districtaz,Maricopa Water District,AZ
2,Us Bureau Of Reclamation Yuma Area Office,AZ,us bureau of reclamation yuma area officeaz,,
3,CALIFORNIA DEPARTMENT OF WATER RESOURCES,CA,california department of water resourcesca,California Department of Water Resources,CA
4,EL DORADO IRRIGATION DISTRICT,CA,el dorado irrigation districtca,El Dorado Irrigation District,CA


In [23]:
# check if AllocationOwner is in contractor name
keywords = dfao['LinkField'].tolist() # create list of keywords to search for.
keywords = list(dict.fromkeys(keywords)) # remove duplicate entries
pattern = '|'.join(keywords) # create a regular expression pattern from the list
dfmatchesB = dfcs[dfcs['LinkField'].str.contains(pattern)] # search for rows where the name contains any of the strings in the list

dfmatchesB = pd.merge(dfmatchesB, dfao, on='LinkField', how='left')
print(len(dfmatchesB))
dfmatchesB.head()

error: bad escape \j at position 1319972

In [None]:
with pd.ExcelWriter('JoinAttempts/Attempt_3.xlsx') as writer:
    dfmatchesA.to_excel(writer, sheet_name='CNinAO')
    dfmatchesB.to_excel(writer, sheet_name='AOinCN')