In [None]:
%load_ext autotime
import pandas as pd
import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
#import matplotlib.pyplot as plt
#import plotly_express as px
#from tqdm import tqdm
#from tqdm.tqdm_notebook import tqdm_notebook

In [None]:
# Read the cleaned groundwater-quality CSV into a DataFrame, then show the
# first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="clean_chemical_master_gwq_data.csv")
df.head(n=5)

In [None]:
# Geocoding is subject to a quota, so rows that repeat a "well_id" will be
# removed before any lookups are made. Once each well has its county, that
# data will be joined back onto the complete dataset.
# Row count of the full dataframe, for comparison after de-duplication:
df.shape[0]

In [None]:
# well_id is an identifier, so store it as text
df['well_id'] = df['well_id'].astype(str)
# Both coordinate columns are stored as floating point
df[['latitude', 'longitude']] = df[['latitude', 'longitude']].astype(float)

In [None]:
# Build a new dataframe that keeps only the first row seen for each well_id
df_dup = df.drop_duplicates(
    subset='well_id',
    keep='first',
)

# Preview the de-duplicated data
df_dup.head(n=5)

In [None]:
# Row count after removing repeated well_id values
df_dup.shape[0]

In [None]:
# Set up the Nominatim client used to reverse geocode lat/lon pairs into
# addresses (and from those, counties). A single record is tried first to
# validate the approach. Each request times out after 3 seconds.
geolocator = Nominatim(
    timeout=3,
    user_agent="Cal_gwq_draft.py",
)

In [None]:
# Trial lookup: reverse geocode one hard-coded "lat, lon" pair
location = geolocator.reverse(query="34.624472, -118.185056")

In [None]:
# Break the comma-separated address into its components and show them
geo_string = location.address.split(sep=",")
print(geo_string)

In [None]:
# Print every address component that mentions a county
for i in geo_string:
    if 'County' not in i:
        continue
    print(i)

In [None]:
# Reverse geocode every unique well and store the result in a "county" column.
#
# The rate-limited wrapper is built once, outside the loop, and is the callable
# actually used for every request so calls are spaced at least 1 second apart
# (the original author estimated roughly 1 hour for the full run).
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

county = []
# Iterate over df_dup itself (not range(len(df))) so we never index past the
# end of the de-duplicated frame and make exactly one request per unique well.
for lat, lon in zip(df_dup['latitude'], df_dup['longitude']):
    county_name = None  # stays None when no county can be determined

    # Skip the lookup entirely when a coordinate is missing
    if not (pd.isna(lat) or pd.isna(lon)):
        # get the address data for this row using lat/lon
        location = reverse(f"{lat}, {lon}")
        # The geocoder may return None (no match, or a swallowed service
        # error after retries), so guard before reading .address
        if location is not None:
            # split the address into its comma-separated components and keep
            # the first one that names a county
            for part in location.address.split(","):
                if 'County' in part:
                    county_name = part.strip()
                    break

    # Append exactly one entry per row so the list lines up with df_dup
    county.append(county_name)

# Attach the results as a new column; assign() returns a new frame, which
# avoids chained-assignment warnings on the de-duplicated slice.
df_dup = df_dup.assign(county=county)

# displaying data
df_dup.head()

In [None]:
# Save the de-duplicated dataframe to CSV (header row included, no index column)
df_dup.to_csv(
    "gwq-drop-dup-well_id-county.csv",
    header=True,
    index=None,
)