In [1]:
import pandas as pd
import numpy as np
import requests
import seaborn as sns
import addfips
pd.set_option('display.max_columns', None)

from arcgis.gis import GIS
from arcgis.geocoding import geocode, reverse_geocode
from arcgis.geometry import Point

In [2]:
# Site metadata tables, one CSV per source; read in the order
# ca_stations, nwis, ngwmn.
ca_stations, nwis, ngwmn = (
    pd.read_csv(csv_name)
    for csv_name in (
        'CA_stations_v2.0.csv',
        'NWIS_gwl_meta_v2.0.csv',
        'NGWMN_gwl_meta_v2.0.csv',
    )
)

# Main trends table; later cells attach site metadata to it.
trends = pd.read_csv('GW_trendsout_v2.0.csv')

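Using the ca_stations, nwis, and ngwmn datasets, pull all site information into a single dictionary keyed by site ID, where each value is a dictionary of that site's attributes (latitude, longitude, well_depth, plus aquifer code or gse where available).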

In [3]:
# Site lookup: site id -> dict of that site's attributes.
# California stations come first; they provide 'gse' but no aquifer code.
all_dict = {
    station['site_code']: {
        'latitude': station['latitude'],
        'longitude': station['longitude'],
        'gse': station['gse'],
        'well_depth': station['well_depth'],
    }
    for _, station in ca_stations.iterrows()
}

# NWIS sites are keyed as '<agency_cd>.<site_no>' and provide an aquifer code
# but no 'gse'. An NWIS key that collides with an existing key overwrites it.
all_dict.update({
    site['agency_cd'] + '.' + str(site['site_no']): {
        'latitude': site['dec_lat_va'],
        'longitude': site['dec_long_va'],
        'well_depth': site['well_depth_va'],
        'aqfr_code': site['nat_aqfr_cd'],
    }
    for _, site in nwis.iterrows()
})

In [4]:
# Add the NGWMN sites to the lookup. The ':' in MY_SITEID is replaced with '.'
# before the id is used as a key (presumably so it matches the
# '<agency>.<site_no>' key style used for the NWIS entries -- confirm).
# An NGWMN key that already exists in all_dict overwrites the earlier entry.
for _, row in ngwmn.iterrows():
    site_key = row['MY_SITEID'].replace(':', '.')
    all_dict[site_key] = {
        'latitude': row['DEC_LAT_VA'],
        'longitude': row['DEC_LONG_V'],
        'well_depth': row['WELL_DEPTH'],
        'aqfr_code': row['NAT_AQUIFE'],
    }

Create columns with this new information in the main 'trends' dataframe

In [5]:
# Placeholder columns, initialised to None; the cells below fill them in.
for new_col in ('latitude', 'longitude', 'well_depth', 'county_name', 'county_fips'):
    trends[new_col] = None

Add site information to each site in the trends dataset

In [None]:
# site_ids in trends that are not present in the ca_stations, nwis, or ngwmn
# datasets (one entry per affected row, so a site can appear more than once).
miss_list = []

for count, (i, row) in enumerate(trends.iterrows(), start=1):
    if count % 1000 == 0:
        print(count)  # coarse progress indicator

    site_id = row['site_id']

    # Explicit lookup instead of a bare `except:` so that only a genuinely
    # missing site is treated as a miss; any other error still surfaces.
    entry = all_dict.get(site_id)
    if entry is None:
        miss_list.append(site_id)
        continue

    trends.loc[i, 'latitude'] = entry['latitude']
    trends.loc[i, 'longitude'] = entry['longitude']
    trends.loc[i, 'well_depth'] = entry['well_depth']


Use the location data to add the county and county fips code to each site in the trends dataset

In [None]:
# GIS() is created without credentials.
# NOTE(review): reverse_geocode presumably uses this active connection -- confirm.
gis = GIS()

# Built once: the FIPS lookup object does not depend on the row being processed.
af = addfips.AddFIPS()

# Set for constant-time membership tests (miss_list itself is left untouched).
missing_sites = set(miss_list)

count = 0
no_fips = []  # site_ids for which no county name / FIPS code could be determined

for i, row in trends.iterrows():
    count += 1
    if count % 1000 == 0:
        print(count)  # coarse progress indicator

    site_id = row['site_id']
    if site_id in missing_sites:
        continue  # no coordinates were found for this site

    # Latitude and longitude coordinates
    latitude = row['latitude']
    longitude = row['longitude']
    if pd.isna(latitude) or pd.isna(longitude):
        # The site was in the lookup but has no usable coordinates.
        no_fips.append(site_id)
        continue

    # Create a point object for ArcGIS and look up the address at that point
    point = Point({'x': longitude, 'y': latitude})
    result = reverse_geocode(point)

    # Pull the county ('Subregion') and state ('Region') from the address
    address = result['address']
    county = address.get('Subregion')
    st = address.get('Region')
    if not county or not st:
        no_fips.append(site_id)
        continue

    fips = af.get_county_fips(county, state=st)
    if not fips:
        # County name is still recorded below; only the code is missing.
        no_fips.append(site_id)

    trends.loc[i, 'county_name'] = county
    trends.loc[i, 'county_fips'] = fips
    

## Filter trends -- get rid of duplicates, filter by year cutoff

We can change how the year cutoffs are set here.

In [9]:
# Year window: start year within 4 years of 2000 (exclusive), end year after 2016.
starts_near_2000 = (trends['trend_period_start_yr'] - 2000).abs() < 4
ends_after_2016 = trends['trend_period_end_yr'] > 2016

trend_filt = (
    trends.loc[starts_near_2000 & ends_after_2016]
    # the 'mean' water level is the data point of interest
    .loc[lambda frame: frame['metric_id'] == 'mean']
    # rows that agree on site id, slope, decadal change and start year are
    # exact repeats, so only the first of each is kept
    .drop_duplicates(subset=['site_id', 'slope', 'decadal_change', 'trend_period_start_yr'])
)

Filter multiple data entries for the same site. We want to keep the shortest time interval possible for each site, i.e. the entry with the most recent start year.

In [10]:
# Keep one row per site: the one with the most recent start year, i.e. the
# shortest trend interval. The previous loop never updated its running maximum,
# so it kept whichever duplicate came last instead of the latest start year.
#
# idxmax returns the index label of the largest start year within each site;
# on a tie the first matching row (in current row order) is kept.
# Assumes the index labels of trend_filt are unique and that start years are
# non-null (the year filter applied earlier only passes rows with a start year).
latest_idx = trend_filt.groupby('site_id')['trend_period_start_yr'].idxmax()

# groupby skips rows whose site_id is missing; such rows were never treated as
# duplicates of each other before, so they are all kept here as well.
keep_mask = trend_filt['site_id'].isna() | trend_filt.index.isin(latest_idx)

# Build a new frame rather than dropping in place from a filtered slice.
trend_filt = trend_filt[keep_mask].reset_index(drop=True)

Export the cleaned dataframe.

In [38]:
# Write the cleaned trends table without the index column.
# The 'merge_data' directory must already exist.
output_csv = 'merge_data/trends_clean.csv'
trend_filt.to_csv(output_csv, index=False)