In [32]:
import pandas as pd 
import numpy as np 
from scipy import  stats 
import matplotlib.pyplot as plt 
import seaborn as sns
import os 


In [33]:
glorich_df = pd.read_csv("data/final_glorich_dataset.csv")
imputed_hydro = pd.read_csv("data/imputed_conditions_11-15.csv")

In [34]:
glorich_df.columns

Index(['STAT_ID', 'Latitude', 'Longitude', 'geometry', 'sc', 'ss', 'su', 'mt',
       'va', 'vb', 'vi', 'pa', 'pb', 'pi', 'GLC_Artificial', 'GLC_Managed',
       'GLC_Water', 'GLC_Aquatic_Veg', 'GLC_PERC_COV', 'Popdens_00', 'Soil_pH',
       'SOC', 'Soil_wetness', 'pH', 'SpecCond25C', 'Alkalinity', 'Cl', 'SO4',
       'DIP', 'date'],
      dtype='object')

In [35]:
glorich_df[['STAT_ID', 'Soil_wetness', 'date']].sort_values(by='STAT_ID').tail(10)

Unnamed: 0,STAT_ID,Soil_wetness,date
41917,401573,46.78,2007-04-01
20089,401573,46.78,2004-04-01
42962,401573,46.78,2007-05-01
14392,401573,46.78,2003-05-01
20323,401573,46.78,2004-05-01
14049,401573,46.78,2003-04-01
46278,401573,46.78,2007-12-01
3412,401573,46.78,2001-07-01
39048,401573,46.78,2006-11-01
46065,401573,46.78,2007-11-01


In [36]:
glorich_df = glorich_df.drop(columns = ['pH', 'SpecCond25C', 'Alkalinity', 'Cl', 'SO4',
       'DIP', 'date'])

glorich_df = glorich_df.groupby('STAT_ID').max()

In [37]:
glorich_df.head()

Unnamed: 0_level_0,Latitude,Longitude,geometry,sc,ss,su,mt,va,vb,vi,...,pi,GLC_Artificial,GLC_Managed,GLC_Water,GLC_Aquatic_Veg,GLC_PERC_COV,Popdens_00,Soil_pH,SOC,Soil_wetness
STAT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
400001,-32.31,18.33,POINT (18.33 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005508,0.0,100.0,4.75,6.7,3.08,34.46
400002,-32.31,18.34,POINT (18.34 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005508,0.0,100.0,4.75,6.7,3.08,34.46
400003,-32.31,18.34,POINT (18.34 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005511,0.0,100.0,4.7,6.7,3.08,34.46
400004,-32.31,18.35,POINT (18.35 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005511,0.0,100.0,4.7,6.7,3.08,34.46
400005,-32.34,18.42,POINT (18.42 -32.34),0.0,0.38,0.22,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.00283,0.0,100.0,4.46,6.7,3.12,34.46


In [38]:
imputed_hydro.head()

Unnamed: 0,STAT_ID,date,Alkalinity,Cl,DIP,SO4,SpecCond25C,pH,Alkalinity_reliability,Cl_reliability,DIP_reliability,SO4_reliability,SpecCond25C_reliability,pH_reliability
0,400020,2011-01-01,,,5.878731,,84.003333,7.796,,,0.7,,1.0,1.0
1,400020,2011-01-15,,,5.878731,,84.003333,7.796,,,0.7,,1.0,1.0
2,400020,2011-02-01,,,5.878731,,84.003333,7.796,,,0.7,,1.0,1.0
3,400020,2011-02-15,,,5.878731,,84.003333,7.796,,,0.7,,1.0,1.0
4,400020,2011-03-01,,,5.878731,,84.003333,7.796,,,0.7,,1.0,1.0


In [39]:
imputed_glorich_df = pd.merge(glorich_df.reset_index()[['STAT_ID', 'Latitude', 'Longitude']], imputed_hydro, on='STAT_ID', how='left')

imputed_glorich_df = imputed_glorich_df.sort_values('SpecCond25C_reliability', ascending=False).drop_duplicates(subset=['Latitude', 'Longitude', 'date'], keep='first')

imputed_glorich_df.head(30)

Unnamed: 0,STAT_ID,Latitude,Longitude,date,Alkalinity,Cl,DIP,SO4,SpecCond25C,pH,Alkalinity_reliability,Cl_reliability,DIP_reliability,SO4_reliability,SpecCond25C_reliability,pH_reliability
110444,401569,-28.38,29.01,2015-12-15,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110443,401569,-28.38,29.01,2015-12-01,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110442,401569,-28.38,29.01,2015-11-15,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110441,401569,-28.38,29.01,2015-11-01,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110440,401569,-28.38,29.01,2015-10-15,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110439,401569,-28.38,29.01,2015-10-01,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110438,401569,-28.38,29.01,2015-09-15,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110437,401569,-28.38,29.01,2015-09-01,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110436,401569,-28.38,29.01,2015-08-15,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0
110435,401569,-28.38,29.01,2015-08-01,1371.704215,209.698814,1.578395,79.4875,16.625249,7.789136,1.0,1.0,1.0,0.4,1.0,1.0


In [40]:
imputed_glorich_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105393 entries, 110444 to 110804
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   STAT_ID                  105393 non-null  int64  
 1   Latitude                 105393 non-null  float64
 2   Longitude                105393 non-null  float64
 3   date                     105120 non-null  object 
 4   Alkalinity               74880 non-null   float64
 5   Cl                       83520 non-null   float64
 6   DIP                      95880 non-null   float64
 7   SO4                      87720 non-null   float64
 8   SpecCond25C              99240 non-null   float64
 9   pH                       104040 non-null  float64
 10  Alkalinity_reliability   74880 non-null   float64
 11  Cl_reliability           83520 non-null   float64
 12  DIP_reliability          95880 non-null   float64
 13  SO4_reliability          87720 non-null   float64
 14  Spec

In [41]:
imputed_glorich_df.to_csv("data/final_imputed_hydrochem.csv", index=False)

In [42]:
glorich_df.columns

Index(['Latitude', 'Longitude', 'geometry', 'sc', 'ss', 'su', 'mt', 'va', 'vb',
       'vi', 'pa', 'pb', 'pi', 'GLC_Artificial', 'GLC_Managed', 'GLC_Water',
       'GLC_Aquatic_Veg', 'GLC_PERC_COV', 'Popdens_00', 'Soil_pH', 'SOC',
       'Soil_wetness'],
      dtype='object')

In [43]:
glorich_df = glorich_df.reset_index()
glorich_df.head()

Unnamed: 0,STAT_ID,Latitude,Longitude,geometry,sc,ss,su,mt,va,vb,...,pi,GLC_Artificial,GLC_Managed,GLC_Water,GLC_Aquatic_Veg,GLC_PERC_COV,Popdens_00,Soil_pH,SOC,Soil_wetness
0,400001,-32.31,18.33,POINT (18.33 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005508,0.0,100.0,4.75,6.7,3.08,34.46
1,400002,-32.31,18.34,POINT (18.34 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005508,0.0,100.0,4.75,6.7,3.08,34.46
2,400003,-32.31,18.34,POINT (18.34 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005511,0.0,100.0,4.7,6.7,3.08,34.46
3,400004,-32.31,18.35,POINT (18.35 -32.31),0.0,0.38,0.23,0.0,0.0,0.0,...,0.0,0.0,0.25,0.005511,0.0,100.0,4.7,6.7,3.08,34.46
4,400005,-32.34,18.42,POINT (18.42 -32.34),0.0,0.38,0.22,0.0,0.0,0.0,...,0.0,0.0,0.25,0.00283,0.0,100.0,4.46,6.7,3.12,34.46


In [44]:
glorich_df.to_csv("data/stations_with_conditions.csv", index=False)

In [45]:
# [['STAT_ID', 'Latitude', 'Longitude', 'geometry']]