In [27]:
import pandas as pd 
import numpy as np 


Two datasets: train+LandsatAPI.csv and landsat and hydrochem training.csv

We are merging them together for the finalized training dataset. 

In [28]:
landsat_df = pd.read_csv('data/train+landsatAPI.csv')
water_df = pd.read_csv('data/landsat_and_hydrochem_training.csv')

In [29]:
landsat_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus,pet,_merge_terra,nir,green,swir16,swir22,NDMI,MNDWI,_merge_landsat,Impute_Method
0,-34.40583,19.60056,2011-07-06,35.054,1224.0,10.0,158.7,both,9226.0,8526.0,8784.0,8359.0,0.024542,-0.014905,both,API_32d
1,-34.40583,19.60056,2011-08-02,36.247,1237.0,10.0,174.1,both,17890.0,10536.0,16607.0,13854.0,0.037192,-0.223667,both,API_16d
2,-34.40583,19.60056,2011-08-30,37.607,1034.0,10.0,174.1,both,6635.5,4251.0,6034.5,4820.5,0.047435,-0.173399,both,Original
3,-34.40583,19.60056,2011-09-27,27.082,929.0,108.0,174.1,both,18317.0,9145.5,13913.5,10727.5,0.136625,-0.206774,both,Original
4,-34.40583,19.60056,2011-10-27,40.357,1245.0,10.0,174.1,both,18317.0,9145.5,13913.5,10727.5,0.136625,-0.206774,both,Original


In [30]:
water_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,STAT_ID,dist_m,dist_km,Latitude_glorich,Longitude_glorich,sc,ss,...,SO4,SpecCond25C,pH,Alkalinity_reliability,Cl_reliability,DIP_reliability,SO4_reliability,SpecCond25C_reliability,pH_reliability,date_diff_days
0,-26.984722,26.632278,2015-12-23,401480,642.074296,0.642074,-26.98,26.63,0.31,0.16,...,1397.228644,88.611627,8.069654,0.4,0.7,0.7,1.0,1.0,1.0,8.0
1,-26.861111,28.884722,2015-12-23,401454,543.624662,0.543625,-26.86,28.88,0.0,0.54,...,651.088335,57.604498,7.8,1.0,1.0,0.4,1.0,1.0,0.4,8.0
2,-26.861111,28.884722,2011-01-03,401454,543.624662,0.543625,-26.86,28.88,0.0,0.54,...,651.088335,57.604498,7.8,1.0,1.0,0.4,1.0,1.0,0.4,2.0
3,-26.45,28.085833,2011-01-03,401494,649.32659,0.649327,-26.45,28.08,0.27,0.29,...,1716.721075,77.964187,7.435217,0.4,1.0,1.0,1.0,1.0,0.7,2.0
4,-27.671111,27.236944,2011-01-03,401550,785.515266,0.785515,-27.67,27.23,0.0,0.91,...,277.53879,35.916245,8.100416,0.4,1.0,1.0,1.0,1.0,1.0,2.0


In [31]:
landsat_df.columns

Index(['Latitude', 'Longitude', 'Sample Date', 'Total Alkalinity',
       'Electrical Conductance', 'Dissolved Reactive Phosphorus', 'pet',
       '_merge_terra', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI',
       '_merge_landsat', 'Impute_Method'],
      dtype='object')

In [32]:
print(f'landsat length: {len(landsat_df)}')
print(f'hydrochem df length: {len(water_df)}')
print(f'Unique landsat keys: {landsat_df[['Latitude', 'Longitude', 'Sample Date']].nunique()}')
print(f'Unique hydrochem keys: {water_df[['Latitude', 'Longitude', 'Sample Date']].nunique()}')

landsat length: 9319
hydrochem df length: 9319
Unique landsat keys: Latitude        161
Longitude       161
Sample Date    1364
dtype: int64
Unique hydrochem keys: Latitude        161
Longitude       161
Sample Date    1364
dtype: int64


In [33]:
landsat_keys = landsat_df[['Latitude', 'Longitude', 'Sample Date']].drop_duplicates()
water_keys = water_df[['Latitude', 'Longitude', 'Sample Date']].drop_duplicates()

print(len(landsat_keys))
print(len(water_keys))

9319
9319


In [34]:
water_df.isna().sum()

Latitude                     0
Longitude                    0
Sample Date                  0
STAT_ID                      0
dist_m                       0
dist_km                      0
Latitude_glorich           923
Longitude_glorich          923
sc                         923
ss                         923
su                         923
mt                         923
va                         923
vb                         923
vi                         923
pa                         923
pb                         923
pi                         923
GLC_Artificial             923
GLC_Managed                923
GLC_Water                  923
GLC_Aquatic_Veg            923
GLC_PERC_COV               923
Popdens_00                 923
Soil_pH                    923
SOC                        923
Soil_wetness               923
date                       923
Alkalinity                 987
Cl                         987
DIP                        987
SO4                        987
SpecCond

In [73]:
# Drop lat, lon, date from water_df before concat to avoid duplicates
landsat_cols = landsat_df.drop(columns=['Latitude', 'Longitude', 'Sample Date'])

# Concat side by side
merged_df = pd.concat([water_df, landsat_cols], axis=1)

print(f"Rows: {len(merged_df)}")
print(f"Columns: {merged_df.shape[1]}")
merged_df.head()

Rows: 9319
Columns: 58


Unnamed: 0,Latitude,Longitude,Sample Date,STAT_ID,dist_m,dist_km,Latitude_glorich,Longitude_glorich,sc,ss,...,nir,green,swir16,swir22,NDMI,MNDWI,_merge_landsat,Impute_Method,lat_key,lon_key
0,-34.405833,19.600556,2011-07-06,400538,61066.127691,61.066128,-33.99,19.82,0.0,1.0,...,9226.0,8526.0,8784.0,8359.0,0.024542,-0.014905,both,API_32d,-34.406,19.601
1,-34.405833,19.600556,2011-08-02,400538,61066.127691,61.066128,-33.99,19.82,0.0,1.0,...,17890.0,10536.0,16607.0,13854.0,0.037192,-0.223667,both,API_16d,-34.406,19.601
2,-34.405833,19.600556,2011-08-30,400538,61066.127691,61.066128,-33.99,19.82,0.0,1.0,...,6635.5,4251.0,6034.5,4820.5,0.047435,-0.173399,both,Original,-34.406,19.601
3,-34.405833,19.600556,2011-09-27,400538,61066.127691,61.066128,-33.99,19.82,0.0,1.0,...,18317.0,9145.5,13913.5,10727.5,0.136625,-0.206774,both,Original,-34.406,19.601
4,-34.405833,19.600556,2011-10-27,400538,61066.127691,61.066128,-33.99,19.82,0.0,1.0,...,18317.0,9145.5,13913.5,10727.5,0.136625,-0.206774,both,Original,-34.406,19.601
