In [132]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

In [133]:
# Read the training and validation tables from CSV files in the working directory
ts_df, val_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Comprehensive_Training_Dataset.csv',
                     'Comprehensive_Validation_Dataset.csv')
)

In [134]:
# List the training columns to see which features and targets are available
ts_df.columns

Index(['system:index', 'Dissolved Reactive Phosphorus',
       'Electrical Conductance', 'Index', 'Latitude', 'Longitude', 'MNDWI',
       'NDMI', 'Sample Date', 'Total Alkalinity', 'elevation', 'green',
       'humidity', 'modis_lc_class', 'nir', 'pet', 'population_3km',
       'precip_0d', 'precip_1d', 'precip_3d_sum', 'slope', 'soil_clay',
       'soil_ph', 'soil_sand', 'swir16', 'swir22', 'temp_c',
       'water_occurrence', '.geo'],
      dtype='object')

In [135]:
# Dimensions of the validation frame as loaded, before any columns are merged in
val_df.shape

(200, 20)

In [136]:
# Load the original validation file; its preview shows band/index columns
# (nir, green, swir16, swir22, NDMI, MNDWI) alongside the shared key 'Index'.
og_val_df = pd.read_csv('Validation_Dataset.csv')
og_val_df.head()

Unnamed: 0,Index,Latitude,Longitude,Sample Date,nir,green,swir16,swir22,NDMI,MNDWI,pet
0,1,-32.043333,27.822778,1/9/2014,15229.0,12868.0,14797.0,12421.0,0.014388,-0.069727,161.90001
1,2,-33.329167,26.0775,16-09-2015,,,,,,,177.6
2,3,-32.991639,27.640028,7/5/2015,16221.0,9304.5,12536.5,9958.0,0.128123,-0.147979,158.40001
3,4,-34.096389,24.439167,7/2/2012,,,,,,,130.0
4,5,-32.000556,28.581667,1/10/2014,9125.0,11100.5,9455.0,8711.0,-0.017761,0.080052,152.5


In [137]:
# Attach the columns that exist only in the original validation file,
# matching rows on the shared 'Index' key.
common_cols = set(val_df.columns) & set(og_val_df.columns)
unique_cols = [name for name in og_val_df.columns if name not in common_cols]

# Order rows by 'Index' first, then merge (pandas' default inner join) so only
# 'Index' plus the not-yet-present columns come across from og_val_df.
val_df = (
    val_df
    .sort_values(by='Index')
    .merge(og_val_df[['Index', *unique_cols]], on='Index')
)
val_df.columns

Index(['system:index', 'Index', 'Latitude', 'Longitude', 'Sample Date',
       'elevation', 'humidity', 'modis_lc_class', 'pet', 'population_3km',
       'precip_0d', 'precip_1d', 'precip_3d_sum', 'slope', 'soil_clay',
       'soil_ph', 'soil_sand', 'temp_c', 'water_occurrence', '.geo', 'nir',
       'green', 'swir16', 'swir22', 'NDMI', 'MNDWI'],
      dtype='object')

In [138]:
# Re-check dimensions after the merge: the row count should be unchanged,
# with only the extra columns from og_val_df added.
val_df.shape

(200, 26)

In [139]:
# Columns present in the training set but absent from the validation set
missing_cols = set(ts_df.columns).difference(val_df.columns)
missing_cols



{'Dissolved Reactive Phosphorus', 'Electrical Conductance', 'Total Alkalinity'}

In [140]:
# Function for dropping columns that won't be used
def df_cleaner(df, drop_cols=('system:index', 'Latitude', 'Longitude', 'Sample Date',
                              '.geo', 'Index', 'nir', 'green', 'swir16')):
    """Return a copy of ``df`` sorted by ``Index`` with unused columns removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains an ``Index`` column and every column named in
        ``drop_cols``.
    drop_cols : iterable of str, optional
        Columns to remove after sorting. The default is the set of columns
        this notebook does not use for modelling.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Row labels are kept as
        they were before sorting (they are not reset here).

    Raises
    ------
    KeyError
        If ``Index`` or any column in ``drop_cols`` is missing from ``df``.
    """
    # Sort before dropping: 'Index' is the sort key and is itself in the
    # default drop list.
    ordered = df.sort_values(by='Index')
    return ordered.drop(columns=list(drop_cols))

# Clean the training and validation frames with the same rules, keeping their
# order (training first, validation second) in `cleaned_dfs`.
# NOTE: run this before `ts_df` / `val_df` are rebound to the cleaned frames;
# df_cleaner sorts by 'Index', which the cleaned frames no longer have.
dfs = [ts_df, val_df]
cleaned_dfs = []
for df in dfs:
    print('\n' + '=' * 30)
    print('Cleaning DataFrames...')
    cleaned_df = df_cleaner(df)
    cleaned_dfs.append(cleaned_df)
    # DataFrame.info() writes its summary to stdout and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None" line.
    cleaned_df.info()


Cleaning DataFrames...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9319 entries, 0 to 9318
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Dissolved Reactive Phosphorus  9319 non-null   float64
 1   Electrical Conductance         9319 non-null   float64
 2   MNDWI                          8234 non-null   float64
 3   NDMI                           8234 non-null   float64
 4   Total Alkalinity               9319 non-null   float64
 5   elevation                      9319 non-null   int64  
 6   humidity                       9319 non-null   float64
 7   modis_lc_class                 9319 non-null   int64  
 8   pet                            9319 non-null   float64
 9   population_3km                 9319 non-null   float64
 10  precip_0d                      9319 non-null   float64
 11  precip_1d                      9319 non-null   float64
 12  precip_3d_sum           

In [141]:
# cleaned_dfs holds exactly [training, validation], in that order
ts_df, val_df = cleaned_dfs

In [143]:
# Replace the row labels left over from sorting with a fresh 0..n-1 range
ts_df, val_df = (
    frame.reset_index(drop=True)
    for frame in (ts_df, val_df)
)

In [146]:
# Columns left in the validation frame after cleaning
val_df.columns

Index(['elevation', 'humidity', 'modis_lc_class', 'pet', 'population_3km',
       'precip_0d', 'precip_1d', 'precip_3d_sum', 'slope', 'soil_clay',
       'soil_ph', 'soil_sand', 'temp_c', 'water_occurrence', 'swir22', 'NDMI',
       'MNDWI'],
      dtype='object')

## TRAINING AND EVALUATION

In [147]:
# Model inputs: the columns of the cleaned validation frame, in that frame's
# order. Selecting by this list gives both frames the same column order.
feature_cols = ['elevation', 'humidity', 'modis_lc_class', 'pet', 'population_3km',
       'precip_0d', 'precip_1d', 'precip_3d_sum', 'slope', 'soil_clay',
       'soil_ph', 'soil_sand', 'temp_c', 'water_occurrence', 'swir22', 'NDMI',
       'MNDWI']

# Prediction targets: the three columns present in the training set but
# missing from the validation set.
target_cols = ['Dissolved Reactive Phosphorus', 'Electrical Conductance', 'Total Alkalinity']

In [None]:
# Split Function

