In [18]:
import pandas as pd
import numpy as np
import joblib as jb
import tifffile
from pathlib import Path
from glob import glob
import matplotlib.pyplot as plt
from satellite_bathymetry.preprocessing import get_coord_from_pixel_pos, get_pixel_from_coord, ndwi, pixel_ndwi, pixel_log_ratio
from satellite_bathymetry.model_selection import cross_validation
from sklearn.preprocessing import KBinsDiscretizer, FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

from scipy import stats

In [19]:
df = jb.load('../data/generated/df_newimages_bands_downside.pkl.z')
df.head()

Unnamed: 0,x,y,z,b1,b2,b3,b4,b5,b6,b7,b8,b2b4,b3b4,ndwi15,ndwi24,ndwi53,cspmb7
0,233,1130,3.195862,0.1199,0.0866,0.0667,0.0464,0.049,0.0316,0.0283,0.0238,1.162614,1.094573,0.419775,0.302256,-0.152982,23.382784
1,233,1131,3.27303,0.1199,0.088,0.0668,0.0457,0.049,0.0316,0.0283,0.0237,1.171434,1.099318,0.419775,0.31638,-0.153713,23.382784
2,233,1132,3.299687,0.1199,0.0879,0.0666,0.0461,0.0488,0.0324,0.0281,0.0238,1.168473,1.096035,0.421458,0.31194,-0.154246,23.158824
3,233,1133,3.268182,0.1199,0.0882,0.0692,0.0452,0.0488,0.0324,0.0281,0.0232,1.175411,1.111754,0.421458,0.322339,-0.172881,23.158824
4,233,1134,3.278125,0.1196,0.0884,0.0677,0.0454,0.0489,0.0323,0.0285,0.0238,1.174645,1.104724,0.419585,0.321375,-0.161235,23.607309


In [20]:
coords_x = []
coords_y = []
for x, y in zip(df.x, df.y):
    coord_x, coord_y = get_coord_from_pixel_pos(x, y)
    coords_x.append(coord_x)
    coords_y.append(coord_y)

df['global_x'] = coords_x
df['global_y'] = coords_y

In [21]:
# Split before bucketize
X_train, X_val, y_train, y_val = train_test_split(df.drop(['z', 'x', 'y'], axis=1), df.z, test_size=0.3, random_state=42)

In [22]:
Kbins = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
Kbins.fit(X_train[['global_x', 'global_y']])

KBinsDiscretizer(encode='ordinal', n_bins=20, strategy='uniform')

In [23]:
kbins_coords_train = Kbins.transform(X_train[['global_x', 'global_y']])
kbins_coords_df_train = pd.DataFrame(kbins_coords_train, columns=['kbins_coords_x','kbins_coords_y'], index=X_train.index)

kbins_coords_val = Kbins.transform(X_val[['global_x', 'global_y']])
kbins_coords_df_val = pd.DataFrame(kbins_coords_val, columns=['kbins_coords_x','kbins_coords_y'], index=X_val.index)

In [24]:
X_train = pd.concat([X_train, kbins_coords_df_train], axis=1).drop(['global_x', 'global_y'], axis=1)
X_val = pd.concat([X_val, kbins_coords_df_val], axis=1).drop(['global_x', 'global_y'], axis=1)

In [25]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

In [26]:
p_rf = rf.predict(X_val)
print('rf r2:', r2_score(y_val, p_rf))
print('rf mae:', mean_absolute_error(y_val, p_rf))
print('rf mse:', mean_squared_error(y_val, p_rf))
print('rf rmse:', np.sqrt(mean_squared_error(y_val, p_rf)))

rf r2: 0.9284032397042754
rf mae: 0.7114590513797363
rf mse: 1.4533177117595588
rf rmse: 1.2055362755884034


In [45]:
train_data = X_train.copy()
val_data = X_val.copy()

train_data['x'] = df[df.index.isin(X_train.index)].x
train_data['y'] = df[df.index.isin(X_train.index)].y
train_data['z'] = df[df.index.isin(X_train.index)].z


val_data['x'X] = df[df.index.isin(X_val.index)].x
val_data['y'] = df[df.index.isin(X_val.index)].y
val_data['z'] = df[df.index.isin(X_val.index)].z

In [47]:
# Dump data
jb.dump(X_train, 'output_data/train_data_bestmodel.pkl.z')

jb.dump(X_val, 'output_data/val_data_bestmodel.pkl.z')

jb.dump(p_rf, 'output_data/predict_rf_best.pkl.z')

['output_data/predict_rf_best.pkl.z']

In [49]:
# Save model
jb.dump(rf, 'models/rf_binscoords_20.pkl.z')

['models/rf_binscoords_20.pkl.z']

In [50]:
# Baseline

In [54]:
X_train_base = X_train.drop(['kbins_coords_x', 'kbins_coords_y', 'x', 'y', 'z'], axis=1)
y_train_base = X_train.z

X_val_base = X_val.drop(['kbins_coords_x', 'kbins_coords_y', 'x', 'y', 'z'], axis=1)
y_val_base = X_val.z

In [55]:
rf_base = RandomForestRegressor()
rf_base.fit(X_train_base, y_train_base)

RandomForestRegressor()

In [60]:
p_base = rf_base.predict(X_val_base)
print('rf r2:', r2_score(y_val, p_base))
print('rf mae:', mean_absolute_error(y_val, p_base))
print('rf mse:', mean_squared_error(y_val, p_base))
print('rf rmse:', np.sqrt(mean_squared_error(y_val, p_base)))

rf r2: 0.7697874495529399
rf mae: 1.4612612532291767
rf mse: 4.673004416011722
rf rmse: 2.161713305693362


In [61]:
jb.dump(p_base, 'output_data/predict_rf_baseline.pkl.z')

['output_data/predict_rf_baseline.pkl.z']