In [1]:
import pandas as pd
import numpy as np
import joblib as jb
import tifffile
from pathlib import Path
from glob import glob
import matplotlib.pyplot as plt
from satellite_bathymetry.preprocessing import get_coord_from_pixel_pos, get_pixel_from_coord, ndwi, pixel_ndwi, pixel_log_ratio
from satellite_bathymetry.model_selection import cross_validation
from sklearn.preprocessing import KBinsDiscretizer, FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score

from scipy import stats

In [2]:
df = jb.load('../data/generated/df_newimages_bands_downside.pkl.z')
df.head()

Unnamed: 0,x,y,z,b0,b1,b2,b3,b4,b5,b6,b7,b2b4,b3b4,ndwi15,ndwi24,ndwi53
0,226,1010,2.117692,0.1225,0.0919,0.0721,0.0516,0.0528,0.0337,0.0311,0.0256,1.078543,0.994204,0.463376,0.154524,-0.209848
1,227,1010,2.0965,0.1225,0.092,0.0712,0.0512,0.0528,0.0337,0.0311,0.0255,1.075376,0.992242,0.463803,0.148387,-0.206125
2,228,1010,2.197059,0.1221,0.0917,0.072,0.0508,0.0531,0.0332,0.0323,0.0255,1.076655,0.988852,0.468375,0.151079,-0.209524
3,229,1010,2.26875,0.1221,0.0915,0.0726,0.0505,0.0531,0.0332,0.0323,0.0256,1.078745,0.987361,0.467522,0.155131,-0.206691
4,229,1011,2.278235,0.1221,0.0909,0.0726,0.0517,0.0531,0.0332,0.0323,0.0255,1.078745,0.993273,0.464948,0.155131,-0.217903


In [4]:
coords_x = []
coords_y = []
for x, y in zip(df.x, df.y):
    coord_x, coord_y = get_coord_from_pixel_pos(x, y)
    coords_x.append(coord_x)
    coords_y.append(coord_y)

df['global_x'] = coords_x
df['global_y'] = coords_y

In [35]:
Kbins = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')

In [36]:
Kbins.fit(df[['global_x', 'global_y']])

KBinsDiscretizer(encode='ordinal', n_bins=20, strategy='uniform')

In [37]:
kbins_coords = Kbins.transform(df[['global_x', 'global_y']])
kbins_coords_df = pd.DataFrame(kbins_coords, columns=['kbins_coords_x','kbins_coords_y'])

In [38]:
df.drop(['global_x', 'global_y', 'x', 'y'], axis=1, inplace=True)

In [39]:
df2 = pd.concat([df, kbins_coords_df], axis=1)

In [40]:
features = df2.drop('z', axis=1)
target = df2.z

X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.3, random_state=42)

In [41]:
features

Unnamed: 0,b0,b1,b2,b3,b4,b5,b6,b7,b2b4,b3b4,ndwi15,ndwi24,ndwi53,kbins_coords_x,kbins_coords_y
0,0.1225,0.0919,0.0721,0.0516,0.0528,0.0337,0.0311,0.0256,1.078543,0.994204,0.463376,0.154524,-0.209848,0.0,7.0
1,0.1225,0.0920,0.0712,0.0512,0.0528,0.0337,0.0311,0.0255,1.075376,0.992242,0.463803,0.148387,-0.206125,0.0,7.0
2,0.1221,0.0917,0.0720,0.0508,0.0531,0.0332,0.0323,0.0255,1.076655,0.988852,0.468375,0.151079,-0.209524,0.0,7.0
3,0.1221,0.0915,0.0726,0.0505,0.0531,0.0332,0.0323,0.0256,1.078745,0.987361,0.467522,0.155131,-0.206691,0.0,7.0
4,0.1221,0.0909,0.0726,0.0517,0.0531,0.0332,0.0323,0.0255,1.078745,0.993273,0.464948,0.155131,-0.217903,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19692,0.1267,0.1030,0.0808,0.0558,0.0475,0.0308,0.0280,0.0215,1.137603,1.041713,0.539611,0.259548,-0.288684,19.0,19.0
19693,0.1267,0.1016,0.0813,0.0554,0.0482,0.0295,0.0288,0.0225,1.134900,1.035925,0.549962,0.255598,-0.305065,19.0,19.0
19694,0.1267,0.1010,0.0801,0.0541,0.0482,0.0295,0.0288,0.0232,1.131063,1.029797,0.547893,0.248636,-0.294258,19.0,19.0
19695,0.1267,0.1012,0.0798,0.0553,0.0482,0.0295,0.0288,0.0230,1.130095,1.035458,0.548585,0.246875,-0.304245,19.0,19.0


In [42]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

In [63]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [79]:
p_rf = rf.predict(X_val)
print('rf r2:', r2_score(y_val, p_rf))
print('rf mae:', mean_absolute_error(y_val, p_rf))
print('rf mse:', mean_squared_error(y_val, p_rf))
print('rf rmse:', np.sqrt(mean_squared_error(y_val, p_rf)))

rf r2: 0.9199488786691833
rf mae: 0.7818013562326143
rf mse: 1.6372744508181314
rf rmse: 1.2795602568140867


In [81]:
jb.dump(rf, './models/rf_binscoords_20.pkl.z')

['./models/rf_binscoords_20.pkl.z']

# Split before kbins

In [52]:
X_train, X_val, y_train, y_val = train_test_split(df.drop(['z', 'x', 'y'], axis=1), df.z, test_size=0.3, random_state=42)

In [53]:
Kbins = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
Kbins.fit(X_train[['global_x', 'global_y']])

KBinsDiscretizer(encode='ordinal', n_bins=20, strategy='uniform')

In [58]:
kbins_coords_train = Kbins.transform(X_train[['global_x', 'global_y']])
kbins_coords_df_train = pd.DataFrame(kbins_coords_train, columns=['kbins_coords_x','kbins_coords_y'], index=X_train.index)

kbins_coords_val = Kbins.transform(X_val[['global_x', 'global_y']])
kbins_coords_df_val = pd.DataFrame(kbins_coords_val, columns=['kbins_coords_x','kbins_coords_y'], index=X_val.index)

In [59]:
X_train2 = pd.concat([X_train, kbins_coords_df_train], axis=1).drop(['global_x', 'global_y'], axis=1)
X_val2 = pd.concat([X_val, kbins_coords_df_val], axis=1).drop(['global_x', 'global_y'], axis=1)

In [62]:
rf = RandomForestRegressor()
rf.fit(X_train2, y_train)

RandomForestRegressor()

In [69]:
p_rf = rf.predict(X_val2)
print('rf r2:', r2_score(y_val, p_rf))
print('rf mae:', mean_absolute_error(y_val, p_rf))
print('rf mse:', mean_squared_error(y_val, p_rf))
print('rf rmse:', np.sqrt(mean_squared_error(y_val, p_rf)))

rf r2: 0.9193636148324866
rf mae: 0.7856889615562174
rf mse: 1.6492447706697564
rf rmse: 1.2842292516018143
