In [1]:
import pandas as pd
import numpy as np
import joblib as jb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from satellite_bathymetry.preprocessing import get_coord_from_pixel_pos, get_pixel_from_coord, ndwi, pixel_ndwi, pixel_log_ratio
from satellite_bathymetry.model_tunning import tune_tree_model
import xgboost as XGB
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from skopt import forest_minimize

In [2]:
median_filter_path = 'generated/dataset_dataframe.pkl.z'

In [3]:
train_path = 'generated/train_window_split_RAW.pkl.z'
val_path = 'generated/val_window_split_RAW.pkl.z'

train = jb.load(train_path)
val = jb.load(val_path)
#df['ndwi51'] =  df[['b5', 'b1']].apply(lambda x: pixel_ndwi(x.b5, x.b1), axis=1)
val.head()

Unnamed: 0,x,y,z,b1,b2,b3,b4,b5,b6,b7,b8,b2b4,b3b4,ndwi15,ndwi24,ndwi53,cspmb7,ndwi51
8,234,1003,2.23,0.1224,0.0917,0.0725,0.0509,0.0554,0.0345,0.0331,0.0264,1.149791,1.090009,0.376828,0.286115,-0.133698,28.922021,-0.376828
9,234,1004,2.229857,0.1224,0.0917,0.0734,0.0514,0.054,0.0336,0.0323,0.0264,1.146938,1.090436,0.387755,0.281621,-0.152276,27.977559,-0.387755
10,234,1005,2.229851,0.1224,0.0919,0.0732,0.0509,0.054,0.0336,0.0323,0.0265,1.150346,1.092454,0.387755,0.287115,-0.150943,27.977559,-0.387755
11,234,1006,2.23,0.1224,0.0925,0.0722,0.0505,0.0536,0.0329,0.0328,0.0257,1.154319,1.091145,0.390909,0.293706,-0.147854,28.566881,-0.390909
12,234,1007,2.23,0.1224,0.092,0.0717,0.0517,0.0536,0.0329,0.0328,0.0254,1.146075,1.082888,0.390909,0.280445,-0.144453,28.566881,-0.390909


In [4]:
df_median = jb.load(median_filter_path)

In [5]:
columns = ['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8']

In [6]:
X_train, X_val, y_train, y_val = train_test_split(df_median[columns], df_median.z, test_size=0.3, random_state=42)

In [7]:
xg = XGBRegressor()
xg.fit(X_train, y_train)
p_xg = xg.predict(X_val)
print('XGBoost Bands:')
print('R2 score:', r2_score(y_val, p_xg))
print('Mean Absolute Error:', mean_absolute_error(y_val, p_xg))
print('Mean Squared Error:', mean_squared_error(y_val, p_xg))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_val, p_xg)))
print('Bias:', p_xg.mean() - y_val.mean())

XGBoost Bands:
R2 score: 0.9113324866743583
Mean Absolute Error: 0.8632484611379012
Mean Squared Error: 1.7887790170634044
Root Mean Squared Error: 1.3374524354396324
Bias: 0.04071945245236286


In [8]:
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
lgbm_p = lgbm.predict(X_val)
print('XGBoost Bands:')
print('R2 score:', r2_score(y_val, lgbm_p))
print('Mean Absolute Error:', mean_absolute_error(y_val, lgbm_p))
print('Mean Squared Error:', mean_squared_error(y_val, lgbm_p))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_val, lgbm_p)))
print('Bias:', lgbm_p.mean() - y_val.mean())

XGBoost Bands:
R2 score: 0.8620653369304733
Mean Absolute Error: 1.1711619251743357
Mean Squared Error: 2.782694831175862
Root Mean Squared Error: 1.6681411304730371
Bias: 0.03185720183167984


## Bayesian Optimization

In [69]:
def tune_lgbm(args):
    ## Model Parameters
    print(args)
    lr = args[0]
    max_depth = args[1]
    min_child_samples = args[2]
    subsample = args[3]
    colsample_bytree = args[4]
    n_estimators = args[5]
      
    mdl = LGBMRegressor(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                        min_child_samples=min_child_samples, subsample=subsample,
                        colsample_bytree=colsample_bytree, bagging_freq=1, n_estimators=n_estimators,
                        random_state=0, class_weight='balanced', n_jobs=6)
    
    mdl.fit(X_train, y_train)
    
    p = mdl.predict(X_val)
    
    print(r2_score(y_val, p))
    
    return -r2_score(y_val, p) ## To maximize ap, return the negative to the minimize func

In [None]:
space_lgbm = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 10), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          (1,5), # min_df
          (1,5)] # ngram_range

res = forest_minimize(tune_lgbm, space_lgbm, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)
res['x']

In [10]:
# Results
args_lgbm = [0.06189835094365267, 9, 1, 0.8695551533271082, 0.6534274736020848, 976, 2, 1]

In [11]:
lr = args_lgbm[0]
max_depth = args_lgbm[1]
min_child_samples = args_lgbm[2]
subsample = args_lgbm[3]
colsample_bytree = args_lgbm[4]
n_estimators = args_lgbm[5]

mdl = LGBMRegressor(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                    min_child_samples=min_child_samples, subsample=subsample,
                    colsample_bytree=colsample_bytree, bagging_freq=1, n_estimators=n_estimators,
                    random_state=0, class_weight='balanced', n_jobs=6)

In [12]:
mdl.fit(X_train, y_train)



LGBMRegressor(bagging_freq=1, class_weight='balanced',
              colsample_bytree=0.6534274736020848,
              learning_rate=0.06189835094365267, max_depth=9,
              min_child_samples=1, n_estimators=976, n_jobs=6, num_leaves=512,
              random_state=0, subsample=0.8695551533271082)

In [13]:
p = mdl.predict(X_val)

In [14]:
r2_score(y_val, p)

0.94886132833864

## Cross Validation

In [15]:
from satellite_bathymetry.model_selection import cross_validation

In [16]:
features = df_median[columns]
target = df_median.z

In [17]:
lgbm = LGBMRegressor()

In [None]:
scores = cross_validation(features=features, target=target, metric=r2_score, mdl=lgbm, n_splits=10, random_state=42, learning_rate=args_lgbm[0], max_depth=args_lgbm[1],
                         min_child_samples=args_lgbm[2], subsample=args_lgbm[3], colsample_bytree=args_lgbm[4], n_estimatores=args_lgbm[5])

In [27]:
scores

{'k_fold_0': 0.8200959490579324,
 'k_fold_1': 0.8375784704161826,
 'k_fold_2': 0.8189095166479764,
 'k_fold_3': 0.8094356464582203,
 'k_fold_4': 0.8251823463786476,
 'k_fold_5': 0.8122298311875853,
 'k_fold_6': 0.8228085058473531,
 'k_fold_7': 0.8296706791578056,
 'k_fold_8': 0.8211683467538229,
 'k_fold_9': 0.8087238372082977}

# Less data

In [29]:
lr = args_lgbm[0]
max_depth = args_lgbm[1]
min_child_samples = args_lgbm[2]
subsample = args_lgbm[3]
colsample_bytree = args_lgbm[4]
n_estimators = args_lgbm[5]

mdl = LGBMRegressor(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                    min_child_samples=min_child_samples, subsample=subsample,
                    colsample_bytree=colsample_bytree, bagging_freq=1, n_estimators=n_estimators,
                    random_state=0, class_weight='balanced', n_jobs=6)

In [91]:
X_train, X_val, y_train, y_val = train_test_split(df_median[columns], df_median.z, test_size=.5, random_state=42)

In [92]:
len(X_train), len(X_val)

(9137, 9138)

In [93]:
mdl.fit(X_train, y_train)



LGBMRegressor(bagging_freq=1, class_weight='balanced',
              colsample_bytree=0.6534274736020848,
              learning_rate=0.06189835094365267, max_depth=9,
              min_child_samples=1, n_estimators=976, n_jobs=6, num_leaves=512,
              random_state=0, subsample=0.8695551533271082)

In [94]:
p = mdl.predict(X_val)

In [95]:
r2_score(y_val, p)

0.9341189556007882

In [63]:
train = X_train.copy()
val = X_val.copy()
train['x'] = df_median[df_median.index.isin(train.index)].x
train['y'] = df_median[df_median.index.isin(train.index)].y

val['x'] = df_median[df_median.index.isin(val.index)].x
val['y'] = df_median[df_median.index.isin(val.index)].y

In [40]:
import cv2

rgb_image = cv2.imread('../data/generated/rgb_image.jpg')
#image = rgb_image.copy()
for i, row in enumerate(train.itertuples()):
    x = int(row.x)
    y = int(row.y)
    rgb_image = cv2.circle(rgb_image ,(x,y),1,[255,0,0],1)
cv2.imwrite('generated/train_09_random.jpg', rgb_image)

True

In [41]:
import cv2

rgb_image = cv2.imread('../data/generated/rgb_image.jpg')
#image = rgb_image.copy()
for i, row in enumerate(val.itertuples()):
    x = int(row.x)
    y = int(row.y)
    rgb_image = cv2.circle(rgb_image ,(x,y),1,[255,0,0],1)
cv2.imwrite('generated/val_09_random.jpg', rgb_image)

True