In [160]:
import pandas as pd
import numpy as np
import joblib as jb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [161]:
path = '../data/generated/df_newimages_bands_downside.pkl.z'

In [162]:
df = jb.load(path)
df.head()

Unnamed: 0,x,y,z,b1,b2,b3,b4,b5,b6,b7,b8,b2b4,b3b4,ndwi15,ndwi24,ndwi53,cspmb7
0,233,1130,3.195862,0.1199,0.0866,0.0667,0.0464,0.049,0.0316,0.0283,0.0238,1.162614,1.094573,0.419775,0.302256,-0.152982,23.382784
1,233,1131,3.27303,0.1199,0.088,0.0668,0.0457,0.049,0.0316,0.0283,0.0237,1.171434,1.099318,0.419775,0.31638,-0.153713,23.382784
2,233,1132,3.299687,0.1199,0.0879,0.0666,0.0461,0.0488,0.0324,0.0281,0.0238,1.168473,1.096035,0.421458,0.31194,-0.154246,23.158824
3,233,1133,3.268182,0.1199,0.0882,0.0692,0.0452,0.0488,0.0324,0.0281,0.0232,1.175411,1.111754,0.421458,0.322339,-0.172881,23.158824
4,233,1134,3.278125,0.1196,0.0884,0.0677,0.0454,0.0489,0.0323,0.0285,0.0238,1.174645,1.104724,0.419585,0.321375,-0.161235,23.607309


# Only Bands baseline

In [163]:
only_bands_features = df[['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8']]
only_bands_target = df.z

In [164]:
X_train, X_val, y_train, y_val = train_test_split(only_bands_features, only_bands_target, test_size=0.3, random_state=42)

In [165]:
rf_onlybands_baseline = RandomForestRegressor()
lgbm_onlybands_baseline = LGBMRegressor()

rf_onlybands_baseline.fit(X_train, y_train)
lgbm_onlybands_baseline.fit(X_train, y_train)

LGBMRegressor()

In [166]:
rf_onlybands_baseline_pred = rf_onlybands_baseline.predict(X_val)
lgbm_onlybands_baseline_pred = lgbm_onlybands_baseline.predict(X_val)

In [167]:
print('RF Only Bands Metrics:')
print('R2 score:', r2_score(y_val, rf_onlybands_baseline_pred))
print('MAE:', r2_score(y_val, rf_onlybands_baseline_pred))
print('MSE:', mean_absolute_error(y_val, rf_onlybands_baseline_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_val, rf_onlybands_baseline_pred)))
print()


print('LGBM Only Bands Metrics:')
print('R2 score:', r2_score(y_val, lgbm_onlybands_baseline_pred))
print('MAE:', r2_score(y_val, lgbm_onlybands_baseline_pred))
print('MSE:', mean_absolute_error(y_val, lgbm_onlybands_baseline_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_val, lgbm_onlybands_baseline_pred)))

RF Only Bands Metrics:
R2 score: 0.7932104521945285
MAE: 0.7932104521945285
MSE: 1.3752402911463757
RMSE: 2.048792092869514

LGBM Only Bands Metrics:
R2 score: 0.7127882613393819
MAE: 0.7127882613393819
MSE: 1.7833939544397204
RMSE: 2.414541590867869


In [215]:
pred_onlybands_df = pd.DataFrame(index=X_val.index)
pred_onlybands_df['rf_pred'] = rf_onlybands_baseline_pred
pred_onlybands_df['lgbm_pred'] = lgbm_onlybands_baseline_pred

In [216]:
train_data = pd.concat([X_train, y_train, df[df.index.isin(X_train.index)][['x','y']]], axis=1).reset_index().drop('index', axis=1)
val_data = pd.concat([X_val, y_val, df[df.index.isin(X_val.index)][['x','y']], pred_onlybands_df], axis=1)

In [217]:
jb.dump(rf_onlybands_baseline, '../data/generated_baseline/only_bands/model_rf_onlybands_baseline.pkl.z')
jb.dump(lgbm_onlybands_baseline, '../data/generated_baseline/only_bands/model_lgbm_onlybands_baseline.pkl.z')

['../data/generated_baseline/only_bands/model_lgbm_onlybands_baseline.pkl.z']

In [218]:
jb.dump(train_data, '../data/generated_baseline/only_bands/train_data_onlybands_baseline.pkl.z')
jb.dump(val_data, '../data/generated_baseline/only_bands/val_data_onlybands_baseline.pkl.z')

['../data/generated_baseline/only_bands/val_data_onlybands_baseline.pkl.z']

In [219]:
jb.dump(rf_onlybands_baseline_pred, '../data/generated_baseline/only_bands/pred_rf_onlybands_baseline.pkl.z')
jb.dump(lgbm_onlybands_baseline_pred, '../data/generated_baseline/only_bands/pred_lgbm_onlybands_baseline.pkl.z')

['../data/generated_baseline/only_bands/pred_lgbm_onlybands_baseline.pkl.z']

# Bands and CSPM

In [222]:
bands_cspm_features = df[['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8', 'cspmb7']]
bands_cspm_target = df.z


X_train, X_val, y_train, y_val = train_test_split(bands_cspm_features, bands_cspm_target, test_size=0.3, random_state=42)

In [223]:
rf_bands_cspm_baseline = RandomForestRegressor()
lgbm_bands_cspm_baseline = LGBMRegressor()

rf_bands_cspm_baseline.fit(X_train, y_train)
lgbm_bands_cspm_baseline.fit(X_train, y_train)

LGBMRegressor()

In [224]:
pred_rf_bands_cspm_baseline = rf_bands_cspm_baseline.predict(X_val)
pred_lgbm_bands_cspm_baseline = lgbm_bands_cspm_baseline.predict(X_val)

In [225]:
print('RF Bands and CSPM Metrics:')
print('R2 score:', r2_score(y_val, pred_rf_bands_cspm_baseline))
print('MAE:', r2_score(y_val, pred_rf_bands_cspm_baseline))
print('MSE:', mean_absolute_error(y_val, pred_rf_bands_cspm_baseline))
print('RMSE:', np.sqrt(mean_squared_error(y_val, pred_rf_bands_cspm_baseline)))
print()


print('LGBM Bands and CSPM Metrics:')
print('R2 score:', r2_score(y_val, pred_lgbm_bands_cspm_baseline))
print('MAE:', r2_score(y_val, pred_lgbm_bands_cspm_baseline))
print('MSE:', mean_absolute_error(y_val, pred_lgbm_bands_cspm_baseline))
print('RMSE:', np.sqrt(mean_squared_error(y_val, pred_lgbm_bands_cspm_baseline)))

RF Bands and CSPM Metrics:
R2 score: 0.7954639875979043
MAE: 0.7954639875979043
MSE: 1.3710283784635608
RMSE: 2.0375979262824737

LGBM Bands and CSPM Metrics:
R2 score: 0.7127882613393819
MAE: 0.7127882613393819
MSE: 1.7833939544397204
RMSE: 2.414541590867869


In [226]:
pred_bands_cspm_df = pd.DataFrame(index=X_val.index)
pred_bands_cspm_df['rf_pred'] = pred_rf_bands_cspm_baseline
pred_bands_cspm_df['lgbm_pred'] = pred_lgbm_bands_cspm_baseline

In [227]:
train_data = pd.concat([X_train, y_train, df[df.index.isin(X_train.index)][['x','y', 'z']]], axis=1)
val_data = pd.concat([X_val, y_val, df[df.index.isin(X_val.index)][['x','y']], pred_bands_cspm_df], axis=1)

In [230]:
jb.dump(rf_bands_cspm_baseline, '../data/generated_baseline/bands_cspm/model_rf_bands_cspm_baseline.pkl.z')
jb.dump(lgbm_bands_cspm_baseline, '../data/generated_baseline/bands_cspm/model_lgbm_bands_cspm_baseline.pkl.z')

['../data/generated_baseline/bands_cspm/model_lgbm_bands_cspm_baseline.pkl.z']

In [231]:
jb.dump(train_data, '../data/generated_baseline/bands_cspm/train_data_bands_cspm_baseline.pkl.z')
jb.dump(val_data, '../data/generated_baseline/bands_cspm/val_data_bands_cspm_baseline.pkl.z')

['../data/generated_baseline/bands_cspm/val_data_bands_cspm_baseline.pkl.z']

In [232]:
jb.dump(pred_rf_bands_cspm_baseline, '../data/generated_baseline/bands_cspm/pred_rf_bands_cspm_baseline.pkl.z')
jb.dump(pred_lgbm_bands_cspm_baseline, '../data/generated_baseline/bands_cspm/pred_lgbm_bands_cspm_baseline.pkl.z')

['../data/generated_baseline/bands_cspm/pred_lgbm_bands_cspm_baseline.pkl.z']

# Bands, NDWI, Log Ratio and CSPM

In [233]:
'''
Used Features:
bands - (1 ~ 8)
ndwi - [(1,5), (2,4), (5,3)]
cspm - band 7
log ration - [(b2b4), (b3,b4)]
'''

'\nUsed Features:\nbands - (1 ~ 8)\nndwi - [(1,5), (2,4), (5,3)]\ncspm - band 7\nlog ration - [(b2b4), (b3,b4)]\n'

In [243]:
features = df.drop(['x','y','z'], axis=1)
target = df.z

X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.3, random_state=42)

In [244]:
rf_baseline = RandomForestRegressor()
lgbm_baseline = LGBMRegressor()

rf_baseline.fit(X_train, y_train)
lgbm_baseline.fit(X_train, y_train)

LGBMRegressor()

In [245]:
rf_baseline_pred = rf_baseline.predict(X_val)
lgbm_baseline_pred = lgbm_baseline.predict(X_val)

In [246]:
print('RF Metrics:')
print('R2 score:', r2_score(y_val, rf_baseline_pred))
print('MAE:', r2_score(y_val, rf_baseline_pred))
print('MSE:', mean_absolute_error(y_val, rf_baseline_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_val, rf_baseline_pred)))
print()


print('LGBM Metrics:')
print('R2 score:', r2_score(y_val, lgbm_baseline_pred))
print('MAE:', r2_score(y_val, lgbm_baseline_pred))
print('MSE:', mean_absolute_error(y_val, lgbm_baseline_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_val, lgbm_baseline_pred)))

RF Metrics:
R2 score: 0.7693323387348885
MAE: 0.7693323387348885
MSE: 1.4580140488472466
RMSE: 2.1638490132323045

LGBM Metrics:
R2 score: 0.7138790943384447
MAE: 0.7138790943384447
MSE: 1.7614190315809442
RMSE: 2.4099520025805754


In [247]:
pred_ndwi_logratio_df = pd.DataFrame(index=X_val.index)
pred_ndwi_logratio_df['rf_pred'] = rf_baseline_pred
pred_ndwi_logratio_df['lgbm_pred'] = lgbm_baseline_pred

In [249]:
train_data = pd.concat([X_train, y_train, df[df.index.isin(X_train.index)][['x','y']]], axis=1)
val_data = pd.concat([X_val, y_val, df[df.index.isin(X_val.index)][['x','y']], pred_ndwi_logratio_df], axis=1)

In [250]:
jb.dump(rf_baseline, '../data/generated_baseline/ndwi_and_log/model_rf_ndwi_log_baseline.pkl.z')
jb.dump(lgbm_baseline, '../data/generated_baseline/ndwi_and_log/model_lgbm_ndwi_log_baseline.pkl.z')

['../data/generated_baseline/ndwi_and_log/model_lgbm_ndwi_log_baseline.pkl.z']

In [251]:
jb.dump(train_data, '../data/generated_baseline/ndwi_and_log/train_data_ndwi_log_baseline.pkl.z')
jb.dump(val_data, '../data/generated_baseline/ndwi_and_log/val_data_ndwi_log_baseline.pkl.z')

['../data/generated_baseline/ndwi_and_log/val_data_ndwi_log_baseline.pkl.z']

In [252]:
jb.dump(rf_baseline_pred, '../data/generated_baseline/ndwi_and_log/pred_rf_ndwi_log_baseline.pkl.z')
jb.dump(lgbm_baseline_pred, '../data/generated_baseline/ndwi_and_log/pred_lgbm_ndwi_log_baseline.pkl.z')

['../data/generated_baseline/ndwi_and_log/pred_lgbm_ndwi_log_baseline.pkl.z']