In [120]:
import pandas as pd
import numpy as np
import joblib as jb
import tifffile
from pathlib import Path
from glob import glob
import matplotlib.pyplot as plt
from satellite_bathymetry.preprocessing import get_coord_from_pixel_pos, get_pixel_from_coord, ndwi, pixel_ndwi, pixel_log_ratio
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

import cv2
from scipy import stats

In [132]:
# Load the prepared dataset and give it a fresh 0..n-1 index, discarding
# whatever index the stored frame carried.
# NOTE(review): joblib files are pickle-based; only load files you trust.
path = './generated/dataset_dataframe.pkl.z'
df = jb.load(path).reset_index(drop=True)
df

Unnamed: 0,x,y,z,b1,b2,b3,b4,b5,b6,b7,b8,cspmb7,b2b4,b3b4,ndwi15,ndwi24,ndwi53
0,233,1130,3.195862,0.1199,0.0887,0.0692,0.0483,0.0518,0.0328,0.0315,0.0252,27.041417,1.156761,1.092734,0.396622,0.294890,-0.143802
1,233,1131,3.273030,0.1199,0.0886,0.0691,0.0484,0.0519,0.0335,0.0317,0.0254,27.274666,1.155853,1.091779,0.395809,0.293431,-0.142149
2,233,1132,3.299687,0.1199,0.0886,0.0690,0.0485,0.0519,0.0336,0.0320,0.0255,27.625527,1.155238,1.090825,0.395809,0.292487,-0.141439
3,233,1133,3.268182,0.1199,0.0885,0.0689,0.0484,0.0518,0.0336,0.0321,0.0256,27.742739,1.155562,1.091031,0.396622,0.292915,-0.141674
4,233,1134,3.278125,0.1199,0.0884,0.0688,0.0482,0.0517,0.0336,0.0321,0.0257,27.742739,1.156505,1.091822,0.397436,0.294290,-0.141909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18270,2293,86,2.264000,0.1262,0.1006,0.0789,0.0543,0.0466,0.0285,0.0270,0.0220,21.937274,1.154368,1.093542,0.460648,0.298903,-0.257371
18271,2294,86,1.909714,0.1256,0.1002,0.0785,0.0536,0.0452,0.0281,0.0267,0.0217,21.607165,1.157130,1.095829,0.470726,0.302991,-0.269200
18272,2295,86,1.539000,0.1256,0.0998,0.0782,0.0528,0.0447,0.0280,0.0265,0.0215,21.387827,1.160508,1.099019,0.475044,0.307995,-0.272579
18273,2295,87,1.360263,0.1256,0.1000,0.0783,0.0533,0.0454,0.0281,0.0267,0.0217,21.607165,1.158261,1.096735,0.469006,0.304631,-0.265966


In [135]:
# Names of the eight band columns ('b1' .. 'b8') used as model inputs.
columns = [f'b{band}' for band in range(1, 9)]

In [95]:
# Independent copy of the band columns, so later column additions never touch `df`.
df_features = df.loc[:, columns].copy()

In [96]:
# Partition the samples into two groups using the band values only.
# - `n_init` is pinned because scikit-learn's default for it differs between
#   releases; fixing it keeps the clustering stable across versions.
# - Fitting on `df_features[columns]` (rather than the whole frame) keeps this
#   cell re-runnable after later cells have appended the `cluster` and
#   `norm_dist_to_*` columns to `df_features`.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(df_features[columns])

In [97]:
# How many samples were assigned to each cluster label.
cluster_sizes = pd.Series(kmeans.labels_).value_counts()
cluster_sizes

0    12123
1     6152
dtype: int64

In [98]:
# Attach each sample's hard cluster assignment as a `cluster` column.
df_features = df_features.assign(cluster=kmeans.labels_)

In [99]:
# Convert each sample's distance to every centroid into a normalized
# inverse-distance weight: closer centroids get larger weights and the weights
# of one sample sum to 1. Columns are named `norm_dist_to_<cluster index>`.
weight_columns = [f'norm_dist_to_{i}' for i in range(kmeans.n_clusters)]

# Distances are computed from the band columns only, so the cell still works
# after `cluster` / `norm_dist_to_*` columns have been added to `df_features`.
# The index is copied from `df_features` so a later column-wise concat aligns
# row-for-row instead of relying on both frames having a default index.
df_dist = pd.DataFrame(
    kmeans.transform(df_features[columns]),
    columns=weight_columns,
    index=df_features.index,
)

# A sample lying exactly on a centroid has distance 0; without the floor the
# reciprocal is inf and the normalization below would yield NaN weights.
df_dist = 1 / df_dist.clip(lower=np.finfo(float).eps)

# Row-wise normalization so that each sample's weights add up to 1.
df_dist = df_dist.div(df_dist.sum(axis=1), axis=0)

In [100]:
# Append the per-cluster weight columns to the feature frame.
# Any weight columns left over from a previous run of this cell are dropped
# first; otherwise re-running would create duplicate `norm_dist_to_*` columns
# and selecting one of them later would return a DataFrame instead of a Series.
df_features = pd.concat(
    [df_features.drop(columns=df_dist.columns, errors='ignore'), df_dist],
    axis=1,
)

In [101]:
# Hold out 30% of the samples for validation; the split is seeded so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(df_features, df['z'], test_size=0.3, random_state=42)

# Train one random forest per cluster, each on the training rows assigned to
# that cluster and using only the band columns as inputs.
models = list()
for i in range(kmeans.n_clusters):
    X_train_model = X_train[X_train['cluster'] == i]
    # Targets for exactly the rows selected above, looked up by index label.
    y_train_model = y_train.loc[X_train_model.index]
    # Seeded so that repeated runs give the same forest and the same metrics.
    obj = RandomForestRegressor(random_state=42)
    obj.fit(X_train_model[columns], y_train_model)
    models.append(obj)

In [102]:
# Model inputs for the validation rows: everything except the cluster label and
# the weight columns. Built once because it is the same for every model.
X_val_bands = X_val.drop(['norm_dist_to_0', 'norm_dist_to_1', 'cluster'], axis=1)

# Every cluster model predicts all validation rows; each prediction is scaled
# by that row's weight for the corresponding cluster.
predicts = [
    models[k].predict(X_val_bands) * X_val[f'norm_dist_to_{k}']
    for k in range(2)
]

In [103]:
# Lay the weighted per-model predictions out as one column per model.
df_predicts = pd.DataFrame(predicts).T
# The blended estimate is the row-wise total of the weighted predictions.
df_predicts = df_predicts.assign(predict=df_predicts.sum(axis=1))

In [104]:
# R^2 of the blended two-cluster prediction against the held-out targets.
r2_2c = r2_score(y_true=y_val, y_pred=df_predicts['predict'])
r2_2c

0.8452971199934864

In [139]:
# Sweep over the number of clusters. For each setting:
#   1. cluster the samples on the band values,
#   2. derive normalized inverse-distance weights to every centroid,
#   3. train one random forest per cluster on that cluster's training rows,
#   4. blend the per-cluster predictions with the weights and score the blend
#      on the held-out 30%.
# Results are stored in `r2_dict`, keyed by the cluster count as a string.
columns = ['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8']
r2_dict = dict()
for cluster_number in range(2, 10, 2):
    df_features = df[columns].copy()
    # NOTE(review): the clustering is fitted on all rows, i.e. before the
    # train/validation split below — confirm this is intended.
    # `n_init` is pinned because scikit-learn's default differs between releases.
    kmeans = KMeans(n_clusters=cluster_number, n_init=10, random_state=0).fit(df_features)
    df_features['cluster'] = kmeans.labels_

    # One weight column per cluster, named after the cluster index.
    weight_columns = [f'norm_dist_to_{i}' for i in range(cluster_number)]
    df_dist = pd.DataFrame(
        kmeans.transform(df_features[columns]),
        columns=weight_columns,
        index=df_features.index,  # keeps the concat below aligned row-for-row
    )
    # Floor the distances so a sample sitting exactly on a centroid does not
    # produce inf (and then NaN after normalization).
    df_dist = 1 / df_dist.clip(lower=np.finfo(float).eps)
    # Normalize so each sample's weights sum to 1.
    df_dist = df_dist.div(df_dist.sum(axis=1), axis=0)

    df_features = pd.concat([df_features, df_dist], axis=1)

    X_train, X_val, y_train, y_val = train_test_split(df_features, df['z'], test_size=0.3, random_state=42)

    # Every non-band column of the feature frame (weights + cluster label).
    drop_list = weight_columns + ['cluster']

    models = list()
    for i in range(cluster_number):
        X_train_model = X_train[X_train['cluster'] == i]
        # Targets for exactly the rows selected above, looked up by index label.
        y_train_model = y_train.loc[X_train_model.index]
        # Seeded so that repeated runs give the same forests and metrics.
        obj = RandomForestRegressor(random_state=42)
        obj.fit(X_train_model[columns], y_train_model)
        models.append(obj)

    # Band-only validation inputs; identical for every model, so built once.
    X_val_bands = X_val[columns]
    predicts = list()
    for i in range(cluster_number):
        # Each model predicts all validation rows, scaled by the row's weight
        # for that model's cluster.
        predict = models[i].predict(X_val_bands) * X_val[f'norm_dist_to_{i}']
        predicts.append(predict)

    df_predicts = pd.DataFrame(predicts).transpose()
    # Blended estimate: row-wise sum of the weighted per-model predictions.
    df_predicts['predict'] = df_predicts.sum(axis=1)

    # MSE is computed once and reused for RMSE.
    mse = mean_squared_error(y_val, df_predicts['predict'])
    result_dict = {
        'r2': r2_score(y_val, df_predicts['predict']),
        'mae': mean_absolute_error(y_val, df_predicts['predict']),
        'mse': mse,
        'rmse': np.sqrt(mse),
        # Mean prediction minus mean target: positive means over-estimation on average.
        'bias': df_predicts['predict'].mean() - y_val.mean(),
    }

    r2_dict[str(cluster_number)] = result_dict
r2_dict

{'2': {'r2': 0.871506393985028,
  'mae': 1.182586624394813,
  'mse': 2.592230881926908,
  'rmse': 1.6100406460480767,
  'bias': 0.10220399600218766},
 '4': {'r2': 0.7562321734871048,
  'mae': 1.750231180776629,
  'mse': 4.917773790497396,
  'rmse': 2.217605418125009,
  'bias': 0.6266397390927647},
 '6': {'r2': 0.7000209123636663,
  'mae': 1.8343346312412063,
  'mse': 6.051780154823857,
  'rmse': 2.4600366165616023,
  'bias': -0.09602301500612143},
 '8': {'r2': 0.6551015964615126,
  'mae': 2.0458236544085175,
  'mse': 6.957982739433596,
  'rmse': 2.6377988436257978,
  'bias': -0.060691367705141275}}