In [1]:
import pandas as pd
import numpy as np
import joblib as jb
import tifffile
from pathlib import Path
from glob import glob
import matplotlib.pyplot as plt
from satellite_bathymetry.preprocessing import get_coord_from_pixel_pos, get_pixel_from_coord, ndwi, pixel_ndwi, pixel_log_ratio
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

import cv2
from scipy import stats

In [2]:
# Load the pre-generated sample table from a compressed joblib dump.
# NOTE(review): joblib dumps are pickle-based as far as I know, so loading one
# can execute arbitrary code -- only load files from a trusted source.
df_data = jb.load('../data/generated/df_newimages_bands_downside.pkl.z')
# Preview the first three rows (x, y, z, b1..b8 and the derived columns).
df_data.head(3)

Unnamed: 0,x,y,z,b1,b2,b3,b4,b5,b6,b7,b8,b2b4,b3b4,ndwi15,ndwi24,ndwi53,cspmb7
0,233,1130,3.195862,0.1199,0.0866,0.0667,0.0464,0.049,0.0316,0.0283,0.0238,1.162614,1.094573,0.419775,0.302256,-0.152982,23.382784
1,233,1131,3.27303,0.1199,0.088,0.0668,0.0457,0.049,0.0316,0.0283,0.0237,1.171434,1.099318,0.419775,0.31638,-0.153713,23.382784
2,233,1132,3.299687,0.1199,0.0879,0.0666,0.0461,0.0488,0.0324,0.0281,0.0238,1.168473,1.096035,0.421458,0.31194,-0.154246,23.158824


In [3]:
# Keep only the raw band columns (b1..b8): the pixel position, the target `z`
# and every derived ratio/index column are removed from the feature table.
_non_band_columns = [
    'x', 'y', 'z',
    'b2b4', 'b3b4',
    'ndwi15', 'ndwi24', 'ndwi53',
    'cspmb7',
]
df_features = df_data.drop(columns=_non_band_columns)
df_features.head(3)

Unnamed: 0,b1,b2,b3,b4,b5,b6,b7,b8
0,0.1199,0.0866,0.0667,0.0464,0.049,0.0316,0.0283,0.0238
1,0.1199,0.088,0.0668,0.0457,0.049,0.0316,0.0283,0.0237
2,0.1199,0.0879,0.0666,0.0461,0.0488,0.0324,0.0281,0.0238


In [4]:
# Partition the samples into 8 clusters using the columns currently held in
# df_features.
# n_init is pinned to 10 (the long-standing scikit-learn default). Newer
# releases default to n_init='auto', which runs a single k-means++
# initialisation and can therefore produce a different partition for the same
# random_state. Pinning keeps the clustering reproducible across versions.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=0).fit(df_features)

In [5]:
# Number of samples assigned to each cluster label, largest cluster first.
pd.Series(kmeans.labels_).value_counts()


7    4159
6    3882
0    3648
3    2568
5    2366
1    1253
4    1014
2     397
dtype: int64

In [6]:
# Attach the hard k-means assignment of every sample as a `cluster` column.
df_features = df_features.assign(cluster=kmeans.labels_)
df_features.head(3)

Unnamed: 0,b1,b2,b3,b4,b5,b6,b7,b8,cluster
0,0.1199,0.0866,0.0667,0.0464,0.049,0.0316,0.0283,0.0238,1
1,0.1199,0.088,0.0668,0.0457,0.049,0.0316,0.0283,0.0237,1
2,0.1199,0.0879,0.0666,0.0461,0.0488,0.0324,0.0281,0.0238,1


In [7]:
# Soft cluster membership for every sample.
#
# For each sample the distance to every k-means centroid is inverted (closer
# centroid -> larger value) and the inverses are normalised so that each row
# sums to 1. Column `norm_dist_to_<k>` is therefore the weight of cluster k.
#
# (An earlier experiment that min-max scaled the raw distances with
# MinMaxScaler was left here as commented-out code; it has been removed.)

# Only the band columns are passed to the clusterer. Filtering by name (instead
# of dropping just 'cluster') keeps this cell re-runnable after the
# `norm_dist_to_*` columns have been appended to df_features.
band_cols = [
    col for col in df_features.columns
    if col != 'cluster' and not str(col).startswith('norm_dist_to_')
]
centroid_dist = kmeans.transform(df_features[band_cols])

# A sample lying exactly on a centroid has distance 0; 1/0 would give inf and,
# after normalisation, NaN weights. Clamping to machine epsilon keeps the
# weights finite and assigns (almost) all the weight to that centroid.
inv_dist = 1.0 / np.maximum(centroid_dist, np.finfo(centroid_dist.dtype).eps)
weights = inv_dist / inv_dist.sum(axis=1, keepdims=True)

# Reuse df_features' index so that the later column-wise concat lines rows up
# by label even when the index is not a plain 0..n-1 range.
df_dist = pd.DataFrame(
    weights,
    index=df_features.index,
    columns=[f'norm_dist_to_{k}' for k in range(weights.shape[1])],
)
df_dist



Unnamed: 0,norm_dist_to_0,norm_dist_to_1,norm_dist_to_2,norm_dist_to_3,norm_dist_to_4,norm_dist_to_5,norm_dist_to_6,norm_dist_to_7
0,0.139202,0.273595,0.053607,0.103411,0.085760,0.109301,0.105131,0.129993
1,0.138811,0.278693,0.052129,0.103677,0.083938,0.108791,0.103065,0.130896
2,0.137845,0.277126,0.053004,0.103544,0.085273,0.109619,0.102975,0.130613
3,0.135032,0.292345,0.050069,0.101692,0.082089,0.109484,0.096739,0.132551
4,0.135753,0.284830,0.052010,0.102787,0.084296,0.109515,0.099970,0.130839
...,...,...,...,...,...,...,...,...
19282,0.080363,0.113086,0.057635,0.076994,0.122502,0.361409,0.056214,0.131796
19283,0.072224,0.106055,0.052007,0.068500,0.115031,0.416236,0.050242,0.119706
19284,0.066391,0.101995,0.044925,0.062679,0.099794,0.465314,0.045480,0.113422
19285,0.064920,0.099478,0.045298,0.060937,0.102142,0.473311,0.044729,0.109184


In [8]:
# Spot-check the cluster weights of the row at position 6383, which is also the
# first sample listed in the validation outputs further down.
df_dist.iloc[6383]


norm_dist_to_0    0.350297
norm_dist_to_1    0.101664
norm_dist_to_2    0.026928
norm_dist_to_3    0.134423
norm_dist_to_4    0.042849
norm_dist_to_5    0.073909
norm_dist_to_6    0.098462
norm_dist_to_7    0.171468
Name: 6383, dtype: float64

In [9]:
# Append the per-cluster weights to the feature table.
# Any weight columns already present are dropped first, so re-running this cell
# replaces them instead of appending a duplicate set of `norm_dist_to_*`
# columns (which would later make X_val['norm_dist_to_<k>'] a DataFrame).
df_features = pd.concat(
    [df_features.drop(columns=df_dist.columns, errors='ignore'), df_dist],
    axis=1,
)
df_features

Unnamed: 0,b1,b2,b3,b4,b5,b6,b7,b8,cluster,norm_dist_to_0,norm_dist_to_1,norm_dist_to_2,norm_dist_to_3,norm_dist_to_4,norm_dist_to_5,norm_dist_to_6,norm_dist_to_7
0,0.1199,0.0866,0.0667,0.0464,0.0490,0.0316,0.0283,0.0238,1,0.139202,0.273595,0.053607,0.103411,0.085760,0.109301,0.105131,0.129993
1,0.1199,0.0880,0.0668,0.0457,0.0490,0.0316,0.0283,0.0237,1,0.138811,0.278693,0.052129,0.103677,0.083938,0.108791,0.103065,0.130896
2,0.1199,0.0879,0.0666,0.0461,0.0488,0.0324,0.0281,0.0238,1,0.137845,0.277126,0.053004,0.103544,0.085273,0.109619,0.102975,0.130613
3,0.1199,0.0882,0.0692,0.0452,0.0488,0.0324,0.0281,0.0232,1,0.135032,0.292345,0.050069,0.101692,0.082089,0.109484,0.096739,0.132551
4,0.1196,0.0884,0.0677,0.0454,0.0489,0.0323,0.0285,0.0238,1,0.135753,0.284830,0.052010,0.102787,0.084296,0.109515,0.099970,0.130839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19282,0.1267,0.1030,0.0808,0.0558,0.0475,0.0308,0.0280,0.0215,5,0.080363,0.113086,0.057635,0.076994,0.122502,0.361409,0.056214,0.131796
19283,0.1267,0.1016,0.0813,0.0554,0.0482,0.0295,0.0288,0.0225,5,0.072224,0.106055,0.052007,0.068500,0.115031,0.416236,0.050242,0.119706
19284,0.1267,0.1010,0.0801,0.0541,0.0482,0.0295,0.0288,0.0232,5,0.066391,0.101995,0.044925,0.062679,0.099794,0.465314,0.045480,0.113422
19285,0.1267,0.1012,0.0798,0.0553,0.0482,0.0295,0.0288,0.0230,5,0.064920,0.099478,0.045298,0.060937,0.102142,0.473311,0.044729,0.109184


In [10]:
# Hold out 30% of the samples for validation, then fit one random-forest
# regressor per k-means cluster, each trained only on the training samples
# assigned to that cluster and only on the band columns.
SEED = 42  # shared seed so the split and the forests are reproducible
X_train, X_val, y_train, y_val = train_test_split(df_features, df_data['z'], test_size=0.3, random_state=SEED)

# Derive the cluster count from the fitted clusterer rather than hard-coding 8.
n_clusters = kmeans.n_clusters
# Columns that are cluster metadata, not model inputs.
non_band_cols = ['cluster'] + [f'norm_dist_to_{k}' for k in range(n_clusters)]

models = list()
for i in range(n_clusters):
    # X_train and y_train come from the same split, so they are aligned row by
    # row; a positional boolean mask selects matching rows from both.
    in_cluster = (X_train['cluster'] == i).to_numpy()
    X_train_model = X_train.loc[in_cluster]
    y_train_model = y_train.loc[in_cluster]
    # Without a fixed random_state every run grows different forests and the
    # reported score cannot be reproduced.
    obj = RandomForestRegressor(random_state=SEED)
    obj.fit(X_train_model.drop(columns=non_band_cols), y_train_model)
    models.append(obj)

In [11]:
# Weighted prediction of every per-cluster model on the validation set.
# Entry k is model k's prediction for each validation sample multiplied by that
# sample's weight for cluster k (`norm_dist_to_<k>`); the result is a Series
# named after the weight column and indexed like X_val.
weight_cols = [f'norm_dist_to_{k}' for k in range(len(models))]

# The model inputs are the same for every model, so strip the metadata columns
# once instead of rebuilding the frame on each loop iteration.
X_val_bands = X_val.drop(columns=weight_cols + ['cluster'])

predicts = [
    model.predict(X_val_bands) * X_val[weight_col]
    for model, weight_col in zip(models, weight_cols)
]

In [12]:
# Weighted contribution of the cluster-0 model for every validation sample
# (model 0's prediction multiplied by the sample's norm_dist_to_0 weight).
predicts[0]

6383     1.777944
10019    2.915734
8380     1.123728
14471    1.513388
13091    0.676814
           ...   
13345    2.118301
11465    3.268416
4424     0.777653
17965    1.573796
17246    0.515130
Name: norm_dist_to_0, Length: 5787, dtype: float64

In [13]:
# One column per cluster model (named after its weight column), one row per
# validation sample; the final estimate is the row-wise sum of the weighted
# contributions.
df_predicts = pd.DataFrame({contribution.name: contribution for contribution in predicts})
df_predicts['predict'] = df_predicts.sum(axis=1)
df_predicts

Unnamed: 0,norm_dist_to_0,norm_dist_to_1,norm_dist_to_2,norm_dist_to_3,norm_dist_to_4,norm_dist_to_5,norm_dist_to_6,norm_dist_to_7,predict
6383,1.777944,0.510693,0.204141,1.319011,0.349757,0.701966,0.616670,1.451705,6.931886
10019,2.915734,0.334282,0.185505,1.505665,0.314953,0.639546,0.576252,1.568969,8.040906
8380,1.123728,0.450355,0.206945,1.003573,0.258676,0.950463,0.378080,3.077433,7.449253
14471,1.513388,0.334023,0.221275,2.417544,0.380178,0.779108,0.576107,1.842491,8.064114
13091,0.676814,0.537819,0.513512,1.189653,1.053124,1.265705,0.337089,1.374985,6.948699
...,...,...,...,...,...,...,...,...,...
13345,2.118301,0.294695,0.174478,1.568563,0.290399,0.564368,0.550692,0.906332,6.467828
11465,3.268416,0.320159,0.183029,1.643962,0.307964,0.610769,0.581213,1.168536,8.084048
4424,0.777653,0.341394,0.219417,1.553533,0.348215,0.601772,0.529037,0.824878,5.195899
17965,1.573796,0.328585,0.249756,5.193611,0.297520,0.758380,1.296964,1.719806,11.418418


In [14]:
# Held-out target values (`z`) for comparison with the predictions above.
y_val

6383      2.995000
10019     6.083214
8380      4.624286
14471     9.088077
13091     4.519737
           ...    
13345     6.867500
11465    11.419600
4424      1.993077
17965    14.831923
17246     8.700000
Name: z, Length: 5787, dtype: float64

In [15]:
# Coefficient of determination of the combined prediction on the held-out set.
r2_score(y_true=y_val, y_pred=df_predicts['predict'])

0.525624424961287