In [16]:
import pandas as pd
import numpy as np
import joblib as jb
import tifffile
from pathlib import Path
from glob import glob
import matplotlib.pyplot as plt
from satellite_bathymetry.preprocessing import get_coord_from_pixel_pos, get_pixel_from_coord, ndwi, pixel_ndwi, pixel_log_ratio
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

import cv2
from scipy import stats

In [17]:
# Load the pre-generated table of pixel positions, depth (z) and band-derived
# columns. joblib.load unpickles its input, so only point this at a file from
# a trusted source.
bands_table_path = '../data/generated/df_newimages_bands_downside.pkl.z'
df_data = jb.load(bands_table_path)
df_data.head(3)

Unnamed: 0,x,y,z,b1,b2,b3,b4,b5,b6,b7,b8,b2b4,b3b4,ndwi15,ndwi24,ndwi53,cspmb7
0,233,1130,3.195862,0.1199,0.0866,0.0667,0.0464,0.049,0.0316,0.0283,0.0238,1.162614,1.094573,0.419775,0.302256,-0.152982,23.382784
1,233,1131,3.27303,0.1199,0.088,0.0668,0.0457,0.049,0.0316,0.0283,0.0237,1.171434,1.099318,0.419775,0.31638,-0.153713,23.382784
2,233,1132,3.299687,0.1199,0.0879,0.0666,0.0461,0.0488,0.0324,0.0281,0.0238,1.168473,1.096035,0.421458,0.31194,-0.154246,23.158824


In [18]:
# Build the model-input table by removing the pixel coordinates, the target
# depth `z`, and the derived ratio / index columns from the loaded data.
# NOTE(review): the saved output of this cell lists b1, b5, b6, b8 and cspmb7,
# which this drop list cannot produce (cspmb7 is dropped here) -- the output
# looks stale. Confirm the intended feature set and re-run.
non_feature_cols = ['x','y','z','b2b4','b3b4','ndwi15','ndwi24','ndwi53','cspmb7']
df_features = df_data.drop(columns=non_feature_cols)
df_features.head(3)

Unnamed: 0,b1,b5,b6,b8,cspmb7
0,0.1199,0.049,0.0316,0.0238,23.382784
1,0.1199,0.049,0.0316,0.0237,23.382784
2,0.1199,0.0488,0.0324,0.0238,23.158824


In [19]:
# Partition the samples into two groups; the fixed random_state keeps the
# centroid initialisation repeatable between runs.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(df_features);  # trailing ';' keeps the cell output-free

In [20]:
# Number of samples assigned to each cluster.
cluster_labels = pd.Series(kmeans.labels_)
cluster_labels.value_counts()


0    17049
1     2238
dtype: int64

In [21]:
# Record each sample's hard KMeans assignment next to its features.
cluster_ids = kmeans.labels_
df_features = df_features.assign(cluster=cluster_ids)
df_features.head(3)

Unnamed: 0,b1,b5,b6,b8,cspmb7,cluster
0,0.1199,0.049,0.0316,0.0238,23.382784,0
1,0.1199,0.049,0.0316,0.0237,23.382784,0
2,0.1199,0.0488,0.0324,0.0238,23.158824,0


In [22]:
# Soft cluster membership: turn each sample's distance to every KMeans
# centroid into inverse-distance weights that sum to 1 per row, so a sample
# close to centroid k gets a weight near 1 in `norm_dist_to_k`.

# Feed KMeans only the columns it was fitted on: strip the hard label and any
# weight columns already appended to df_features, so the cell can be re-run.
derived_cols = [
    col for col in df_features.columns
    if col == 'cluster' or str(col).startswith('norm_dist_to_')
]
kmeans_inputs = df_features.drop(columns=derived_cols)
centroid_dist = kmeans.transform(kmeans_inputs)

# A sample lying exactly on a centroid has distance 0. Flooring at machine
# epsilon avoids 1/0 = inf and the NaN weights that inf/inf would produce;
# every other distance is left unchanged.
inv_dist = 1.0 / np.maximum(centroid_dist, np.finfo(float).eps)
weights = inv_dist / inv_dist.sum(axis=1, keepdims=True)

# Reuse df_features' index so the later column-wise concat lines rows up by
# label even when the index is not a default 0..n-1 range.
df_dist = pd.DataFrame(
    weights,
    index=df_features.index,
    columns=[f'norm_dist_to_{k}' for k in range(weights.shape[1])],
)
df_dist



Unnamed: 0,norm_dist_to_0,norm_dist_to_1
0,0.627049,0.372951
1,0.627049,0.372951
2,0.644275,0.355725
3,0.644275,0.355725
4,0.609780,0.390220
...,...,...
19282,0.652871,0.347129
19283,0.583796,0.416204
19284,0.583796,0.416204
19285,0.583796,0.416204


In [23]:
# Spot-check the membership weights of a single sample, chosen by row position.
sample_pos = 6383
df_dist.iloc[sample_pos]


norm_dist_to_0    0.940016
norm_dist_to_1    0.059984
Name: 6383, dtype: float64

In [24]:
# Append the per-cluster membership weights as extra columns of the feature
# table (rows are matched on index labels).
frames_to_join = [df_features, df_dist]
df_features = pd.concat(frames_to_join, axis='columns')
df_features

Unnamed: 0,b1,b5,b6,b8,cspmb7,cluster,norm_dist_to_0,norm_dist_to_1
0,0.1199,0.0490,0.0316,0.0238,23.382784,0,0.627049,0.372951
1,0.1199,0.0490,0.0316,0.0237,23.382784,0,0.627049,0.372951
2,0.1199,0.0488,0.0324,0.0238,23.158824,0,0.644275,0.355725
3,0.1199,0.0488,0.0324,0.0232,23.158824,0,0.644275,0.355725
4,0.1196,0.0489,0.0323,0.0238,23.607309,0,0.609780,0.390220
...,...,...,...,...,...,...,...,...
19282,0.1267,0.0475,0.0308,0.0215,23.047057,0,0.652871,0.347129
19283,0.1267,0.0482,0.0295,0.0225,23.945153,0,0.583796,0.416204
19284,0.1267,0.0482,0.0295,0.0232,23.945153,0,0.583796,0.416204
19285,0.1267,0.0482,0.0295,0.0230,23.945153,0,0.583796,0.416204


In [31]:
# Hold out 30% of the samples for validation, then train one random forest
# per KMeans cluster on the training rows hard-assigned to that cluster.
# NOTE(review): kmeans was fitted on the full feature table before this split,
# so the cluster labels and weights of the validation rows were derived with
# those rows included -- confirm this is acceptable for the reported score.
X_train, X_val, y_train, y_val = train_test_split(
    df_features, df_data['z'], test_size=0.3, random_state=42
)

# Bookkeeping columns that must not be fed to the regressors.
aux_cols = ['norm_dist_to_0', 'norm_dist_to_1', 'cluster']

models = []
for cluster_id in range(kmeans.n_clusters):
    # X_train and y_train come from the same split and share row order, so one
    # positional mask selects matching feature and target rows, even if index
    # labels are not unique.
    in_cluster = (X_train['cluster'] == cluster_id).to_numpy()
    X_train_model = X_train.loc[in_cluster]
    y_train_model = y_train.loc[in_cluster]

    # Seed the forest so the validation score is reproducible, in line with
    # the seeded split and the seeded KMeans.
    forest = RandomForestRegressor(random_state=42)
    forest.fit(X_train_model.drop(columns=aux_cols), y_train_model)
    models.append(forest)

In [32]:
# Each per-cluster model predicts every validation row; its prediction is
# scaled by that row's membership weight for the model's cluster.
val_inputs = X_val.drop(columns=['norm_dist_to_0','norm_dist_to_1','cluster'])
predicts = [
    models[k].predict(val_inputs) * X_val[f'norm_dist_to_{k}']
    for k in range(2)
]

In [33]:
# Weighted contribution of the first cluster's model to each validation row.
first_model_contrib = predicts[0]
first_model_contrib

6383      3.779367
10019     8.486350
8380      6.172047
14471     7.435511
13091     1.851886
           ...    
13345     5.883622
11465    10.740708
4424      2.829379
17965    15.189405
17246     5.242562
Name: norm_dist_to_0, Length: 5787, dtype: float64

In [34]:
# Put the weighted per-model contributions side by side (one column each) and
# add them up row-wise to get the blended prediction.
contributions = pd.concat(predicts, axis=1)
df_predicts = contributions.assign(predict=contributions.sum(axis=1))
df_predicts

Unnamed: 0,norm_dist_to_0,norm_dist_to_1,predict
6383,3.779367,0.266194,4.045561
10019,8.486350,0.522519,9.008869
8380,6.172047,0.145628,6.317675
14471,7.435511,0.481199,7.916710
13091,1.851886,2.562104,4.413991
...,...,...,...
13345,5.883622,0.594262,6.477885
11465,10.740708,0.064786,10.805494
4424,2.829379,0.613140,3.442518
17965,15.189405,0.492844,15.682249


In [35]:
# Show the held-out target values (column `z`) for comparison with the
# blended predictions.
y_val

6383      2.995000
10019     6.083214
8380      4.624286
14471     9.088077
13091     4.519737
           ...    
13345     6.867500
11465    11.419600
4424      1.993077
17965    14.831923
17246     8.700000
Name: z, Length: 5787, dtype: float64

In [36]:
# Coefficient of determination of the blended prediction on the validation set.
val_r2 = r2_score(y_true=y_val, y_pred=df_predicts['predict'])
val_r2

0.786506422283785