# Train model and score on annotations

### This notebook uses annotations to make a pandas dataframe to train and validate a model on.

In [11]:
%matplotlib notebook
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import imblearn
import json

In [12]:
# Read in the premade data, see the data preparation plotting notebook
df_scaled = pd.read_csv("../../../data/annotations/annotations_pixel_dataframes/annotaties_coepelduynen_to_pixel_scaled.csv").drop(['Unnamed: 0'],axis=1)

In [13]:
# Check for colinearity
df_scaled[['r','g','b','i','ndvi','height']].corr()
# This indicates that we will filter r and g

Unnamed: 0,r,g,b,i,ndvi,height
r,1.0,0.977106,0.921585,0.547124,-0.481963,-0.175939
g,0.977106,1.0,0.964362,0.604075,-0.438349,-0.239864
b,0.921585,0.964362,1.0,0.523078,-0.490099,-0.269986
i,0.547124,0.604075,0.523078,1.0,0.363079,-0.074711
ndvi,-0.481963,-0.438349,-0.490099,0.363079,1.0,0.170164
height,-0.175939,-0.239864,-0.269986,-0.074711,0.170164,1.0


In [14]:
df_scaled.groupby(['label']).mean()

Unnamed: 0_level_0,r,g,b,i,ndvi,height,date
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Asfalt,0.399773,0.334474,0.28399,0.165628,0.651612,0.02531,20207080.0
Bos,0.282251,0.202853,0.146965,0.212731,0.831874,0.563899,20203250.0
Gras,0.303406,0.246729,0.200362,0.222265,0.814862,0.007948,20205460.0
Schaduw,0.350915,0.247807,0.183875,0.122517,0.576594,0.088708,20209600.0
Struweel,0.301044,0.221487,0.159558,0.217912,0.824665,0.241186,20203990.0
Vochtige duinvallei,0.308156,0.248914,0.19165,0.207573,0.799614,0.0,20207130.0
Zand,0.504575,0.493844,0.484533,0.273364,0.730527,0.008013,20207940.0


In [11]:
sample_dates = df_scaled['date'].unique()

In [12]:
sample_dates

array([20190601, 20200625, 20200731, 20200915, 20210709, 20210815,
       20210907, 20220515, 20220922], dtype=int64)

### Make a test set based on the date sampling instead of random sampling

We do this because we see a difference in model performance if a model has already seen a annotated satellite image, it makes it easier to predict it even if it is sampled.

With random sampling we will get samples from all the annotated satellite images yet in a production environment the model has to predict images which has not yet been annotated.

Each satellite image has it's own unique rgb values, because of atmospheric influence, thus the model needs to handle these unique values.
Thus it is a better a idea to make a test set by sampling dates instead of random sampling.


Currently we have 12 images we each around 120937 pixels, so we have a large amount of pixels but we need more across dates than in one satellite image.

Synthetic data might be the solution for this.

In [15]:
df_scaled['date'].value_counts().mean()

120936.33333333333

In [16]:
df_scaled['date'].unique()

array([20190601, 20200625, 20200731, 20200915, 20210709, 20210815,
       20210907, 20220515, 20220922], dtype=int64)

In [17]:
def cross_validation_balance_on_date(data, model, cv, no_images_to_remove = 2):
    """
    This method does cross validation based on dates instead of sampling.
        
    @param data: pandas DataFrame with a date column.
    @param model: a model with making predictions.
    @param cv: The number of folds.
    """

    results = []

    # sample values for the folds
    sample_dates = pd.Series(data['date'].unique()).sample(cv*no_images_to_remove).values

    sample_dates = [(sample_dates[i], sample_dates[i+1]) for i in range(0, len(sample_dates)-1, no_images_to_remove)]

    for fold in range(0, cv):
        print("---------fold: "+ str(fold+1))

        
        print("Picked dates: ")
        print(sample_dates[fold][0])
        print(sample_dates[fold][1])

        # Use a dates to make a test set.
        df_scaled_training = data[ (data['date'] != sample_dates[fold][0]) & (data['date'] != sample_dates[fold][1])]
        df_scaled_test = data[(data['date'] == sample_dates[fold][0]) | (data['date'] == sample_dates[fold][1])]

        print("Rebalanced dataset")
        # Rebalance dataset.
        oversample = imblearn.over_sampling.SMOTE()
        X_new, y_new = oversample.fit_resample(df_scaled_training[['r','g','b','i','ndvi','height']], df_scaled_training['label'])
        X_new['label'] = y_new
        df_scaled_balanced = X_new

        print("Fitting model")
        model.fit(df_scaled_balanced[['b','i','ndvi','height']].values, df_scaled_balanced['label'].values)

        print("Results for fold: "+str(fold+1))
        #print("-----------------Training set---------------")
        train =  metrics.classification_report(df_scaled_balanced['label'].values, model.predict(df_scaled_balanced[['b','i','ndvi','height']].values), output_dict=True)
        #print("-----------------Test set---------------")
        test = metrics.classification_report(df_scaled_test['label'].values,model.predict(df_scaled_test[['b','i','ndvi','height']].values), output_dict=True) 
        results.append([fold+1,{"train": train},{"test":  test}])
    
    return results
        

In [18]:
sample_dates = df_scaled['date'].sample(2).values

In [19]:
# Gridsearch parameters, this section can be skipped the optimal parameters are below

#Pick a date to be removed.
df_scaled_training = df_scaled[ (df_scaled['date'] != sample_dates[0]) & (df_scaled['date'] != sample_dates[1])]
df_scaled_test = df_scaled[(df_scaled['date'] == sample_dates[0]) | (df_scaled['date'] == sample_dates[1])]

# Oversample        
oversample = imblearn.over_sampling.SMOTE()
X_new, y_new = oversample.fit_resample(df_scaled_training[['r','g','b','i','ndvi','height']], df_scaled_training['label'])
X_new['label'] = y_new
df_scaled_balanced = X_new

# Number of trees in random forest
n_estimators = [10,20,100,200,300]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10,20,None]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True,False]
criterion = ['gini','entropy', 'log_loss']

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(df_scaled_balanced[['b','i','ndvi','height']].values, df_scaled_balanced['label'].values)
json = json.dumps(rf_random.best_params_)
f = open("best_params.json","w")
# write json object to file
f.write(json)

# close file
f.close()

{'n_estimators': [10, 20, 100, 200, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, None, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(


In [20]:
#Best parameters found after cross validation.
model = RandomForestClassifier(n_estimators=20, min_samples_split=2, min_samples_leaf=4, max_features='auto', bootstrap=False, max_depth=None)

In [21]:
# Final parameters, final parameters which the best implementation performance vs model performance.
model = RandomForestClassifier(n_estimators=10, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', bootstrap=True)

In [21]:
df_scaled

Unnamed: 0,r,g,b,i,ndvi,height,label,image,date
0,0.316497,0.275428,0.255914,0.249149,0.852564,0.396078,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
1,0.318109,0.276328,0.258065,0.245319,0.846154,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
2,0.317571,0.276328,0.257527,0.243617,0.839744,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
3,0.318109,0.277228,0.257527,0.243830,0.839744,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
4,0.321333,0.280828,0.262366,0.249787,0.846154,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
...,...,...,...,...,...,...,...,...,...
1088422,0.503863,0.455155,0.383097,0.276638,0.709402,0.000000,Vochtige duinvallei,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1088423,0.503004,0.454124,0.382668,0.275645,0.709402,0.000000,Vochtige duinvallei,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1088424,0.497425,0.446392,0.374517,0.265056,0.692308,0.000000,Vochtige duinvallei,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1088425,0.494421,0.442268,0.367653,0.257445,0.683761,0.000000,Vochtige duinvallei,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922


In [22]:
results = cross_validation_balance_on_date(df_scaled, model, 4)

---------fold: 1
Picked dates: 
20190601
20210815
Rebalanced dataset
Fitting model


  warn(


Results for fold: 1
---------fold: 2
Picked dates: 
20210709
20190601
Rebalanced dataset
Fitting model


  warn(


Results for fold: 2
---------fold: 3
Picked dates: 
20210815
20200915
Rebalanced dataset
Fitting model


  warn(


Results for fold: 3
---------fold: 4
Picked dates: 
20190601
20220515
Rebalanced dataset
Fitting model


  warn(


Results for fold: 4


In [91]:
# Summarize the cross validation on the none grid search parameters
for big_keys  in results[0][2]['test'].keys():
    print(big_keys)

    try:
        avg_keys = {key:0 for key in results[x][2]['test'][big_keys].keys()}

        for x in range(0,4):
            for key in avg_keys.keys():
                avg_keys[key] = avg_keys[key]+results[x][2]['test'][big_keys][key] 

        for key in avg_keys.keys():
            avg_keys[key] = avg_keys[key]/4
    except Exception as e:
        print(e)

    print(avg_keys)

Asfalt
{'precision': 0.30230889134957917, 'recall': 0.5969436207400559, 'f1-score': 0.39548733540969083, 'support': 8254.0}
Bos
{'precision': 0.9187299848347934, 'recall': 0.8278336901511142, 'f1-score': 0.8686328856664004, 'support': 45733.75}
Gras
{'precision': 0.9840236499685466, 'recall': 0.8995279767120067, 'f1-score': 0.9388400307229717, 'support': 180641.5}
Schaduw
{'precision': 0.06049545046814816, 'recall': 0.041587821809609085, 'f1-score': 0.04612147686267638, 'support': 1432.25}
Struweel
{'precision': 0.5098833007374699, 'recall': 0.6922744110710222, 'f1-score': 0.5781478650721434, 'support': 15469.0}
Vochtige duinvallei
{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 13011.5}
Zand
{'precision': 0.9140217890182544, 'recall': 0.9547989950296234, 'f1-score': 0.9322872906771277, 'support': 36280.0}
accuracy
'float' object has no attribute 'keys'
{'precision': 0.9140217890182544, 'recall': 0.9547989950296234, 'f1-score': 0.9322872906771277, 'support': 36280.0}
macr

In [33]:
# Summarize the cross validation on the grid search parameters on the train set.
for big_keys  in results[0][1]['train'].keys():
    print(big_keys)

    try:
        avg_keys = {key:0 for key in results[0][1]['train'][big_keys].keys()}

        for x in range(0,4):
            for key in avg_keys.keys():
                avg_keys[key] = avg_keys[key]+results[0][1]['train'][big_keys][key] 

        for key in avg_keys.keys():
            avg_keys[key] = avg_keys[key]/4
    except Exception as e:
        print(e)

    print(avg_keys)

Asfalt
{'precision': 0.996524694227594, 'recall': 0.9964799681978912, 'f1-score': 0.9965023307108829, 'support': 311929.0}
Bos
{'precision': 0.9991440752956319, 'recall': 0.9954476820045587, 'f1-score': 0.9972924535573884, 'support': 311929.0}
Gras
{'precision': 0.9990909966916134, 'recall': 0.9971724334704372, 'f1-score': 0.9981307931373634, 'support': 311929.0}
Schaduw
{'precision': 0.9994232377510117, 'recall': 0.9999294711296481, 'f1-score': 0.9996762903515294, 'support': 311929.0}
Struweel
{'precision': 0.9951585294192791, 'recall': 0.9989837430953838, 'f1-score': 0.9970674674469688, 'support': 311929.0}
Vochtige duinvallei
{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 311929.0}
Zand
{'precision': 0.9955433039101745, 'recall': 0.9968518476961103, 'f1-score': 0.9961971460975093, 'support': 311929.0}
accuracy
'float' object has no attribute 'keys'
{'precision': 0.9955433039101745, 'recall': 0.9968518476961103, 'f1-score': 0.9961971460975093, 'support': 311929.0}
macr

In [26]:
# Summarize the cross validation on the grid search parameters on the test set
for big_keys  in results[0][2]['test'].keys():
    print(big_keys)

    try:
        avg_keys = {key:0 for key in results[0][2]['test'][big_keys].keys()}

        for x in range(0,4):
            for key in avg_keys.keys():
                avg_keys[key] = avg_keys[key]+results[x][2]['test'][big_keys][key] 

        for key in avg_keys.keys():
            avg_keys[key] = avg_keys[key]/4
    except Exception as e:
        print(e)

    print(avg_keys)

Asfalt
{'precision': 0.3432880191214793, 'recall': 0.4752155196069843, 'f1-score': 0.3907049020449952, 'support': 5274.75}
Bos
{'precision': 0.8486721310536662, 'recall': 0.9187249989323708, 'f1-score': 0.8816467593316644, 'support': 37101.75}
Gras
{'precision': 0.9911619851846193, 'recall': 0.9724602814351372, 'f1-score': 0.98166567171887, 'support': 306133.5}
Schaduw
{'precision': 0.048837708785181566, 'recall': 0.07247725392886684, 'f1-score': 0.05676587358126265, 'support': 477.0}
Struweel
{'precision': 0.5555030013002247, 'recall': 0.56776793980361, 'f1-score': 0.5555622775534989, 'support': 14561.75}
Vochtige duinvallei
{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 10222.75}
Zand
{'precision': 0.9560495100746687, 'recall': 0.9323920512337888, 'f1-score': 0.9439110333377736, 'support': 32221.75}
accuracy
'float' object has no attribute 'keys'
{'precision': 0.9560495100746687, 'recall': 0.9323920512337888, 'f1-score': 0.9439110333377736, 'support': 32221.75}
macro a

### Test set with random sampling

In [61]:
X_train, X_test, y_train, y_test = train_test_split(df_scaled[['b','i','ndvi','height']].values, df_scaled['label'].values,test_size=0.3)

In [62]:
X_test

array([[0.13256484, 0.15542245, 0.77124183, 0.24705882],
       [0.129683  , 0.20554855, 0.85620915, 0.00392157],
       [0.17167669, 0.23324742, 0.83333333, 0.00392157],
       ...,
       [0.13894456, 0.1810567 , 0.78846154, 0.00392157],
       [0.41282565, 0.25869845, 0.71794872, 0.00392157],
       [0.23972603, 0.22368421, 0.80519481, 0.00392157]])

In [64]:
X_test

array([[0.13256484, 0.15542245, 0.77124183, 0.24705882],
       [0.129683  , 0.20554855, 0.85620915, 0.00392157],
       [0.17167669, 0.23324742, 0.83333333, 0.00392157],
       ...,
       [0.13894456, 0.1810567 , 0.78846154, 0.00392157],
       [0.41282565, 0.25869845, 0.71794872, 0.00392157],
       [0.23972603, 0.22368421, 0.80519481, 0.00392157]])

In [65]:
# Small width random forest
print(metrics.classification_report(y_test,model.predict(X_test)))

                     precision    recall  f1-score   support

             Asfalt       0.60      0.91      0.72      6904
                Bos       0.93      0.96      0.94     35389
               Gras       1.00      0.97      0.98    210898
            Schaduw       0.64      0.58      0.61      1196
           Struweel       0.79      0.85      0.82     14112
Vochtige duinvallei       1.00      1.00      1.00     16208
               Zand       0.98      0.99      0.99     41822

           accuracy                           0.97    326529
          macro avg       0.85      0.89      0.87    326529
       weighted avg       0.97      0.97      0.97    326529



In [None]:
print(metrics.classification_report(y_test,rf_random.predict(X_test)))

In [35]:
model.predict([df_scaled[['r','g','b','i','ndvi','height']].values[0]])

array(['Gras'], dtype=object)

In [33]:
print(metrics.classification_report(y_test,model.predict(X_test)))

                     precision    recall  f1-score   support

             Asfalt       0.97      0.93      0.95      3263
                Bos       0.99      0.98      0.98     33580
               Gras       0.99      0.99      0.99     99654
          Laag Gras       0.99      0.99      0.99     87005
            Schaduw       0.98      0.96      0.97       906
           Struweel       0.93      0.95      0.94     11268
               Zand       0.99      0.99      0.99     33614
vochtige duinvallei       0.99      0.98      0.98     29736

           accuracy                           0.99    299026
          macro avg       0.98      0.97      0.98    299026
       weighted avg       0.99      0.99      0.99    299026



In [36]:
len(model.predict(df_scaled[['r','g','b','i','ndvi','height']]))

  f"X has feature names, but {self.__class__.__name__} was fitted without"


996753

In [37]:
confusion_matrix = metrics.confusion_matrix(model.predict(X_test), y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = model.classes_)

cm_display.plot()
plt.savefig('confusion_matrix.pdf')

<IPython.core.display.Javascript object>

In [19]:
#model = RandomForestClassifier(criterion="entropy", max_features=3)
print(cross_val_score(model,df_scaled[['r','g','b','i','ndvi','height']].values,df_scaled['label'].values, cv=4))
#model = model.fit(X_train,y_train)

  warn(
  warn(
  warn(
  warn(


[0.68343912 0.9536825  0.83569978 0.6163491 ]


# Export Random Forest model.

In [38]:
filename = './models/randomforest_classifier_coepelduynen_contrast_annotations_2019_2022.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
filename = './models/randomforest_classifier_coepelduynen_contrast_annotations_grid_search_2019_2022.sav'
pickle.dump(rf_random, open(filename, 'wb'))

In [None]:
oversample = imblearn.over_sampling.SMOTE()
X_new, y_new = oversample.fit_resample(df_scaled[['r','g','b','i','ndvi','height']], df_scaled['label'])
X_new['label'] = y_new
df_scaled_balanced = X_new

In [38]:
# Train the model on the whole balanced dataset and export the model.
model = RandomForestClassifier(n_estimators=10, min_samples_split=5, min_samples_leaf=1, max_features='auto', bootstrap=False)
model = model.fit(df_scaled_balanced[['r','g','b','i','ndvi','height']].values, df_scaled_balanced['label'].values)

filename = './models/randomforest_classifier_coepelduynen_contrast_annotations_grid_search_all_data_2019_2022_small_balanced_v1.3.sav'
pickle.dump(model, open(filename, 'wb'))

  warn(


In [None]:
# Train the model on the whole dataset and export the model.
model = RandomForestClassifier(n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', bootstrap=False)
model = model.fit(df_scaled[['r','g','b','i','ndvi','height']].values, df_scaled['label'].values)

filename = './models/randomforest_classifier_coepelduynen_contrast_annotations_grid_search_all_data_2019_2022.sav'
pickle.dump(model, open(filename, 'wb'))

# Deep Learning model

In [47]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [82]:
def cross_validation_balance_on_date_keras(data,cv=4,no_images_to_remove=2):

    """
    This method does cross validation based on dates instead of sampling.
        
    @param data: pandas DataFrame with a date column.
    @param model: a model with making predictions.
    @param cv: The number of folds.
    """

    results = []

    # sample values for the folds
    sample_dates = pd.Series(data['date'].unique()).sample(cv*no_images_to_remove).values

    sample_dates = [(sample_dates[i], sample_dates[i+1]) for i in range(0, len(sample_dates)-1, no_images_to_remove)]

    for fold in range(0, cv):
        print("---------fold: "+ str(fold+1))

        
        print("Picked dates: ")
        print(sample_dates[fold][0])
        print(sample_dates[fold][1])

        # Use a dates to make a test set.
        df_scaled_training = data[ (data['date'] != sample_dates[fold][0]) & (data['date'] != sample_dates[fold][1])]
        df_scaled_test = data[(data['date'] == sample_dates[fold][0]) | (data['date'] == sample_dates[fold][1])]
        df_scaled_test['label_num'] = label_encoder.fit_transform(df_scaled_test['label'])

        print("Rebalanced dataset")
        # Rebalance dataset.
        oversample = imblearn.over_sampling.SMOTE()
        X_new, y_new = oversample.fit_resample(df_scaled_training[['r','g','b','i','ndvi','height']], df_scaled_training['label'])
        X_new['label'] = y_new
        df_scaled_balanced = X_new

        df_scaled_balanced['label_num'] = label_encoder.fit_transform(df_scaled_balanced['label'])
        label_encoder_fitted = label_encoder.fit(df_scaled_balanced['label'] )

        model = get_model()

        print("Fitting model")
        

        history = model.fit(df_scaled_balanced[['b','i','ndvi','height']].values.reshape(len(df_scaled_balanced),1,1,4), df_scaled_balanced['label_num'].values,\
                            epochs=10,\
                            validation_data=(df_scaled_test[['b','i','ndvi','height']].values.reshape(len(df_scaled_test),1,1,4), df_scaled_test['label_num'].values))

        print("Predicting test")
        predictions =[ np.argmax(prediction) for prediction in model.predict(df_scaled_test[['b','i','ndvi','height']].values.reshape(len(df_scaled_test),1,1,4))]

        print("Results for fold: "+str(fold+1))
        #print("-----------------Training set---------------")
        #train =  metrics.classification_report(df_scaled_balanced['label'].values, model.predict(df_scaled_balanced[['b','i','ndvi','height']].values), output_dict=True)
        #print("-----------------Test set---------------")
        test = metrics.classification_report(label_encoder_fitted.inverse_transform(df_scaled_test['label_num'].values), label_encoder_fitted.inverse_transform(predictions), output_dict=True)
        results.append([fold+1,{"test":  test}])
    
    return results

In [84]:
def get_model():

    model = keras.models.Sequential()
    model.add(layers.Conv2D(4, (1, 1), activation='relu', input_shape=(1,1,4)))
    model.add(layers.MaxPooling2D((1, 1)))
    model.add(layers.Conv2D(12, (1, 1), activation='relu'))
    model.add(layers.MaxPooling2D((1, 1)))
    model.add(layers.Conv2D(12, (1, 1), activation='relu'))
    model.add(layers.Flatten())
    model.add(layers.Dense(12, activation='relu'))
    model.add(layers.Dense(len(df_scaled['label'].unique())))
    model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
    return model

In [85]:
results = cross_validation_balance_on_date_keras(df_scaled)

---------fold: 1
Picked dates: 
20220922
20200915
Rebalanced dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_scaled_test['label_num'] = label_encoder.fit_transform(df_scaled_test['label'])


Fitting model
Epoch 1/10

In [65]:
predictions =[ np.argmax(prediction) for prediction in model.predict(X_test.reshape(X_test.shape[0],1,1,4))]



In [69]:
metrics.classification_report(label_encoder_fitted.inverse_transform(y_test), label_encoder_fitted.inverse_transform(predictions))

'                     precision    recall  f1-score   support\n\n             Asfalt       0.76      0.64      0.70      6750\n                Bos       0.97      0.92      0.94     35464\n               Gras       0.99      0.99      0.99    210964\n            Schaduw       0.56      0.39      0.46      1133\n           Struweel       0.77      0.85      0.81     14039\nVochtige duinvallei       1.00      1.00      1.00     16445\n               Zand       0.97      0.98      0.98     41734\n\n           accuracy                           0.97    326529\n          macro avg       0.86      0.83      0.84    326529\n       weighted avg       0.97      0.97      0.97    326529\n'

In [None]:
model = tf.Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation=Activation(tf.nn.softmax)))

# Check contrast model.

In [32]:
from nso_ds_classes.nso_ds_models import cluster_scaler_BNDVIH_model
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [None]:
confusion_matrix = metrics.confusion_matrix(df_scaled[df_scaled['label'] != "Pad"]['label'].values, df_scaled[df_scaled['label'] != "Pad"]['predict'].values)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels=df_scaled[df_scaled['label'] != "Pad"]['predict'].unique())

cm_display.plot()
plt.show()

In [34]:
cluster_centers_file = "./cluster_centers/normalized_5_BHNDVI_cluster_centers_dunes.csv"

In [35]:
a_cluster_annotations_stats_model = cluster_scaler_BNDVIH_model(cluster_centers_file)

In [36]:
a_cluster_annotations_stats_model.get_class_label(a_cluster_annotations_stats_model.predict([df[['r','g','b','i','ndvi','height']].values[0]]))

'Bos'

In [27]:
df_scaled

Unnamed: 0,r,g,b,i,ndvi,height,label,image,date
0,0.316497,0.275428,0.255914,0.249149,0.852564,0.396078,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
1,0.318109,0.276328,0.258065,0.245319,0.846154,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
2,0.317571,0.276328,0.257527,0.243617,0.839744,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
3,0.318109,0.277228,0.257527,0.243830,0.839744,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
4,0.321333,0.280828,0.262366,0.249787,0.846154,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
...,...,...,...,...,...,...,...,...,...
1047980,0.720601,0.726804,0.682969,0.375248,0.683761,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047981,0.730901,0.738144,0.696268,0.390470,0.692308,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047982,0.736910,0.743814,0.702703,0.399404,0.700855,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047983,0.724034,0.726804,0.682969,0.382859,0.692308,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922


In [37]:
df_scaled['predict'] = df_scaled.apply(lambda x: a_cluster_annotations_stats_model.get_class_label(a_cluster_annotations_stats_model.predict([x[['r','g','b','i','ndvi','height']]])), axis=1)

In [38]:
df_scaled['predict'].unique()

array(['Bos', 'Gras', 'Struweel', 'Laag gras', 'Asfalt', 'Zand'],
      dtype=object)

In [39]:
print(metrics.classification_report(df_scaled['label'], df_scaled['predict']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

             Asfalt       0.35      0.70      0.47     18263
                Bos       1.00      0.81      0.90    111919
               Gras       0.82      0.90      0.86    335515
          Laag Gras       0.00      0.00      0.00    312840
          Laag gras       0.00      0.00      0.00         0
            Schaduw       0.00      0.00      0.00      3866
           Struweel       0.56      0.87      0.68     39586
               Zand       0.84      0.96      0.89    126996
vochtige duinvallei       0.00      0.00      0.00     99000

           accuracy                           0.54   1047985
          macro avg       0.40      0.47      0.42   1047985
       weighted avg       0.50      0.54      0.51   1047985



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
