# Train model and score on annotations

### This notebook uses annotations to make a pandas dataframe to train and validate a model on.

In [1]:
%matplotlib notebook
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import imblearn

In [2]:
# Read in the premade data, see the data preparation plotting notebook
df_scaled = pd.read_csv("../../../data/annotations/annotations_pixel_dataframes/annotaties_coepelduynen_to_pixel_scaled.csv").drop(['Unnamed: 0'],axis=1)

In [3]:
# Check for colinearity
df_scaled[['r','g','b','i','ndvi','height']].corr()
# This indicates that we will filter r and g

Unnamed: 0,r,g,b,i,ndvi,height
r,1.0,0.977106,0.921585,0.547124,-0.481963,-0.175939
g,0.977106,1.0,0.964362,0.604075,-0.438349,-0.239864
b,0.921585,0.964362,1.0,0.523078,-0.490099,-0.269986
i,0.547124,0.604075,0.523078,1.0,0.363079,-0.074711
ndvi,-0.481963,-0.438349,-0.490099,0.363079,1.0,0.170164
height,-0.175939,-0.239864,-0.269986,-0.074711,0.170164,1.0


### Make a test set based on the date sampling instead of random sampling

We do this because we see a difference in model performance if a model has already seen a annotated satellite image, it makes it easier to predict it even if it is sampled.

With random sampling we will get samples from all the annotated satellite images yet in a production environment the model has to predict images which has not yet been annotated.

Each satellite image has it's own unique rgb values, because of atmospheric influence, thus the model needs to handle these unique values.
Thus it is a better a idea to make a test set by sampling dates instead of random sampling.


Currently we have 12 images we each around 120937 pixels, so we have a large amount of pixels but we need more across dates than in one satellite image.

Synthetic data might be the solution for this.

In [17]:
df_scaled['date'].value_counts().mean()

120936.33333333333

In [4]:
df_scaled['date'].unique()

array([20190601, 20200625, 20200731, 20200915, 20210709, 20210815,
       20210907, 20220515, 20220922], dtype=int64)

In [56]:
def cross_validation_balance_on_date(data, model, cv):
    """
    This method does cross validation based on dates instead of sampling.
        
    @param data: pandas DataFrame with a date column.
    @param model: a model with making predictions.
    @param cv: The number of folds.
    """

    results = []

    for fold in range(0, cv):
        print("---------fold: "+ str(fold+1))

        sample_dates = data['date'].sample(2).values
        print("Picked dates: ")
        print(sample_dates)

        # Use a dates to make a test set.
        df_scaled_training = data[ (data['date'] != sample_dates[0]) & (data['date'] != sample_dates[1])]
        df_scaled_test = data[(data['date'] == sample_dates[0]) | (data['date'] == sample_dates[1])]

        print("Rebalanced dataset")
        # Rebalance dataset.
        oversample = imblearn.over_sampling.SMOTE()
        X_new, y_new = oversample.fit_resample(df_scaled_training[['r','g','b','i','ndvi','height']], df_scaled_training['label'])
        X_new['label'] = y_new
        df_scaled_balanced = X_new

        print("Fitting model")
        model.fit(df_scaled_balanced[['b','i','ndvi','height']].values, df_scaled_balanced['label'].values)

        print("Results for fold: "+str(fold+1))
        #print("-----------------Training set---------------")
        train =  metrics.classification_report(df_scaled_balanced['label'].values, model.predict(df_scaled_balanced[['b','i','ndvi','height']].values), output_dict=True)
        #print("-----------------Test set---------------")
        test = metrics.classification_report(df_scaled_test['label'].values,model.predict(df_scaled_test[['b','i','ndvi','height']].values), output_dict=True) 
        results.append([fold+1,train, test])
    
    return results
        

In [87]:
# Gridsearch parameters, this section can be skipped the optimal parameters are below


# Number of trees in random forest
n_estimators = [10,20,100,200,300]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10,20,None]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True,False]
criterion = ['gini','entropy', 'log_loss']

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_

In [31]:
# Final parameters, final parameters which the best implementation performance vs model performance.
model = RandomForestClassifier(n_estimators=10, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', bootstrap=True)

In [34]:
df_scaled

Unnamed: 0,r,g,b,i,ndvi,height,label,image,date
0,0.316497,0.275428,0.255914,0.249149,0.852564,0.396078,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
1,0.318109,0.276328,0.258065,0.245319,0.846154,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
2,0.317571,0.276328,0.257527,0.243617,0.839744,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
3,0.318109,0.277228,0.257527,0.243830,0.839744,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
4,0.321333,0.280828,0.262366,0.249787,0.846154,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
...,...,...,...,...,...,...,...,...,...
1088422,0.503863,0.455155,0.383097,0.276638,0.709402,0.000000,Vochtige duinvallei,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1088423,0.503004,0.454124,0.382668,0.275645,0.709402,0.000000,Vochtige duinvallei,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1088424,0.497425,0.446392,0.374517,0.265056,0.692308,0.000000,Vochtige duinvallei,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1088425,0.494421,0.442268,0.367653,0.257445,0.683761,0.000000,Vochtige duinvallei,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922


In [57]:
results = cross_validation_balance_on_date(df_scaled, model, 4)

---------fold: 1
Picked dates: 
[20210815 20210907]
Rebalanced dataset


In [None]:
results

### Test set with random sampling

In [31]:
X_train, X_test, y_train, y_test = train_test_split(df_scaled[['b','i','ndvi','height']].values, df_scaled['label'].values,test_size=0.3)

In [35]:
X_test

array([[0.34531773, 0.26785714, 0.22334294, 0.22131148, 0.81699346,
        0.02352941],
       [0.30016722, 0.20304233, 0.15634006, 0.14564943, 0.73202614,
        0.51372549],
       [0.37126138, 0.30917327, 0.19615602, 0.2034746 , 0.77702703,
        0.00392157],
       ...,
       [0.27189683, 0.20882088, 0.19354839, 0.15085106, 0.74358974,
        0.00392157],
       [0.35396518, 0.27525253, 0.20376712, 0.24619114, 0.85714286,
        0.41568627],
       [0.24149433, 0.2012945 , 0.14696059, 0.21488402, 0.82692308,
        0.00392157]])

In [33]:
# Small width random forest
print(metrics.classification_report(y_test,model.predict(X_test[['b','i','ndvi','height']])))

  print(metrics.classification_report(y_test,model.predict(X_test[['b','i','ndvi','height']])))


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
print(metrics.classification_report(y_test,rf_random.predict(X_test)))

In [35]:
model.predict([df_scaled[['r','g','b','i','ndvi','height']].values[0]])

array(['Gras'], dtype=object)

In [33]:
print(metrics.classification_report(y_test,model.predict(X_test)))

                     precision    recall  f1-score   support

             Asfalt       0.97      0.93      0.95      3263
                Bos       0.99      0.98      0.98     33580
               Gras       0.99      0.99      0.99     99654
          Laag Gras       0.99      0.99      0.99     87005
            Schaduw       0.98      0.96      0.97       906
           Struweel       0.93      0.95      0.94     11268
               Zand       0.99      0.99      0.99     33614
vochtige duinvallei       0.99      0.98      0.98     29736

           accuracy                           0.99    299026
          macro avg       0.98      0.97      0.98    299026
       weighted avg       0.99      0.99      0.99    299026



In [36]:
len(model.predict(df_scaled[['r','g','b','i','ndvi','height']]))

  f"X has feature names, but {self.__class__.__name__} was fitted without"


996753

In [37]:
confusion_matrix = metrics.confusion_matrix(model.predict(X_test), y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = model.classes_)

cm_display.plot()
plt.savefig('confusion_matrix.pdf')

<IPython.core.display.Javascript object>

In [19]:
#model = RandomForestClassifier(criterion="entropy", max_features=3)
print(cross_val_score(model,df_scaled[['r','g','b','i','ndvi','height']].values,df_scaled['label'].values, cv=4))
#model = model.fit(X_train,y_train)

  warn(
  warn(
  warn(
  warn(


[0.68343912 0.9536825  0.83569978 0.6163491 ]


# Export Random Forest model.

In [38]:
filename = './models/randomforest_classifier_coepelduynen_contrast_annotations_2019_2022.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
filename = './models/randomforest_classifier_coepelduynen_contrast_annotations_grid_search_2019_2022.sav'
pickle.dump(rf_random, open(filename, 'wb'))

In [None]:
oversample = imblearn.over_sampling.SMOTE()
X_new, y_new = oversample.fit_resample(df_scaled[['r','g','b','i','ndvi','height']], df_scaled['label'])
X_new['label'] = y_new
df_scaled_balanced = X_new

In [38]:
# Train the model on the whole balanced dataset and export the model.
model = RandomForestClassifier(n_estimators=10, min_samples_split=5, min_samples_leaf=1, max_features='auto', bootstrap=False)
model = model.fit(df_scaled_balanced[['r','g','b','i','ndvi','height']].values, df_scaled_balanced['label'].values)

filename = './models/randomforest_classifier_coepelduynen_contrast_annotations_grid_search_all_data_2019_2022_small_balanced_v1.3.sav'
pickle.dump(model, open(filename, 'wb'))

  warn(


In [None]:
# Train the model on the whole dataset and export the model.
model = RandomForestClassifier(n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', bootstrap=False)
model = model.fit(df_scaled[['r','g','b','i','ndvi','height']].values, df_scaled['label'].values)

filename = './models/randomforest_classifier_coepelduynen_contrast_annotations_grid_search_all_data_2019_2022.sav'
pickle.dump(model, open(filename, 'wb'))

# Deep Learning model

In [62]:
import tensorflow as tf

import tensorflow_datasets as tfds
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

ModuleNotFoundError: No module named 'tensorflow_datasets'

In [56]:
df_scaled['label_num'] = label_encoder.fit_transform(df_scaled['label'])

In [72]:
X_train, X_test, y_train, y_test = train_test_split(df_scaled[['r','g','b','i','ndvi','height']].values, df_scaled['label_num'].values,test_size=0.3)

In [40]:
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()

# Normalize pixel values to be between 0 and 1
train_images, test_images = train_images / 255.0, test_images / 255.0

In [41]:
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

plt.figure(figsize=(10,10))
for i in range(25):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i])
    # The CIFAR labels happen to be arrays, 
    # which is why you need the extra index
    plt.xlabel(class_names[train_labels[i][0]])
plt.show()

<IPython.core.display.Javascript object>

In [59]:
df_scaled[['r','g','b','i','height']].values.shape

(1046864, 5)

In [103]:
model = models.Sequential()
model.add(layers.Conv2D(6, (1, 1), activation='relu', input_shape=(1,1,6)))
model.add(layers.MaxPooling2D((1, 1)))
model.add(layers.Conv2D(12, (1, 1), activation='relu'))
model.add(layers.MaxPooling2D((1, 1)))
model.add(layers.Conv2D(12, (1, 1), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(12, activation='relu'))
model.add(layers.Dense(10))

In [104]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [95]:
X_train = X_train.reshape(732804,1,1,6)

In [101]:
len(y_train)

732804

In [116]:


history = model.fit(X_train.reshape(732804,1,1,6), y_train, epochs=10, 
                    validation_data=(X_test.reshape(314060,1,1,6), y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [119]:
len(X_train)

732804

In [139]:
predictions =[ np.argmax(prediction) for prediction in model.predict(X_test.reshape(X_test.shape[0],1,1,6))]

In [140]:
metrics.classification_report(label_encoder.inverse_transform(y_test),label_encoder.inverse_transform(predictions))

'                     precision    recall  f1-score   support\n\n             Asfalt       0.79      0.71      0.75      5565\n                Bos       0.98      0.93      0.95     33581\n               Gras       0.97      0.98      0.97    100366\n          Laag Gras       0.96      0.96      0.96     93872\n            Schaduw       0.80      0.66      0.72       843\n           Struweel       0.77      0.85      0.81     11881\n               Zand       0.98      0.98      0.98     38126\nvochtige duinvallei       0.95      0.93      0.94     29826\n\n           accuracy                           0.95    314060\n          macro avg       0.90      0.88      0.89    314060\n       weighted avg       0.95      0.95      0.95    314060\n'

In [135]:
label_encoder.inverse_transform(predictions)

array(['Laag Gras', 'Bos', 'Gras', ..., 'Bos', 'Gras', 'Gras'],
      dtype=object)

In [127]:
y_train[0]

3

In [113]:
X_test.reshape(314060,1,1,6)

array([[[[0.26813541, 0.21287129, 0.18225806, 0.20553191, 0.83974359,
          0.        ]]],


       [[[0.30736163, 0.26147615, 0.24569892, 0.23085106, 0.83333333,
          0.00392157]]],


       [[[0.25083389, 0.21165049, 0.15898464, 0.24323454, 0.8525641 ,
          0.        ]]],


       ...,


       [[[0.22881921, 0.17993528, 0.13360053, 0.19909794, 0.82051282,
          0.00392157]]],


       [[[0.23882588, 0.18899676, 0.14696059, 0.20908505, 0.82051282,
          0.00392157]]],


       [[[0.4516129 , 0.39464068, 0.3573017 , 0.23494724, 0.7       ,
          0.00392157]]]])

In [115]:
model.test_on_batch(X_test.reshape(314060,1,1,6),y_test)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


[0.16105984151363373, 0.945013701915741]

In [27]:
df_scaled

Unnamed: 0,r,g,b,i,ndvi,height,label,image,date
0,0.316497,0.275428,0.255914,0.249149,0.852564,0.396078,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
1,0.318109,0.276328,0.258065,0.245319,0.846154,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
2,0.317571,0.276328,0.257527,0.243617,0.839744,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
3,0.318109,0.277228,0.257527,0.243830,0.839744,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
4,0.321333,0.280828,0.262366,0.249787,0.846154,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
...,...,...,...,...,...,...,...,...,...
1047980,0.720601,0.726804,0.682969,0.375248,0.683761,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047981,0.730901,0.738144,0.696268,0.390470,0.692308,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047982,0.736910,0.743814,0.702703,0.399404,0.700855,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047983,0.724034,0.726804,0.682969,0.382859,0.692308,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922


In [None]:
model = tf.Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation=Activation(tf.nn.softmax)))

# Check contrast model.

In [32]:
from nso_ds_classes.nso_ds_models import cluster_scaler_BNDVIH_model
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [None]:
confusion_matrix = metrics.confusion_matrix(df_scaled[df_scaled['label'] != "Pad"]['label'].values, df_scaled[df_scaled['label'] != "Pad"]['predict'].values)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels=df_scaled[df_scaled['label'] != "Pad"]['predict'].unique())

cm_display.plot()
plt.show()

In [34]:
cluster_centers_file = "./cluster_centers/normalized_5_BHNDVI_cluster_centers_dunes.csv"

In [35]:
a_cluster_annotations_stats_model = cluster_scaler_BNDVIH_model(cluster_centers_file)

In [36]:
a_cluster_annotations_stats_model.get_class_label(a_cluster_annotations_stats_model.predict([df[['r','g','b','i','ndvi','height']].values[0]]))

'Bos'

In [27]:
df_scaled

Unnamed: 0,r,g,b,i,ndvi,height,label,image,date
0,0.316497,0.275428,0.255914,0.249149,0.852564,0.396078,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
1,0.318109,0.276328,0.258065,0.245319,0.846154,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
2,0.317571,0.276328,0.257527,0.243617,0.839744,0.003922,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
3,0.318109,0.277228,0.257527,0.243830,0.839744,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
4,0.321333,0.280828,0.262366,0.249787,0.846154,0.317647,Gras,20190601_105844_SV1-04_50cm_RD_11bit_RGBI_Katw...,20190601
...,...,...,...,...,...,...,...,...,...
1047980,0.720601,0.726804,0.682969,0.375248,0.683761,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047981,0.730901,0.738144,0.696268,0.390470,0.692308,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047982,0.736910,0.743814,0.702703,0.399404,0.700855,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922
1047983,0.724034,0.726804,0.682969,0.382859,0.692308,0.003922,Zand,20220922_110546_SV2-01_SV_RD_11bit_RGBI_50cm_V...,20220922


In [37]:
df_scaled['predict'] = df_scaled.apply(lambda x: a_cluster_annotations_stats_model.get_class_label(a_cluster_annotations_stats_model.predict([x[['r','g','b','i','ndvi','height']]])), axis=1)

In [38]:
df_scaled['predict'].unique()

array(['Bos', 'Gras', 'Struweel', 'Laag gras', 'Asfalt', 'Zand'],
      dtype=object)

In [39]:
print(metrics.classification_report(df_scaled['label'], df_scaled['predict']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

             Asfalt       0.35      0.70      0.47     18263
                Bos       1.00      0.81      0.90    111919
               Gras       0.82      0.90      0.86    335515
          Laag Gras       0.00      0.00      0.00    312840
          Laag gras       0.00      0.00      0.00         0
            Schaduw       0.00      0.00      0.00      3866
           Struweel       0.56      0.87      0.68     39586
               Zand       0.84      0.96      0.89    126996
vochtige duinvallei       0.00      0.00      0.00     99000

           accuracy                           0.54   1047985
          macro avg       0.40      0.47      0.42   1047985
       weighted avg       0.50      0.54      0.51   1047985



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
