# Models Creation, training and assessment

This notebook contains the classes and functions used to create, tune, train and assess the models.

## Import

Import all the useful libraries used in this notebook.

In [None]:
from skimage.io import imread, imshow
from skimage.color import rgb2gray
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import progressbar
import time
from collections import Counter
import math
import itertools
import csv

This cell mounts Google Drive so that the project files are accessible when running on Google Colab. The cell after it fixes the random seed so that the results can be reproduced.

In [None]:
# Colab only: mount the user's Google Drive inside the runtime.
from google.colab import drive
drive.mount('/content/drive')
# Move into the project folder; the relative data path used further down is resolved from here.
%cd /content/drive/MyDrive/Projet/code

Mounted at /content/drive
/content/drive/MyDrive/Projet/code


In [None]:
# Seed NumPy's global random generator so that repeated runs draw the same random numbers.
np.random.seed(22)

# Functions

## Dataset

In [1]:
class Dataset:
  """ A class to save and analyse the dataset
      Attributes
      ----------
          size : int
                size of the dataset
          images : list of numpy array
                list of images of the dataset
          cropcodes : list of str
                list of labels of the dataset (also readable as ``cropcode``)
          stats : numpy array
                extracted features, one row per image of the dataset
          stats_train : list of numpy array
                Extracted features for the training
          stats_test : list of numpy array
                Extracted features for the testing
          cropcode_train : list of str
                labels for the training
          cropcode_test : list of str
                labels for the testing
          class_names : list of str
                sorted list of the classes in the dataset
      Methods
      -------
          print_info():
                plot occurrences of the dataset
          separate_train_test(s):
                separate the dataset between training and testing set
          display_train_test():
                plot occurrences of the training and testing sets
  """

  def __init__(self, data):
    """
        Parameters
        ----------
        data: dict
            dataset used. The images are read from ``data['image']`` and the
            labels from ``data['cropcode']``.
    """
    self.images = list(data['image'])
    # NOTE(review): the label key 'cropcode' is assumed from the attribute
    # names used throughout this class -- confirm against the .npz file.
    self.cropcodes = list(data['cropcode'])
    self.size = len(self.images)
    # for each image extract the features and append them to the stats list
    stats = []
    for img in self.images:
      img = np.round(img/(2**4), 0)
      # mean and standard deviation (NaN pixels ignored) of bands 3, 2, 1 and 7
      features = []
      for band in (3, 2, 1, 7):
        features.append(np.nanmean(img[:, :, band]))
        features.append(np.nanstd(img[:, :, band]))
      # number of non-NaN pixels of band 7 (same count as getSize)
      features.append(int(np.count_nonzero(~np.isnan(img[:, :, 7]))))
      stats.append(features)
    self.stats = np.asarray(stats)/255
    # init empty train and testing set
    self.stats_train = []
    self.stats_test = []
    self.cropcode_train = []
    self.cropcode_test = []
    # get the class names; sorted so that the label order is the same on every run
    self.class_names = sorted(set(self.cropcodes))

  @property
  def cropcode(self):
    """ read-only alias of ``cropcodes`` """
    return self.cropcodes

  def print_info(self):
    """ plot occurrences of the dataset """
    legend = ("ww = Winter wheat",
              "sb = Spring barley",
              "sw = Spring Wheat",
              "wb = Winter barley",
              "be = Beet (sugar beet / fodder beet)",
              "fb = Field beans",
              "ma = Maize",
              "or = Oilseed rape",
              "po = Potatoes",
              "gr = Grass",
              "ot = Other crops",  # i may reject this data
              "pe = Peas",
              "wo = Winter oats")
    print('\n'.join(legend))

    occurrences = Counter(self.cropcodes)
    labels = sorted(occurrences.keys())
    plt.bar(labels, [occurrences[label] for label in labels])
    plt.xlabel('Labels')
    plt.ylabel('Occurrence')
    plt.title('Occurrence of crops types')
    plt.show()

  def separate_train_test(self, s):
    """ separate the dataset between training and testing set
        Parameters
        ----------
        s: float
            size of the testing set in percent. s is between 0 and 1
    """
    self.stats_train, self.stats_test, self.cropcode_train, self.cropcode_test = train_test_split(self.stats, self.cropcodes, test_size=s)

    print('Size of training set : ' + str(len(self.cropcode_train)) + ' / ' + str(len(self.cropcodes)))
    print('Size of testing set : ' + str(len(self.cropcode_test))+ ' / ' + str(len(self.cropcodes)))
    self.display_train_test()

  def display_train_test(self):
    """plot occurrences of the training and testing sets"""
    train = Counter(self.cropcode_train)
    test = Counter(self.cropcode_test)
    # Use the union of the labels so both bar series share the same x positions,
    # even when a class is missing from one of the two sets (Counter gives 0).
    labels = sorted(set(train) | set(test))
    if not labels:
      print('Training and testing sets are empty, call separate_train_test first')
      return
    train_counts = [train[label] for label in labels]
    test_counts = [test[label] for label in labels]

    p1 = plt.bar(labels, train_counts, width=0.5)
    # stack the test bars on top of the training bars
    p2 = plt.bar(labels, test_counts, width=0.5, bottom=train_counts)

    plt.legend((p1[0], p2[0]), ('Training set', 'Test set'), loc='lower left')
    plt.xlabel('Labels')
    plt.ylabel('Occurrence')
    plt.title('Occurrence of training and testing sets')
    plt.show()

In [None]:
def getSize(img):
  """compute the size of the crop inside the image
    Parameters
    ----------
    img: numpy array
         crop image from the dataset (one band); NaN marks pixels that are
         not counted

    Returns
    ---------
    c : int
        the crop size in pixels, i.e. the number of values that are not NaN"""
  # vectorised NaN test instead of a Python loop over every pixel
  not_nan = ~np.isnan(np.asarray(img, dtype=float))
  return int(np.count_nonzero(not_nan))

## Accuracy measure

In [None]:
from sklearn.metrics import confusion_matrix, cohen_kappa_score, ConfusionMatrixDisplay, precision_recall_fscore_support, f1_score
import pandas as pd

def accuracy_metrics(cropcode_true, cropcode_predicted, class_names):
  """compute, print and plot the accuracy of the prediction.
    Parameters
    ----------
    cropcode_true: list str
         true labels
    cropcode_predicted: list str
         predicted labels
    class_names: list str
        list of the dataset labels

    Returns
    ---------
    conf_mx : numpy array
        confusion matrix, each row (true label) normalised to sum to 1
    df : pandas DataFrame
        precision, recall and f1-score of each class
    accuracy : float
        fraction of samples whose predicted label equals the true label
    kappa : float
        Cohen's kappa coefficient
  """
  # compute and plot the confusion matrix
  conf_mx = confusion_matrix(cropcode_true, cropcode_predicted, labels=class_names, normalize='true')
  disp = ConfusionMatrixDisplay(conf_mx, display_labels=class_names)
  disp.plot(cmap='Blues')
  # compute the overall accuracy, kappa coefficient and overall f1-score
  # The accuracy is taken from the labels themselves: the confusion matrix above
  # is row-normalised, so its trace/sum would be the mean per-class recall instead.
  accuracy = float(np.mean(np.asarray(cropcode_true) == np.asarray(cropcode_predicted)))
  print("Overall Accuracy : {:0.4f}  (misclass ={:0.4f})".format(accuracy, 1-accuracy))
  kappa = cohen_kappa_score(cropcode_true, cropcode_predicted)
  print("Cohen's Coefficient : {:0.4f}".format(kappa))
  overall_f1 = f1_score(cropcode_true, cropcode_predicted, average='weighted')
  print("Overall f1_score : {:0.4f}".format(overall_f1))
  # compute the precisions, recalls and f1-scores and print the data frame
  d = {}
  d['precision'], d['recall'], d['f1'], _ = precision_recall_fscore_support(cropcode_true, cropcode_predicted, labels=class_names)
  df = pd.DataFrame(data=d, index=class_names)
  display(df)
  # the callers unpack these four values
  return conf_mx, df, accuracy, kappa

# Models

## Dataset preparation

In [None]:
# Read the .npz archive and build the Dataset object from it.
# NOTE(review): allow_pickle=True makes np.load unpickle stored objects, which can
# execute arbitrary code -- only load archives from a trusted source.
data = np.load("DataSet/data_12_11.npz", allow_pickle=True)
DS = Dataset(data)

In [None]:
# Print the crop code legend and plot how often each label occurs in the dataset.
DS.print_info()

In [None]:
# Split into training and testing sets, keeping 20% of the samples for testing;
# the call also prints both set sizes and plots their label occurrences.
DS.separate_train_test(0.2)

## SVM

#### Hyperparameter Tuning

Tune the Regularization parameter :

In [None]:
from sklearn import svm
# candidate regularization values: 20 evenly spaced points between 5 and 200
C = np.linspace(5, 200, 20)

# one score slot per candidate value
score_training = [0.0] * len(C)
score_testing = [0.0] * len(C)
bar = progressbar.ProgressBar(maxval=len(C)).start()
idx = 0
# fit a linear SVM for every C and record its accuracy on both sets
for idx, c in enumerate(C, start=1):
  clf = svm.SVC(kernel='linear', C=c)
  clf.fit(DS.stats_train, DS.cropcode_train)
  score_training[idx - 1] = round(clf.score(DS.stats_train, DS.cropcode_train), 2)
  score_testing[idx - 1] = round(clf.score(DS.stats_test, DS.cropcode_test), 2)
  bar.update(idx)

# accuracy of the training and testing sets as a function of C
color = 'tab:blue'
fig, ax1 = plt.subplots()
ax1.set_xlabel('C')
ax1.set_ylabel('Accuracy', color=color)
ax1.plot(C, score_training, '-bo', label='Training set')
ax1.plot(C, score_testing, '--b*', label='Testing set')
ax1.tick_params(axis='y', labelcolor=color)
ax1.legend(loc='upper left')
ax1.grid()
fig.tight_layout()  # keep the axis labels inside the figure
plt.show()

Tune gamma for the RBF kernel

In [None]:
from sklearn import svm
C = 100.0
# gamma must be strictly positive to be meaningful for the RBF kernel: gamma=0 is
# rejected by some scikit-learn versions and otherwise gives a constant kernel,
# so the grid starts at a small positive value instead of 0.
gamma = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
# init the score lists
score_training = [0.0] * len(gamma)
score_testing = [0.0] * len(gamma)
bar = progressbar.ProgressBar(maxval=len(gamma)).start()
idx = 0
# for each gamma score the training and testing of the model
for idx, g in enumerate(gamma, start=1):
  clf = svm.SVC(kernel='rbf', C=C, gamma=g)
  clf.fit(DS.stats_train, DS.cropcode_train)
  score_training[idx - 1] = round(clf.score(DS.stats_train, DS.cropcode_train), 2)
  score_testing[idx - 1] = round(clf.score(DS.stats_test, DS.cropcode_test), 2)
  bar.update(idx)
# plot the accuracy of the training and testing in function of gamma
fig, ax1 = plt.subplots()
color = 'tab:blue'
ax1.set_xlabel('gamma')
ax1.set_ylabel('Accuracy', color=color)
ax1.plot(gamma, score_training, '-bo', label='Training set')
ax1.plot(gamma, score_testing, '--b*', label='Testing set')
ax1.tick_params(axis='y', labelcolor=color)
plt.legend(loc='upper left')
plt.grid()
fig.tight_layout()  # keep the axis labels inside the figure
plt.show()

Tune the degree for the polynomial kernel:

In [None]:
from sklearn import svm
C = 100.0
# polynomial degrees to compare
degree = [1,2,3,4,5]

# one score slot per candidate degree
score_training = [0.0] * len(degree)
score_testing = [0.0] * len(degree)
bar = progressbar.ProgressBar(maxval=len(degree)).start()
idx = 0
# fit a polynomial SVM for every degree and record its accuracy on both sets
for idx, d in enumerate(degree, start=1):
  clf = svm.SVC(kernel='poly', C=C, degree=d)
  clf.fit(DS.stats_train, DS.cropcode_train)
  score_training[idx - 1] = round(clf.score(DS.stats_train, DS.cropcode_train), 2)
  score_testing[idx - 1] = round(clf.score(DS.stats_test, DS.cropcode_test), 2)
  bar.update(idx)

# accuracy of the training and testing sets as a function of the degree
color = 'tab:blue'
fig, ax1 = plt.subplots()
ax1.set_xlabel('degree')
ax1.set_ylabel('Accuracy', color=color)
ax1.plot(degree, score_training, '-bo', label='Training set')
ax1.plot(degree, score_testing, '--b*', label='Testing set')
ax1.tick_params(axis='y', labelcolor=color)
ax1.legend(loc='upper left')
ax1.grid()
fig.tight_layout()  # keep the axis labels inside the figure
plt.show()

#### Final Model

Train the tuned model and assess the prediction produced by this model

In [None]:
from sklearn import svm
ker = 'linear' #best kernel
# Fit the SVM with the chosen kernel and C=100 on the training set,
# then predict the labels of the testing set.
clf = svm.SVC(kernel=ker, C= 100)
clf.fit(DS.stats_train, DS.cropcode_train)
prediction = clf.predict(DS.stats_test)

# confusion matrix and accuracy metrics
conf_mx, df, accuracy, kappa = accuracy_metrics(DS.cropcode_test, prediction, DS.class_names)

## RF


#### Hyperparameter Tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier

depths = [2,5,8,10,15,20,25,30]
trees = [1, 10, 20, 30, 40, 50]

# for each number of trees and depth score the training and testing of the model
for s in trees:
  score_training = [0.0] * len(depths)
  score_testing = [0.0] * len(depths)
  # CPU time (time.process_time) spent fitting and scoring the test set, per depth
  time_train = [0.0] * len(depths)
  time_test = [0.0] * len(depths)
  bar = progressbar.ProgressBar(maxval=len(depths)).start()
  idx = 0
  for idx, d in enumerate(depths, start=1):
    clf = RandomForestClassifier(max_depth=d, random_state=0, n_estimators=s)
    t = time.process_time()
    clf.fit(DS.stats_train, DS.cropcode_train)
    time_train[idx - 1] = time.process_time() - t
    score_training[idx - 1] = round(clf.score(DS.stats_train, DS.cropcode_train), 2)
    # restart the clock so that time_test covers the test-set scoring only
    # (it previously measured the scoring of the training set)
    t = time.process_time()
    score_testing[idx - 1] = round(clf.score(DS.stats_test, DS.cropcode_test), 2)
    time_test[idx - 1] = time.process_time() - t
    bar.update(idx)
  # plot the accuracy of the training and testing in function of the depth for the number of tree s
  fig, ax1 = plt.subplots()
  color = 'tab:blue'
  ax1.set_xlabel('Depth (trees ='+str(s)+')')
  ax1.set_ylabel('Accuracy', color=color)
  ax1.plot(depths, score_training, '-bo', label='Training set')
  ax1.plot(depths, score_testing, '--b*', label='Testing set')
  ax1.tick_params(axis='y', labelcolor=color)
  plt.legend(loc='upper left')
  plt.grid()
  plt.show()

#### Final Model

Train the tuned model and assess the prediction produced by this model

In [None]:
from sklearn.ensemble import RandomForestClassifier
s = 20 # best number of trees
d = 10 # best depth

# Fit the random forest with the chosen depth and number of trees on the training set,
# then predict the labels of the testing set. random_state=0 fixes the forest's randomness.
clf = RandomForestClassifier(max_depth=d, random_state=0, n_estimators=s)
clf.fit(DS.stats_train, DS.cropcode_train)
prediction = clf.predict(DS.stats_test)

# confusion matrix and accuracy metrics
conf_mx, df, accuracy, kappa = accuracy_metrics( DS.cropcode_test, prediction, DS.class_names)