# Dataset Processing and Creation


This notebook contains the functions and code used to process and create the dataset, as well as two checks: per-crop feature statistics and image visualisation.

## Import


Import all the useful libraries used in this notebook.

In [1]:
from skimage.io import imread, imshow
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
import progressbar
import time
from collections import Counter
import random
import os
import pandas as pd
import math

This cell gives access to the Google Drive files when running on Google Colab.

In [2]:
# Mount Google Drive so that the files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')
# Move into the project folder: the relative 'DataSet/...' paths used below are resolved from here.
%cd /content/drive/MyDrive/Projet/code

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Projet/code


## Data selection Functions

In [3]:
def plot_data(data):
  """Plot a bar chart of the number of occurrences of each crop type.

  args
       data (Counter or dict): maps each crop code to its number of occurrences
  """
  # Materialise the dict views as lists: plt.bar expects sequences, and older
  # matplotlib releases may not accept dict_keys / dict_values objects directly.
  labels = list(data.keys())
  occurrences = list(data.values())
  plt.bar(labels, occurrences)
  plt.xlabel('Labels')
  plt.ylabel('Occurrence')
  plt.title('Occurrence of crops types')
  plt.show()


def getSize(img):
  """Compute the size of the crop inside the image.

  The crop size is the number of pixels whose value is not NaN. Images are
  indexed as img[row, col, band] in the rest of this notebook, so the count is
  taken over the first band (index 0), which the cloud processing never
  overwrites with NaN.

  args
       img (numpy array): crop image from the dataset, either (rows, cols, bands)
                          or a single 2-D band

  return
      c (int): the crop size in pixels
  """
  arr = np.asarray(img)
  # A 2-D array is already a single band; otherwise take band 0 of the image.
  # (The previous implementation iterated over img[0], i.e. the first image
  # row, and therefore counted values of that row only, across all bands.)
  band = arr if arr.ndim == 2 else arr[:, :, 0]
  return int(np.count_nonzero(~np.isnan(band)))


def getData_noClouds(gj, cover_perc, fold_path):
  """Create a dataset of parcel images in which the cloud pixels are replaced by NaN.

  A pixel is treated as non-cloud when at least one of the bands 3, 2, 1, 7 has a
  value <= 2**12. An image is kept only when its non-cloud pixels represent
  more than `cover_perc` percent of the crop size returned by getSize().

  args
       gj (dict): parsed Digimap GeoJSON content holding the crop information
                  (each feature provides properties 'crop_code' and 'gid')
       cover_perc (int): percentage of non-cloud pixels that an image must exceed to be kept
       fold_path (str): path to the folder of parcel images, named '<gid>.tif'

  return
      data_set (dict): {'image': [...], 'cropcode': [...]}, the processed dataset
  """
  # bands inspected for clouds and brightness threshold above which a value is seen as cloud
  cloud_bands = [3, 2, 1, 7]
  cloud_threshold = 2**12
  # init an empty data set
  data_set = {"image": [], "cropcode": []}
  features = gj['features']
  bar = progressbar.ProgressBar(maxval=len(features)).start()
  # Get the image for each crop of the GeoJSON file
  for idx, f in enumerate(features, start=1):
    # get the crop code
    cc = f['properties']['crop_code']
    # get the crop gid and build the image file name
    gid = f['properties']['gid']
    path = os.path.join(fold_path, str(gid) + '.tif')
    # some parcels have no image in the folder: they are skipped
    if os.path.exists(path):
      img = imread(path)
      # non-cloud mask: True where at least one inspected band is <= the threshold
      # (NaN compares False, so NaN pixels end up on the "cloud" side of the mask)
      mask = np.any(img[:, :, cloud_bands] <= cloud_threshold, axis=2)
      size = getSize(img)
      # size == 0 means the image holds no crop pixel: skip it rather than divide by zero
      if size > 0 and (np.count_nonzero(mask) / size * 100) > cover_perc:
        # replace the cloud pixels (mask False) by NaN in the inspected bands
        # NOTE(review): assigning NaN assumes a floating-point image - confirm the .tif dtype
        cloud = ~mask
        for band in cloud_bands:
          img[cloud, band] = np.nan
        # add the processed image and its corresponding crop code to the dataset
        data_set['image'].append(img)
        data_set['cropcode'].append(cc)
    bar.update(idx)
  bar.finish()
  return data_set


def equalize(data_set, low=None, seed=None):
  """Balance the dataset so that each kept crop type has between x and x+10% samples.

  Crop types with fewer than `low` samples are dropped, as is the 'ot' (other
  crops) type. For every other type a random target between `low` and
  `low + 10%` is drawn and the type is capped at that target (or at its own
  count when it has fewer samples). Samples are kept in dataset order until
  the target of their type is reached.

  args:
       data_set (dict): dataset to balance, with keys 'image' and 'cropcode'
       low (int, optional): lower limit x; when None the user is asked for it
                            on the standard input (original behaviour)
       seed (int, optional): seed of the random draws, for reproducible results;
                             when None the global `random` state is used
  return:
        data (dict): balanced dataset, with keys 'image' and 'cropcode'
  """
  # count the occurrences of each crop type and plot them
  key_value = dict(Counter(data_set['cropcode']))
  plot_data(key_value)
  print(key_value.keys())
  print(key_value.values())
  print()
  if low is None:
    # ask the user to give a lower limit of their choice
    print("Choose an interval")
    print("lower limit")
    low = int(input())
  print("upper limit at 10%")
  up = int(low+low*0.1)
  print(up)
  # a dedicated generator keeps the draws reproducible without touching the global state
  rng = random.Random(seed) if seed is not None else random
  # occurrences below the lower limit are set to 0,
  # the others are capped at a random value drawn in [low, up]
  for k, v in key_value.items():
    key_value[k] = 0 if v < low else min(v, rng.randint(low, up))
  # 'ot' (other crops) is set to 0
  key_value['ot']=0
  print(key_value)
  plot_data(key_value)
  # create the new dataset by keeping, for each type, the number of samples decided above
  remaining = key_value
  cropcode = []
  image = []
  total = len(data_set['cropcode'])
  bar = progressbar.ProgressBar(maxval=total).start()
  # walk the original dataset in order and keep a sample while its type still has room
  for idx in range(total):
    cc = data_set['cropcode'][idx]
    if remaining[cc] > 0:
      cropcode.append(cc)
      image.append(data_set['image'][idx])
      remaining[cc] -= 1
    bar.update(idx + 1)
  bar.finish()
  data = {'image' : image, 'cropcode' : cropcode}
  return data



## Create Dataset

This part shows how to use the functions developed above.

Open the Digimap GeoJSON file:

In [4]:
# GeoJSON is UTF-8 encoded: state it explicitly instead of relying on the platform default
with open('DataSet/Digimap_data/Train_labels_epsg_32630.geojson', encoding='utf-8') as f:
    gj = json.load(f)

Build the dataset with cloud processing and equalize it:

In [None]:
# Read the parcel images of the '20190724' folder, keeping those whose
# non-cloud share exceeds 60 % and replacing their cloud pixels by NaN.
data = getData_noClouds(
    gj,
    cover_perc=60,
    fold_path='DataSet/Prepared_Train_Dataset/Shapes/20190724/',
)
# Balance the number of samples per crop type.
dataset = equalize(data)

Save the dataset:

In [None]:
# `dataset` is a dict, not an array: np.save stores it as a pickled object,
# so reading it back requires np.load(..., allow_pickle=True).item()
np.save('DataSet/data_03_03_july.npy',dataset)

## test stats


In [None]:
# Load the dataset
# allow_pickle=True is required because the file holds a pickled dict; .item() unwraps
# the dict from the 0-d object array returned by np.load.
# Unpickling can execute arbitrary code: only load files from a trusted source.
dataset = np.load("DataSet/data_03_03_july.npy", allow_pickle=True).item()

In [None]:
# get the images and the corresponding crop codes from the chosen dataset
image = dataset['image']
cropcode = dataset['cropcode']
# legend: crop code -> crop name (this order is also the column order of the table)
crop_names = {
      'ww' : "Winter wheat",
      'sb' : "Spring barley",
      'sw' : "Spring Wheat",
      'wb' : "Winter barley",
      'be' : "Beet (sugar beet / fodder beet)",
      'fb' : "Field beans",
      'ma' : "Maize",
      'or' : "Oilseed rape",
      'po' : "Potatoes",
      'gr' : "Grass",
      'pe' : "Peas",
      'wo' : "Winter oats"
      }
print('\n'.join(code + " = " + name for code, name in crop_names.items()))
# rows of the table; bands 3, 2, 1, 7 are reported as Red, Green, Blue, IR
stat_names = ["size", "mean Red", "mean Green", "mean Blue", "mean IR",
              "std Red", "std Green", "std Blue", "std IR"]
stat_bands = [3, 2, 1, 7]
# init the dictionary that will contain the stats (zeros for the crops without any sample)
d = {code: np.zeros(len(stat_names), dtype=int) for code in crop_names}
# Per-crop sums and sample counts. The mean of a crop must be divided by the number
# of samples of THAT crop; weighting by the global loop index (as done previously)
# mixes the crops together and gives wrong averages.
stat_sums = {code: np.zeros(len(stat_names)) for code in crop_names}
stat_counts = {code: 0 for code in crop_names}
skipped = 0
for i in range(len(cropcode)):
  code = cropcode[i]
  if code not in stat_sums:
    # crop code outside the table above: ignore it instead of failing with a KeyError
    skipped += 1
    continue
  img = image[i]/(2**4)
  stats = ([getSize(img)]
           + [np.nanmean(img[:, :, b]) for b in stat_bands]
           + [np.nanstd(img[:, :, b]) for b in stat_bands])
  stat_sums[code] += np.asarray(stats, dtype=float)
  stat_counts[code] += 1
# compute the mean of each feature for each crop type, rounding once at the end
for code, n in stat_counts.items():
  if n > 0:
    d[code] = np.round(stat_sums[code]/n, 2)
if skipped:
  print(skipped, "sample(s) ignored: crop code not in the table")
print()
df = pd.DataFrame(data=d, index=stat_names)
display(df)

## Images Visualization

In [None]:
# Load the dataset
# The file holds a pickled dict: allow_pickle=True is required and .item() unwraps it.
# Unpickling can execute arbitrary code: only load files from a trusted source.
# NOTE(review): this file name differs from the one saved earlier in this notebook - confirm
# which dataset is meant to be visualised.
dataset = np.load("DataSet/data_03_03_may_reduced.npy", allow_pickle=True).item()

In [None]:
# fetch the images and their crop codes from the loaded dataset
image = dataset['image']
cropcode = dataset['cropcode']
# only the samples carrying one of these labels are displayed
codes = ['gr', 'sb', 'sw', 'wb', 'ww', 'or']
for i, label in enumerate(cropcode):
  if label not in codes:
    continue
  # the division builds a new array, so the image stored in the dataset is left untouched
  img = image[i]/(2**12)
  # band 0 is reused as the alpha channel: 1 where band 3 holds a value, 0 where it is NaN
  has_value = ~np.isnan(img[:, :, 3])
  img[:, :, 0] = has_value.astype(int)
  # show bands 3, 2, 1 as R, G, B with band 0 as transparency
  plt.figure()
  plt.imshow(img[:, :, [3, 2, 1, 0]])
  print(label, i)
  plt.show()