# Setup 

In [1]:
import tensorflow
from tensorflow import keras
from keras.datasets import mnist
from keras import backend as K
from matplotlib import pyplot as plt
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Activation, BatchNormalization, Conv2D, MaxPooling2D, Input, Dropout, GlobalAveragePooling2D,concatenate
from PIL import Image
import numpy as np
import PIL.ImageOps 
import csv
import zipfile
import os
import cv2
from natsort import natsorted
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import ImageDataGenerator
import pandas as pd

In [9]:
!pip install split-folders --quiet

Mount the Google Drive folder. Make sure the folder `malaria_project` is inside `MyDrive`. If it is under `Shared with me`, right-click it and add a shortcut to `MyDrive`.

In [8]:
# Mount Google Drive at /content/drive; the data paths used below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Variables:

In [14]:
# Number of samples per batch produced by the image generators.
BATCH_SIZE = 32
# Number of passes over the training data.
EPOCHS = 5

### Setting up the satellite data
First we split the satellite images into `train`, `val`, and `test` folders.

# TODO: Change it such that the entire satellite folder is within malaria_project

In [12]:
import splitfolders

# Copy (not move) the satellite images into output/train, output/val and output/test.
splitfolders.ratio(
    "/content/drive/MyDrive/x_data/x2",
    output="output",
    seed=1337,            # fixed seed passed to the splitter
    ratio=(.7, .2, .1),   # (train:validation:test)
    group_prefix=None,
    move=False,
)


Copying files: 5133 files [07:03, 12.11 files/s] 


Since we feed `DataFrames` to our `ImageDataGenerators`, we need to create them such that the full filename is the `x_col` entry and the label can be looked up from it. The entry in `x_col` must have exactly the same name as the satellite image so that the two can be matched.

In [10]:
def createDataFrame(label_dir, data_dir, ending=True):
  '''
  Build the DataFrame that pairs every image file in `data_dir` with its label.

  Args:
    label_dir: Path of the label CSV file. Column 0 of each row is read as the
      id and column 4 as the label value.
    data_dir: Directory containing the image files. An entry named ".DS_Store"
      is ignored.
    ending: If True, the id of an image is the part of its filename before the
      first "_"; otherwise it is the part before the first ".".

  Returns:
    A DataFrame with the columns `filename`, `id` and `label` (float), sorted
    by `id`.

  Raises:
    ValueError: If at least one image id has no row in the label file.
  '''
  separator = "_" if ending else "."

  #Retrieve the filenames and ids.
  filenames = [name for name in os.listdir(data_dir) if name != ".DS_Store"]
  ids = [name.split(separator)[0] for name in filenames]
  wanted_ids = set(ids)

  #Read the label of every id we actually need. Only matching rows are
  #converted to float, so a header row is skipped as long as its first cell
  #is not an image id. If an id occurs in several rows the first one wins.
  labels_by_id = {}
  with open(label_dir, newline='') as csv_file:
    for line in csv.reader(csv_file, delimiter=','):
      if len(line) <= 4:
        continue
      row_id = line[0]
      if row_id in wanted_ids and row_id not in labels_by_id:
        labels_by_id[row_id] = float(line[4])

  missing_ids = sorted(wanted_ids - set(labels_by_id))
  if missing_ids:
    raise ValueError(
        "No label found in %s for %d image id(s), e.g. %s"
        % (label_dir, len(missing_ids), missing_ids[:5]))

  #Look the label up per file so that every label ends up on the row of its
  #own image, independent of the order of os.listdir and of the CSV rows.
  df = pd.DataFrame({'filename': filenames, 'id': ids})
  df['label'] = [labels_by_id[image_id] for image_id in ids]
  return df.sort_values(by='id')

Creating the `DataFrames`

In [13]:
# Label file and the three folders holding the split satellite images.
label_dir = '/content/drive/MyDrive/malaria_project/long_lat_year_with_confidential_from2010to18_size10orGreater.csv'
train_data_dir = 'output/train/satellite_data_v2'
test_data_dir = 'output/test/satellite_data_v2'
val_data_dir = 'output/val/satellite_data_v2'

# One (filename, id, label) DataFrame per split, built in the order train, test, val.
train_df, test_df, val_df = (
    createDataFrame(label_dir=label_dir, data_dir=split_dir)
    for split_dir in (train_data_dir, test_data_dir, val_data_dir)
)

### Setting up the landcover data

In [15]:
# Ids (filename part before the first "_") of the satellite images whose land cover must be copied.
x_names = [
    satellite_file.split("_")[0]
    for satellite_file in os.listdir("/content/drive/MyDrive/x_data/x2/satellite_data_v2")
]

In [16]:
# Copy the land-cover image of every satellite id in `x_names` into the local
# folder `landcover_full`.
import shutil  # not part of the import cell at the top; later cells use it as well

source_folder = r"/content/drive/MyDrive/malaria_project/landcover_with_confi_2010_2018_size10andGreater"
destination_folder = r"landcover_full"
os.makedirs(destination_folder, exist_ok=True)

# A set gives constant-time membership tests and makes sure a file is copied
# only once even if the same id occurs several times in `x_names`.
wanted_ids = set(x_names)
for file_name in os.listdir(source_folder):
  if file_name.split(".")[0] in wanted_ids:
    shutil.copy(os.path.join(source_folder, file_name),
                os.path.join(destination_folder, file_name))

# TODO: Fix such that the landcover images have the correct size

In [17]:
#Resize (not crop) every land-cover image that is not already 8x8; the file is overwritten in place.
for image in os.listdir("landcover_full"):
    if image == ".DS_Store" or image == ".ipynb_checkpoints":
        continue
    image_path = "landcover_full/" + image
    # The context manager closes the file handle again before the file is rewritten.
    with Image.open(image_path) as img:
        if img.size == (8, 8):
            continue
        resized = img.resize((8, 8))
    resized.save(image_path, 'TIFF')


As we have the images locally we can split them into folders. First we create all the directories and then we copy the images into the correct ones, according to the satellite data.

In [18]:
#Create the directory tree for the split land-cover images. os.makedirs also
#creates the missing parents (output_landcover and output_landcover/<split>),
#and exist_ok=True makes re-running the cell harmless.
for leaf_dir in (
    'output_landcover/test/landcover',   #testing
    'output_landcover/train/landcover',  #training
    'output_landcover/val/landcover',    #validation
):
   os.makedirs(leaf_dir, exist_ok=True)

In [19]:
def copyfiles(dir, src_dir, dest_dir):
  '''
  Copy the land-cover image belonging to every file in `dir` into `dest_dir`.

  For each entry of `dir` the part of the name before the first "_" is taken
  as id, and `<id>.tiff` is copied from `src_dir` to `dest_dir`.

  Args:
    dir: Directory whose filenames determine which land-cover images to copy.
      An entry named ".DS_Store" is ignored.
    src_dir: Directory containing the `<id>.tiff` files (with or without a
      trailing slash).
    dest_dir: Existing directory the files are copied into (with or without a
      trailing slash).

  Raises:
    FileNotFoundError: If `src_dir` has no `<id>.tiff` for one of the ids.
  '''
  already_copied = set()
  for file_name in os.listdir(dir):
    if file_name == ".DS_Store":
      continue
    landcover_name = file_name.split("_")[0] + ".tiff"
    #Several files may share one id; copy its land-cover image only once.
    if landcover_name in already_copied:
      continue
    already_copied.add(landcover_name)
    shutil.copy(os.path.join(src_dir, landcover_name),
                os.path.join(dest_dir, landcover_name))
  
# Mirror the satellite split: for every split copy the matching land-cover images.
for satellite_split_dir, landcover_split_dir in (
    ("output/test/satellite_data_v2", "output_landcover/test/landcover/"),
    ("output/train/satellite_data_v2", "output_landcover/train/landcover/"),
    ("output/val/satellite_data_v2", "output_landcover/val/landcover/"),
):
  copyfiles(dir=satellite_split_dir, src_dir="landcover_full/", dest_dir=landcover_split_dir)

Lastly we can create the `DataFrames`

In [20]:
# Label file and the three folders holding the split land-cover images.
label_dir_landcover = '/content/drive/MyDrive/malaria_project/long_lat_year_with_confidential_from2010to18_size10orGreater.csv'
train_data_dir_landcover = 'output_landcover/train/landcover'
test_data_dir_landcover = 'output_landcover/test/landcover'
val_data_dir_landcover = 'output_landcover/val/landcover'

# ending=False: the id of a land-cover file is the part of its name before the first ".".
train_landcover_df, test_landcover_df, val_landcover_df = (
    createDataFrame(label_dir=label_dir_landcover, data_dir=split_dir, ending=False)
    for split_dir in (train_data_dir_landcover, test_data_dir_landcover, val_data_dir_landcover)
)

## Constructing the `ImageDataGenerators`

In [22]:
# Both generators read (filename, label) pairs from their DataFrame and yield the
# label column as the regression target. class_mode="raw" is the current name of
# the deprecated "other"; the deprecated `has_ext` argument is no longer passed.
# NOTE: each generator shuffles on its own, so they must be combined by a wrapper
# that keeps the satellite and land-cover batches paired.

#Satellite
imgen_sat = ImageDataGenerator()
train_generator_sat = imgen_sat.flow_from_dataframe(
    dataframe=train_df,
    directory=train_data_dir,
    x_col="filename",
    y_col="label",
    shuffle=True,
    batch_size=BATCH_SIZE,
    class_mode="raw",
    target_size=(1024, 1024))
# TODO: build the matching validation generator from val_df / val_data_dir.

#Landcover
imgen_lc = ImageDataGenerator()
train_generator_lc = imgen_lc.flow_from_dataframe(
    dataframe=train_landcover_df,
    directory=train_data_dir_landcover,
    x_col="filename",
    y_col="label",
    shuffle=True,
    batch_size=BATCH_SIZE,
    class_mode="raw",
    target_size=(8, 8))

Found 3593 validated image filenames.
Found 3593 validated image filenames.


In [24]:
class JoinedGen(tensorflow.keras.utils.Sequence):
    '''
    Joins two Keras image iterators into one two-input Sequence.

    Every batch has the form `([x1, x2], y1)`, where `x1` and `y1` come from
    the first iterator and `x2` from the second. Both iterators are forced to
    use the same sample order, so the i-th sample of `x1` and of `x2` always
    stem from the same row index of their respective DataFrames. This requires
    the two DataFrames to list the samples in the same order.

    Args:
        input_gen1: Iterator returned by `ImageDataGenerator.flow_from_dataframe`;
            provides the first input and the labels.
        input_gen2: Iterator returned by `ImageDataGenerator.flow_from_dataframe`;
            provides the second input. Must have the same number of batches
            as `input_gen1`.
    '''
    def __init__(self, input_gen1, input_gen2):
        super().__init__()
        assert len(input_gen1) == len(input_gen2)

        self.gen1 = input_gen1
        self.gen2 = input_gen2

    def __len__(self):
        '''Number of batches per epoch.'''
        return len(self.gen1)

    def __getitem__(self, i):
        '''Return batch `i` as `([x1, x2], y1)`.'''
        x1, y1 = self.gen1[i]
        # Each iterator draws its own shuffle order. Copy the order of gen1 to
        # gen2 before every read, so the pairing already holds in the very
        # first epoch and not only after the first on_epoch_end().
        self.gen2.index_array = self.gen1.index_array
        x2, y2 = self.gen2[i]

        return [x1, x2], y1 #as the labels are always the same, we can simply output the first.

    def on_epoch_end(self):
        '''Reshuffle both iterators and re-align the order of the second with the first.'''
        self.gen1.on_epoch_end()
        self.gen2.on_epoch_end()
        self.gen2.index_array = self.gen1.index_array #keeps both inputs in the same shuffled order


## Constructing the model

In [26]:
# Toy model: pool the satellite image down to the 8x8 land-cover grid, join both
# inputs along the last axis and regress a single value from the flattened result.
sat_input = keras.Input(shape=(1024, 1024, 3))
lc_input = keras.Input(shape=(8, 8, 3))

pooled_sat = MaxPooling2D((128, 128))(sat_input)
merged = concatenate([pooled_sat, lc_input])
x = Dense(1, activation="linear")(Flatten()(merged))

new_simple_model = Model(inputs=[sat_input, lc_input], outputs=x)
new_simple_model.compile(optimizer='adam', loss='mse')
summary = new_simple_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 1024, 1024,  0           []                               
                                 3)]                                                              
                                                                                                  
 max_pooling2d_1 (MaxPooling2D)  (None, 8, 8, 3)     0           ['input_3[0][0]']                
                                                                                                  
 input_4 (InputLayer)           [(None, 8, 8, 3)]    0           []                               
                                                                                                  
 concatenate_1 (Concatenate)    (None, 8, 8, 6)      0           ['max_pooling2d_1[0][0]',  

In [29]:
my_gen = JoinedGen(train_generator_sat, train_generator_lc)
# my_gen already delivers batches of BATCH_SIZE samples, so `batch_size` is not
# passed to fit(). `steps_per_epoch` is left unset as well: an epoch then covers
# all len(my_gen) batches, including the final, smaller one.
# TODO: add validation_data and callbacks once the validation generators exist.
vgg_history = new_simple_model.fit(my_gen,
                            epochs=EPOCHS,
                            verbose=1)

Epoch 1/5

KeyboardInterrupt: ignored