## **Data Understanding & Preparation**

---



In [1]:
# Import the relevant libraries
import numpy as np
import matplotlib.pyplot as plt 
import os
import cv2
import random
from google.colab import files    
import time
from sklearn.model_selection import train_test_split
import pickle
import tensorflow as tf
from keras.utils.np_utils import to_categorical 
from PIL import Image, ImageStat
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Table of contents

[Data Understanding](#Data_Understanding)

[Data Preparation](#Data_Preparation)

### Data Understanding

---


<a id='Data_Understanding'></a>

**Data import**

In [2]:
# Root folders of the training and test images. Each root is expected to contain
# one sub-folder per entry of CATEGORIES, named exactly like that entry.
DATADIR_Train = "/content/drive/MyDrive/Images/Train/"
DATADIR_Test = "/content/drive/MyDrive/Images/Test/"

# Class names; the position of a name in this list is used as its numeric label.
CATEGORIES = [
    "Tel-Aviv",
    "WestJerusalem",
    "Berlin",
    "Hamburg",
]

In [4]:
# Only needed on Google Colab when the image data is stored in Google Drive.
# Mounting at /content/drive makes the DATADIR_* paths defined above resolvable,
# so this cell has to run before any cell that reads from or writes to Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Target edge length in pixels: every image is resized to IMAGE_SIZE x IMAGE_SIZE.
IMAGE_SIZE = 224

### Data Preparation

---


<a id='Data_Preparation'></a>

Import all images and resize them


### Create training data

In [7]:
training_data = []


def create_training_data():
    """Load all training images into the global ``training_data`` list.

    For every entry of ``CATEGORIES`` the folder ``DATADIR_Train/<category>``
    is scanned. Each readable image is converted from OpenCV's BGR channel
    order to RGB, resized to ``IMAGE_SIZE`` x ``IMAGE_SIZE`` and appended as
    ``[image, class_num]``, where ``class_num`` is the category's position in
    ``CATEGORIES``.

    The list is emptied first, so calling the function again does not
    duplicate samples. Files that OpenCV cannot decode are reported and
    skipped instead of aborting the whole import.
    """
    training_data.clear()
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(DATADIR_Train, category)  # folder of this category
        # Sorted so the load order does not depend on the file system.
        for img_name in sorted(os.listdir(path)):
            img_path = os.path.join(path, img_name)
            # IMREAD_COLOR always returns a 3-channel BGR image, so grayscale
            # files do not break the BGR->RGB conversion below.
            img_bgr = cv2.imread(img_path, cv2.IMREAD_COLOR)
            if img_bgr is None:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Skipping unreadable file: {img_path}")
                continue
            img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
            new_array = cv2.resize(img_rgb, (IMAGE_SIZE, IMAGE_SIZE))
            training_data.append([new_array, class_num])


In [8]:
test_data = []


def create_test_data():
    """Load all test images into the global ``test_data`` list.

    For every entry of ``CATEGORIES`` the folder ``DATADIR_Test/<category>``
    is scanned. Each readable image is converted from OpenCV's BGR channel
    order to RGB, resized to ``IMAGE_SIZE`` x ``IMAGE_SIZE`` and appended as
    ``[image, class_num]``, where ``class_num`` is the category's position in
    ``CATEGORIES``.

    The list is emptied first, so calling the function again does not
    duplicate samples. Files that OpenCV cannot decode are reported and
    skipped instead of aborting the whole import.
    """
    test_data.clear()
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(DATADIR_Test, category)  # folder of this category
        # Sorted so the load order does not depend on the file system.
        for img_name in sorted(os.listdir(path)):
            img_path = os.path.join(path, img_name)
            # IMREAD_COLOR always returns a 3-channel BGR image, so grayscale
            # files do not break the BGR->RGB conversion below.
            img_bgr = cv2.imread(img_path, cv2.IMREAD_COLOR)
            if img_bgr is None:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Skipping unreadable file: {img_path}")
                continue
            img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
            new_array = cv2.resize(img_rgb, (IMAGE_SIZE, IMAGE_SIZE))
            test_data.append([new_array, class_num])

In [9]:
# Fill the global training_data / test_data lists.
create_training_data()
create_test_data()

# Shuffle so that images with the same label do not all follow one another.
# A dedicated, seeded generator makes the permutation repeatable for an
# identically ordered input and leaves the global `random` state untouched.
shuffle_rng = random.Random(42)
shuffle_rng.shuffle(training_data)
shuffle_rng.shuffle(test_data)

In [10]:
# Split the [image, label] pairs into separate feature and label sequences.
# IMAGE_SIZE is taken from the configuration cell instead of being redefined.
x_train = [features for features, _ in training_data]
y_train = [label for _, label in training_data]

# TensorFlow needs NumPy arrays: images as (N, IMAGE_SIZE, IMAGE_SIZE, 3).
x_train = np.array(x_train).reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 3)
# Fix the one-hot width to the number of categories so the label matrix has the
# same number of columns even if a category has no samples in this split.
y_train = to_categorical(y_train, num_classes=len(CATEGORIES))

In [11]:
# Split the [image, label] pairs into separate feature and label sequences.
# IMAGE_SIZE is taken from the configuration cell instead of being redefined.
x_test = [features for features, _ in test_data]
y_test = [label for _, label in test_data]

# TensorFlow needs NumPy arrays: images as (N, IMAGE_SIZE, IMAGE_SIZE, 3).
x_test = np.array(x_test).reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 3)
# Fix the one-hot width to the number of categories so the label matrix has the
# same number of columns even if a category has no samples in this split.
y_test = to_categorical(y_test, num_classes=len(CATEGORIES))

Data normalization

In [14]:
# Scale 8-bit pixel values from [0, 255] to [-1, 1].
# The integer-dtype guard makes the cell idempotent: once normalised, the array
# is floating point and a re-run leaves it untouched instead of scaling twice.
# float32 halves the memory footprint compared with the implicit float64.
if np.issubdtype(x_train.dtype, np.integer):
    x_train = x_train.astype(np.float32) / 127.5 - 1.0


In [12]:
# Scale 8-bit pixel values from [0, 255] to [-1, 1].
# The integer-dtype guard makes the cell idempotent: once normalised, the array
# is floating point and a re-run leaves it untouched instead of scaling twice.
# float32 halves the memory footprint compared with the implicit float64.
if np.issubdtype(x_test.dtype, np.integer):
    x_test = x_test.astype(np.float32) / 127.5 - 1.0

Save the data so that the different models can use it

In [None]:
# Persist the prepared training arrays so the model notebooks can load them.
for target_path, payload in (
    ("/content/drive/MyDrive/data/x_train.pickle", x_train),
    ("/content/drive/MyDrive/data/y_train.pickle", y_train),
):
    # Create the target folder on first use instead of failing in open().
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    # `with` closes the file even if pickling raises.
    with open(target_path, "wb") as pickle_out:
        # Protocol 4 supports very large objects; use it for both files.
        pickle.dump(payload, pickle_out, protocol=4)

In [13]:
# Persist the prepared test arrays so the model notebooks can load them.
for target_path, payload in (
    ("/content/drive/MyDrive/data/x_test.pickle", x_test),
    ("/content/drive/MyDrive/data/y_test.pickle", y_test),
):
    # Create the target folder on first use instead of failing in open().
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    # `with` closes the file even if pickling raises.
    with open(target_path, "wb") as pickle_out:
        # Protocol 4 supports very large objects; use it for both files.
        pickle.dump(payload, pickle_out, protocol=4)