### Imports

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import image
import collections
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import multilabel_confusion_matrix
#import keras
#from keras.models import Sequential
#from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Input
#from keras.utils import to_categorical
#from keras.preprocessing import image

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Conv2D, Activation, Flatten, Dense, MaxPooling2D, BatchNormalization, Dropout

import PIL
from copy import deepcopy

### Loading in data, train and test sets

In [None]:
# Load the pre-processed train and test frames from their pickle files (train first, then test).
# NOTE(review): unpickling can execute arbitrary code, so only load files produced by this project.
train, test = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "./data/processed/train_less_labels_5.pkl",
        "./data/processed/test_cleaned.pkl",
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

### How to process images?

In [None]:
# Inputs are the per-row image arrays; targets are every column that remains once the
# id, labels and image columns are removed from the training frame.
train_X = train["image_data"]
test_X = test["image_data"]
train_y = train.drop(columns=["image_id", "labels", "image_data"])

The images come in different sizes, so first inspect the range of widths and heights.

In [None]:
# Collect the height and width of every training image, then report the extremes and averages.
widths = []
heights = []
for img in train_X:
    h, w = img.shape[:2]  # image arrays are laid out as (height, width, ...)
    heights.append(h)
    widths.append(w)

# Min/max are taken over the complete lists so that width and height are always tracked
# independently of each other (one image may set a new record on both axes at once).
min_width, max_width = min(widths), max(widths)
min_height, max_height = min(heights), max(heights)

print("min w-h", min_width, min_height)
print("max w-h", max_width, max_height)
print(f"Avg w {sum(widths) / len(widths)} and h {sum(heights) / len(heights)}.")

In [None]:
# Sanity check on how NumPy reports shapes: a toy array with 3 rows and 2 columns is shown
# next to the first image's shape. Both are returned as one tuple so that neither result is
# silently discarded (a cell only displays its last expression).
a = np.array([[2, 3], [4, 4], [2, 3]])
(a.shape, train_X.iloc[0].shape)

In [None]:
# Display the first training image. `.iloc[0]` selects by position, so this works whatever
# index labels the pickled frame carries.
img = PIL.Image.fromarray(train_X.iloc[0])
img

In [None]:
# Centre-crop the first image to 400x284 (width x height) to see how much a fixed crop keeps.
a = deepcopy(train_X.iloc[0])
h_centre = a.shape[0] // 2  # axis 0 is height (rows)
w_centre = a.shape[1] // 2  # axis 1 is width (columns)
print(w_centre, h_centre)
# Rows are sliced around the height centre and columns around the width centre.
# max(..., 0) stops a negative start index from wrapping around on images smaller than the crop.
top = max(h_centre - 142, 0)
left = max(w_centre - 200, 0)
ab = a[top:h_centre + 142, left:w_centre + 200]
im = PIL.Image.fromarray(ab)
im

Cropping all images to the smallest image size loses a lot of information.

Instead, resize every image to the (approximate) average size of 572 × 432 pixels (width × height).

In [None]:
# Resize the first image to 572x432 (width x height) with Pillow's default resampling filter.
# `.iloc[0]` picks the first row by position rather than by index label.
im = PIL.Image.fromarray(train_X.iloc[0])
img_resized = im.resize((572, 432))
img_resized

In [None]:
# Same resize to 572x432 (width x height), this time with the LANCZOS filter for comparison.
# `.iloc[0]` picks the first row by position rather than by index label.
im = PIL.Image.fromarray(train_X.iloc[0])
img_resized = im.resize((572, 432), PIL.Image.LANCZOS)
img_resized

In [None]:
# Same resize to 572x432 (width x height), this time with the BICUBIC filter for comparison.
# `.iloc[0]` picks the first row by position rather than by index label.
im = PIL.Image.fromarray(train_X.iloc[0])
img_resized = im.resize((572, 432), PIL.Image.BICUBIC)
img_resized

In [None]:
# Bring every training image to the common 572x432 (width x height) size using bicubic
# resampling and collect the results as NumPy arrays. A grayscale conversion
# (`img_resized.convert("L")`) is not applied here.
train_X_resized = []
for pixels in train_X:
    im = PIL.Image.fromarray(pixels)
    img_resized = im.resize((572, 432), PIL.Image.BICUBIC)
    train_X_resized.append(np.array(img_resized))

In [None]:
# Turn the list of equally sized images into a single array, then render the first entry
# again as a visual check that the resize step produced a sensible picture.
train_X_resized = np.asarray(train_X_resized)
im = PIL.Image.fromarray(train_X_resized[0])
im

In [None]:
# Scale pixel values by 1/255 (assumes 8-bit pixel data — TODO confirm). Casting to float32
# first keeps the result in float32 instead of NumPy's default float64, which halves the
# memory needed for the full image tensor.
train_X_normalized = train_X_resized.astype("float32") / 255
train_X_normalized[0].shape

In [None]:
#train_X_reshaped = train_X_normalized.reshape(-1, 432, 572, 1)
#train_X_reshaped[0].shape

### Models

In [None]:
# Label matrix as a float64 NumPy array; the displayed value is the number of entries in the
# first row, i.e. the number of label columns.
train_y_np = train_y.to_numpy(dtype="float64")
train_y_np[0].size

In [None]:
# Hold out 20% of the labelled training data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_X_normalized,
    train_y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# CNN for multi-label classification: three conv/max-pool stages followed by two dense layers.
# The input shape and the number of outputs are read from the data, so the network keeps
# matching if the resize target or the set of label columns changes.
n_labels = train_y.shape[1]

model = Sequential([
    Input(shape=train_X_normalized.shape[1:]),  # (height, width, channels) of the resized images
    Conv2D(filters=16, kernel_size=(5, 5), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # No dropout after the second and third conv stages.
    Conv2D(filters=32, kernel_size=(5, 5), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(filters=64, kernel_size=(5, 5), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(0.5),
    Dense(128, activation="relu"),
    Dropout(0.5),
    # One sigmoid unit per label column: each label gets its own independent score.
    Dense(n_labels, activation="sigmoid"),
])

In [None]:
# Print the layer-by-layer overview of the network defined above.
model.summary()

**TODO:** the F1 metric is still missing; for now only accuracy is tracked.

In [None]:
# Binary cross-entropy scores every output unit as its own yes/no decision.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs in mini-batches of 16, validating on the held-out split after each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=20,
    validation_data=(X_test, y_test),
)

In [None]:
# Loss and accuracy on the held-out 20% split of the training data. This is the same data that
# was passed as validation_data during fit, not the separate `test` frame loaded at the top.
model.evaluate(X_test, y_test)

In [None]:
# Per-label scores for the held-out split; the final layer is a sigmoid, so each score lies between 0 and 1.
preds = model.predict(X_test)
# NOTE(review): multilabel_confusion_matrix presumably expects binary label indicators, so these
# scores would have to be thresholded before the line below can be re-enabled — confirm.
#multilabel_confusion_matrix(y_test, preds)
preds

In [None]:
# For every predicted row keep the names of the (up to) three highest-scoring labels,
# best first, dropping any whose score is below the minimum threshold.
import heapq  # NOTE(review): not used in this cell; kept in case other cells rely on it
cols = train_y.columns.tolist()
as_dataframe = pd.DataFrame(preds, columns=cols)
min_treshold = 0.10
top_k = 3
row_results = []

# Iterate over the prediction array directly; building Series objects via iterrows is unnecessary.
for values in preds:
    # argpartition finds the top_k indices without a full sort; they are then ordered by
    # score and reversed so the highest score comes first.
    top3 = np.argpartition(values, -top_k)[-top_k:]
    top3 = np.flip(top3[np.argsort(values[top3])])
    row_results.append([cols[top] for top in top3 if values[top] >= min_treshold])
print(row_results)
