## Quick, Draw! Doodle Recognition

#### _ 2019 Winter Coding _ 이기인

1. 프레임워크
   - Keras를 사용했습니다.
   - Tensorflow 역시 사용할 줄 알지만, Keras가 모델 구축에 훨씬 편리하기 때문에 짧은 시간 내에 모델을 테스트하는데 적합하다고 생각했습니다.
2. 데이터
   - 학습 데이터셋은 총 340개의 클래스를 가지고 있고, 각 클래스별로 10,000개의 데이터를 가지고 있습니다.
   - Overfitting을 방지하고 학습데이터를 최대한 활용하기 위해서 클래스별 CSV 파일들을 Shuffle해서 사용했고, 이를 Training에 사용할 때는 Cross-Validation 방식을 적용했습니다.
   - 하드웨어의 제한으로 학습에 simplified dataset을 사용했습니다.
   - 데이터에는 stroke가 기록되어있는데, 학습에 사용하기 위해서 3가지 encoding을 바탕으로 전처리를 했습니다.
     1. stroke 여부 기준 : stroke가 된 부분은 255, 아니면 0
     2. stroke 시간 기준 : 보통 윤곽을 먼저 그리고 디테일한 부분을 나중에 그립니다. 그래서 첫 번째 stroke에 255를 주고, 다음 stroke부터는 값이 125가 될 때까지 13씩 감소시켰습니다.
     3. 각 stroke에서의 시간 기준 : 각 stroke에서 point가 찍힌 시간을 기준으로 가중치를 준다면, stroke의 방향을 알 수 있습니다. 첫번째 point는 255를, 그리고 그 다음부터 20이 될 때까지 감소시켰습니다.
3. 모델
   - keras.applications 패키지의 MobileNet을 사용했습니다.
   - 캐글 커널에서만 작업해야하는 조건이었고, 최대한 가벼우면서 성능이 좋은 모델을 선택해야했고, MobileNet과 ResNet18이 후보였습니다. 일주일간 튜닝을 하면서 학습해본 결과 ResNet18보다는 MobileNet의 성능이 높게 나와서 최종적으로 MobileNet을 사용했습니다.

## 📑 Import Libraries

In [None]:
import numpy as np
import pandas as pd
import cv2

import json
import datetime as dt
from tqdm import tqdm

import ast
import math
from glob import glob
import glob
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from multiprocessing.dummy import Pool
from keras.models import load_model
import time
import keras
import random

from skimage.draw import draw
import matplotlib.pyplot as plt
import matplotlib.style as style
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
# Print the contents of the saved-model input directory and of the working
# directory so the notebook output shows which files are available.
print(os.listdir("../input/mobilenetfile"))
print(os.listdir("./"))

## 📥 Shuffle CSVs and Load Data
### 1. Shuffle CSVs
<a href="https://kaggle.com/gaborfodor/shuffle-csvs">beluga</a> 님의 코드를 참고했습니다.

In [None]:
def f2cat(filename: str) -> str:
    """Return the part of *filename* that precedes its first dot.

    Used to turn a per-category file name such as ``"cat.csv"`` into the
    category name ``"cat"``. A name without a dot is returned unchanged.
    """
    category, _, _ = filename.partition('.')
    return category

class Simplified:
    """Reader for the per-category CSV files in ``<input_path>/train_simplified``."""

    def __init__(self, input_path='./input'):
        # Directory that contains the ``train_simplified`` folder.
        self.input_path = input_path

    def list_all_categories(self):
        """Return every category name, sorted case-insensitively.

        A category name is the file name in ``train_simplified`` up to its
        first dot.
        """
        train_dir = os.path.join(self.input_path, 'train_simplified')
        names = [f2cat(entry) for entry in os.listdir(train_dir)]
        names.sort(key=str.lower)
        return names

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False):
        """Load ``<category>.csv`` from ``train_simplified`` as a DataFrame.

        Args:
            category: Category name; ``'.csv'`` is appended to build the file name.
            nrows: Forwarded to ``pd.read_csv`` to limit the number of rows read.
            usecols: Forwarded to ``pd.read_csv`` to select columns.
            drawing_transform: When true, the ``drawing`` column is decoded
                from its JSON string form with ``json.loads``.

        Returns:
            The DataFrame, with the ``timestamp`` column parsed as dates.
        """
        csv_path = os.path.join(self.input_path, 'train_simplified', category + '.csv')
        frame = pd.read_csv(csv_path, nrows=nrows, parse_dates=['timestamp'], usecols=usecols)
        if not drawing_transform:
            return frame
        frame['drawing'] = frame['drawing'].apply(json.loads)
        return frame

In [None]:
# If the shuffled CSVs have not been created yet, uncomment and run the code below.

# PATH = '../input/quickdraw-doodle-recognition'

# start = dt.datetime.now()
# s = Simplified(PATH)
# NCSVS = 100
# categories = s.list_all_categories()
# print(len(categories))

# for y, cat in tqdm(enumerate(categories)):
#     df = s.read_training_csv(cat, nrows=30000)
#     df['y'] = y
#     df['cv'] = (df.key_id // 10 ** 7) % NCSVS
#     for k in range(NCSVS):
#         filename = 'train_k{}.csv'.format(k)
#         chunk = df[df.cv == k]
#         chunk = chunk.drop(['key_id'], axis=1)
#         if y == 0:
#             chunk.to_csv(filename, index=False)
#         else:
#             chunk.to_csv(filename, mode='a', header=False, index=False)

# for k in tqdm(range(NCSVS)):
#     filename = 'train_k{}.csv'.format(k)
#     if os.path.exists(filename):
#         df = pd.read_csv(filename)
#         df['rnd'] = np.random.rand(len(df))
#         df = df.sort_values(by='rnd').drop('rnd', axis=1)
#         df.to_csv(filename + '.gz', compression='gzip', index=False)
#         os.remove(filename)
# print(df.shape)

# end = dt.datetime.now()
# print('Latest run {}.\nTotal time {}s'.format(end, (end - start).seconds))

### 2. Load Data
<a href="https://www.kaggle.com/echomil/mobilenet-126x126x3-100k-per-class">Pawel Mieloch</a> 님의 코드를 참고했습니다.

In [None]:
# Root directory of the competition data; the code below reads
# `train_simplified/` and `test_simplified.csv` from it.
INPUT_DIR = '../input/quickdraw-doodle-recognition/'
# Side length in pixels of the square canvas that draw_cv2 draws strokes on
# before any resizing.
BASE_SIZE = 256

# Added for cross-validation.
def split_train_val(pattern='../input/shuffle-csvs/*.csv.gz'):
    """Split the shuffled CSV files into training files and one validation file.

    The validation file is drawn at random from the files that actually match
    *pattern*. The draw uses the ``random`` module and happens before NumPy is
    seeded, so it is not made reproducible by the seed call below.

    Args:
        pattern: Glob pattern for the shuffled CSV files.

    Returns:
        A tuple ``(ALL_FILES, VALIDATION_FILE)``: the list of training file
        paths (validation file removed) and the validation file path.

    Raises:
        ValueError: If fewer than two files match *pattern*, since both a
            validation file and at least one training file are needed.
    """
    ALL_FILES = glob.glob(pattern)
    if len(ALL_FILES) < 2:
        raise ValueError(
            'need at least two files matching {!r} to split train/validation, '
            'found {}'.format(pattern, len(ALL_FILES)))
    # Choosing from the globbed list guarantees the validation file exists,
    # whatever the number of shuffled files is.
    VALIDATION_FILE = random.choice(ALL_FILES)
    ALL_FILES.remove(VALIDATION_FILE)
    # Seed NumPy's global RNG for whatever runs after the split.
    np.random.seed(seed=1987)
    return ALL_FILES, VALIDATION_FILE


def apk(actual, predicted, k=3):
    """Compute the average precision at k for a single sample.

    Only the first *k* predictions are considered. A prediction scores when
    it is in *actual* and has not already appeared earlier in the list.

    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

    Args:
        actual: Collection of correct labels.
        predicted: Ordered list of predicted labels, best first.
        k: Maximum number of predictions to evaluate.

    Returns:
        The average precision at k, or 0.0 when *actual* is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    total = 0.0
    for rank, guess in enumerate(top_k, start=1):
        # Skip misses and repeated guesses; repeats must not score twice.
        if guess not in actual or guess in top_k[:rank - 1]:
            continue
        hits += 1.0
        total += hits / rank

    if not actual:
        return 0.0

    return total / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """Compute the mean average precision at k over paired samples.

    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

    Args:
        actual: Sequence with the correct labels of each sample.
        predicted: Sequence with the ordered predictions of each sample.
        k: Maximum number of predictions evaluated per sample.

    Returns:
        The mean of ``apk`` over the ``zip`` of *actual* and *predicted*.
    """
    per_sample = [
        apk(truth, guesses, k)
        for truth, guesses in zip(actual, predicted)
    ]
    return np.mean(per_sample)


def preds2catids(predictions):
    """Return the indices of the three highest scores in each row.

    Args:
        predictions: 2-D array of scores, one row per sample.

    Returns:
        A DataFrame with columns ``a``, ``b``, ``c`` that hold the column
        indices of the best, second-best and third-best score of each row.
    """
    # Negating the scores makes the ascending argsort rank them descending.
    ranked = np.argsort(-predictions, axis=1)
    top_three = ranked[:, :3]
    return pd.DataFrame(top_three, columns=['a', 'b', 'c'])

def f2cat(filename: str) -> str:
    """Strip everything from the first dot onward, e.g. ``"cat.csv"`` -> ``"cat"``."""
    return filename.split('.', 1)[0]

def list_all_categories():
    """Return the category names found in ``INPUT_DIR/train_simplified``.

    Each file name is reduced to the part before its first dot, and the
    result is sorted case-insensitively.
    """
    train_dir = os.path.join(INPUT_DIR, 'train_simplified')
    categories = [f2cat(entry) for entry in os.listdir(train_dir)]
    categories.sort(key=str.lower)
    return categories


def plot_batch(x):
    """Plot up to the first six images of a batch, one image per row.

    Each row shows the three channels of an image as separate grayscale
    panels, followed by the full 3-channel image in the fourth column.

    Args:
        x: Batch of images indexed as ``x[image, row, col, channel]`` with at
            least three channels.

    Raises:
        ValueError: If the batch contains no images.
    """
    cols = 4
    # Never ask for more rows than there are images; the fixed 6 rows used to
    # raise IndexError on smaller batches.
    rows = min(6, len(x))
    if rows == 0:
        raise ValueError('plot_batch needs at least one image to plot')
    # squeeze=False keeps `axs` two-dimensional even when there is one row.
    fig, axs = plt.subplots(nrows=rows, ncols=cols, sharex=True, sharey=True,
                            figsize=(18, 18), squeeze=False)
    for i in range(rows):
        for k in range(cols - 1):
            ax = axs[i, k]
            ax.imshow(x[i, :, :, k], cmap=plt.cm.gray)
            ax.axis('off')
        # Last column: all channels combined into one colour image.
        ax = axs[i, cols - 1]
        ax.imshow(x[i, :, :])
        ax.axis('off')
    fig.tight_layout()
    plt.show()

## 💻 Predictive Modeling
### 1. Learning and data hyperparameters

In [None]:
# --- Training schedule ---
EPOCHS = 10            # number of epochs passed to fit_generator
STEPS = 200            # generator batches per epoch (steps_per_epoch)
BATCH_SIZE = 400       # samples per generated batch
LEARNING_RATE = 0.002  # learning rate given to the Adam optimizer

# --- Data ---
NCATS = 340            # number of classes used for one-hot encoding the labels
AUGMENTATION = True    # enables the random horizontal flip in the training generator

# --- Image geometry ---
IMG_SIZE = 128
IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)

### 2. Image Encoding

In [None]:
def draw_cv2(raw_strokes, size=256, lw=6, augmentation = False):
    """Render a list of strokes into a 3-channel uint8 image.

    The strokes are drawn on a ``BASE_SIZE`` x ``BASE_SIZE`` canvas. Each
    channel encodes different information about a line segment:

    * channel 0: 255 wherever a stroke was drawn;
    * channel 1: stroke order, 255 for the first stroke and 13 less for each
      later stroke, with a floor of 125 from the eleventh stroke onward;
    * channel 2: position inside the stroke, 255 for the first segment and
      decreasing along the stroke, never below 20.

    Args:
        raw_strokes: Sequence of strokes, where ``stroke[0]`` holds the x
            coordinates and ``stroke[1]`` the y coordinates.
        size: Side length of the returned image; the canvas is resized when
            this differs from ``BASE_SIZE``.
        lw: Line width in pixels passed to ``cv2.line``.
        augmentation: When true, the image is flipped left-right with
            probability 0.5.

    Returns:
        The rendered image as a ``(size, size, 3)`` uint8 array.
    """
    img = np.zeros((BASE_SIZE, BASE_SIZE, 3), np.uint8)
    for t, stroke in enumerate(raw_strokes):
        xs, ys = stroke[0], stroke[1]
        points_count = len(xs) - 1
        if points_count < 1:
            # A stroke with fewer than two points has no segment to draw;
            # skipping it also avoids dividing by zero below.
            continue
        grad = 255 // points_count
        stroke_order = 255 - min(t, 10) * 13
        for i in range(points_count):
            color = (255, stroke_order, max(255 - grad * i, 20))
            cv2.line(img, (xs[i], ys[i]), (xs[i + 1], ys[i + 1]), color, lw)
    if size != BASE_SIZE:
        img = cv2.resize(img, (size, size))
    if augmentation and random.random() > 0.5:
        img = np.fliplr(img)
    return img

### 3. Data generators
Shuffle한 데이터 파일들을 하나로 통합해서 generator에 사용

In [None]:
def image_generator(size, batchsize, lw=6, augmentation = False):
    """Yield training batches forever from the files listed in ``ALL_FILES``.

    Every file is read in chunks of *batchsize* rows. Once all files have
    been consumed, iteration starts again from the first file.

    Args:
        size: Side length of the rendered images.
        batchsize: Number of CSV rows per chunk, which is the batch size.
        lw: Line width forwarded to ``draw_cv2``.
        augmentation: Forwarded to ``draw_cv2``.

    Yields:
        Tuples ``(x, y)``. ``x`` is a float32 array of shape
        ``(n, size, size, 3)`` holding the rendered images divided by 255.
        ``y`` is the ``y`` column encoded by ``keras.utils.to_categorical``
        with ``NCATS`` classes.

    Raises:
        ValueError: On first iteration if ``ALL_FILES`` is empty. Without
            this check the endless loop would spin without yielding.
    """
    if not ALL_FILES:
        raise ValueError('ALL_FILES is empty: image_generator has no training files to read')
    while True:
        for filename in ALL_FILES:
            for df in pd.read_csv(filename, chunksize=batchsize):
                # NOTE(review): eval executes arbitrary expressions found in
                # the CSV. Only use it on trusted files; consider
                # ast.literal_eval or json.loads for the stroke lists.
                df['drawing'] = df['drawing'].apply(eval)
                x = np.zeros((len(df), size, size, 3))
                for row, raw_strokes in enumerate(df.drawing.values):
                    x[row] = draw_cv2(raw_strokes, size=size, lw=lw, augmentation = augmentation)
                # Scale the pixel values to [0, 1] before casting.
                x = (x / 255.).astype(np.float32)
                y = keras.utils.to_categorical(df.y, num_classes=NCATS)
                yield x, y

def valid_generator(valid_df, size, batchsize, lw=6):
    """Yield validation batches forever by cycling over *valid_df*.

    Args:
        valid_df: DataFrame with a ``drawing`` column of already-parsed
            stroke lists and a ``y`` column of class ids.
        size: Side length of the rendered images.
        batchsize: Number of rows per batch; the last batch of a pass may be
            smaller.
        lw: Line width forwarded to ``draw_cv2``.

    Yields:
        Tuples ``(x, y)``. ``x`` is a float32 array of shape
        ``(n, size, size, 3)`` holding the rendered images divided by 255.
        ``y`` is the ``y`` column encoded by ``keras.utils.to_categorical``
        with ``NCATS`` classes.

    Raises:
        ValueError: On first iteration if *valid_df* is empty. Without this
            check the endless loop would spin without yielding.
    """
    if len(valid_df) == 0:
        raise ValueError('valid_df is empty: valid_generator has nothing to yield')
    while True:
        for start in range(0, len(valid_df), batchsize):
            chunk = valid_df[start:start + batchsize]
            x = np.zeros((len(chunk), size, size, 3))
            for row, raw_strokes in enumerate(chunk.drawing.values):
                x[row] = draw_cv2(raw_strokes, size=size, lw=lw)
            # Scale the pixel values to [0, 1] before casting.
            x = (x / 255.).astype(np.float32)
            y = keras.utils.to_categorical(chunk.y, num_classes=NCATS)
            yield x, y
        
def test_generator(test_df, size, batchsize, lw=6):
    """Yield image batches for prediction, making one pass over *test_df*.

    Args:
        test_df: DataFrame with a ``drawing`` column of already-parsed
            stroke lists.
        size: Side length of the rendered images.
        batchsize: Number of rows per batch; the last batch may be smaller.
        lw: Line width forwarded to ``draw_cv2``.

    Yields:
        float32 arrays of shape ``(n, size, size, 3)`` holding the rendered
        images divided by 255.
    """
    total = len(test_df)
    for start in range(0, total, batchsize):
        batch_df = test_df[start:start + batchsize]
        drawings = batch_df.drawing.values
        images = np.zeros((len(batch_df), size, size, 3))
        for row, raw_strokes in enumerate(drawings):
            images[row] = draw_cv2(raw_strokes, size=size, lw=lw)
        images = images / 255.
        yield images.astype(np.float32)
        

# Hold one shuffled CSV out for validation and train on the remaining files.
ALL_FILES, VALIDATION_FILE = split_train_val()
# Endless training generator; it reads the module-level ALL_FILES assigned above.
train_datagen = image_generator(size=IMG_SIZE, batchsize=BATCH_SIZE, augmentation = AUGMENTATION)

# The validation file is loaded fully into memory and its stroke strings are
# parsed once here, so valid_generator receives ready-to-draw lists.
valid_df = pd.read_csv(VALIDATION_FILE)
# NOTE(review): eval executes arbitrary expressions found in the CSV; only
# use it on trusted files.
valid_df['drawing'] = valid_df['drawing'].apply(eval)
# Floor division: a trailing partial batch is not counted as a validation step.
validation_steps = len(valid_df)//BATCH_SIZE
valid_datagen = valid_generator(valid_df, size=IMG_SIZE, batchsize=BATCH_SIZE)

### 4. Visualization of image encoding

In [None]:
# Select the validation rows whose class id is 2 and plot the first batch of
# their encoded images to inspect the three-channel encoding.
single_class_df = valid_df[valid_df['y'] == 2]
single_class_gen = valid_generator(single_class_df, size=IMG_SIZE, batchsize=BATCH_SIZE)
# Only the images are plotted; the labels `y` are not used afterwards.
x, y = next(single_class_gen)
plot_batch(x)

### 5. Model definition

In [None]:
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.optimizers import Adam
from keras.applications.mobilenet import MobileNet
from keras.applications.mobilenet import preprocess_input
from keras.models import load_model

def top_3_accuracy(y_true, y_pred):
    """Keras metric: top-k categorical accuracy with ``k`` fixed to 3.

    Args:
        y_true: Ground-truth labels, as passed by Keras to a metric function.
        y_pred: Model predictions, as passed by Keras to a metric function.

    Returns:
        The result of ``keras.metrics.top_k_categorical_accuracy`` for ``k=3``.
    """
    top_k_metric = keras.metrics.top_k_categorical_accuracy
    return top_k_metric(y_true, y_pred, k=3)

# Multiply the learning rate by 0.5 when val_loss stops improving (patience of 5 epochs).
reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
# Save the model to mobileNet_ckpt.hdf5, keeping only the best one according
# to the callback's monitored quantity (left at the Keras default here).
checkpointer = ModelCheckpoint(filepath='mobileNet_ckpt.hdf5', verbose=2, save_best_only=True)
# Continue from a previously saved model. The custom metric has to be passed
# in custom_objects so the saved model can be deserialized.
model = load_model('../input/mobilenetfile/mobileNet.hdf5', custom_objects = {'top_3_accuracy':top_3_accuracy})
# Recompile with a new Adam optimizer that starts at LEARNING_RATE.
opt = Adam(lr = LEARNING_RATE)
model.compile(optimizer = opt, loss = 'categorical_crossentropy', metrics = ['accuracy', top_3_accuracy])
model.summary()

### 6. Training

In [None]:
# Train for EPOCHS epochs of STEPS generator batches each. With the values
# defined above, one run consumes 10 * 200 * 400 = 800,000 training samples.
# The held-out file is evaluated for validation_steps batches during training.
history = model.fit_generator(train_datagen,
                              steps_per_epoch=STEPS,
                              epochs=EPOCHS,
                              verbose=2,
                              validation_data=valid_datagen,
                              validation_steps=validation_steps,
                              callbacks=[checkpointer,reducer])
# Save the model as it is after the last epoch. The checkpoint callback
# writes its own separate file, mobileNet_ckpt.hdf5.
model.save('mobileNet.hdf5')

## 💻 Result

In [None]:
# Load the test drawings and parse their stroke strings.
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'test_simplified.csv'))
# NOTE(review): eval executes arbitrary expressions found in the CSV; only
# use it on trusted files.
submission_df['drawing'] = submission_df['drawing'].apply(eval)
submission_datagen = test_generator(submission_df, size=IMG_SIZE, batchsize=BATCH_SIZE)
# ceil() makes the step count include the final, possibly smaller, batch.
submission_predictions = model.predict_generator(submission_datagen, math.ceil(len(submission_df)/BATCH_SIZE))
# Map class ids to category names, with spaces replaced by underscores.
cats = list_all_categories()
id2cat = {k: cat.replace(' ', '_') for k, cat in enumerate(cats)}
# Take the three highest-scoring class ids per row and translate them to names.
top3 = preds2catids(submission_predictions)
top3cats = top3.replace(id2cat)
# The `word` column holds the three guesses separated by single spaces, best first.
submission_df['word'] = top3cats['a'] + ' ' + top3cats['b'] + ' ' + top3cats['c']
submission = submission_df[['key_id', 'word']]
submission.to_csv('submission.csv', index=False)