## Shuffle CSV
#### Do not run if you already have the shuffled training files. 

In [1]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import json
import os
import datetime as dt
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
def f2cat(filename: str) -> str:
    return filename.split('.')[0]

class Simplified():
    def __init__(self, input_path='/home/k.vincent/apm_file/'):
        self.input_path = input_path

    def list_all_categories(self):
        files = os.listdir(os.path.join(self.input_path, 'train_simplified'))
        return sorted([f2cat(f) for f in files], key=str.lower)

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False):
        df = pd.read_csv(os.path.join(self.input_path, 'train_simplified', category + '.csv'),
                         nrows=nrows, parse_dates=['timestamp'], usecols=usecols)
        if drawing_transform:
            df['drawing'] = df['drawing'].apply(json.loads)
        return df

In [3]:
start = dt.datetime.now()
s = Simplified('/home/k.vincent/apm_file/')
NCSVS = 100
categories = s.list_all_categories()
print(len(categories))

340


In [4]:
for y, cat in tqdm(enumerate(categories)):
    df = s.read_training_csv(cat, nrows=None)
    df['y'] = y
    df['cv'] = (df.key_id // 10 ** 7) % NCSVS
    for k in range(NCSVS):
        filename = 'train_k{}.csv'.format(k)
        chunk = df[df.cv == k]
        chunk = chunk.drop(['key_id'], axis=1)
        if y == 0:
            chunk.to_csv(filename, index=False)
        else:
            chunk.to_csv(filename, mode='a', header=False, index=False)


340it [23:06,  4.55s/it]


In [5]:
for k in tqdm(range(NCSVS)):
    filename = 'train_k{}.csv'.format(k)
    if os.path.exists(filename):
        df = pd.read_csv(filename)
        df['rnd'] = np.random.rand(len(df))
        df = df.sort_values(by='rnd').drop('rnd', axis=1)
        df.to_csv(filename + '.gz', compression='gzip', index=False)
        os.remove(filename)
print(df.shape)
end = dt.datetime.now()
print('Latest run {}.\nTotal time {}s'.format(end, (end - start).seconds))

100%|██████████| 100/100 [4:48:30<00:00, 173.14s/it] 

(497740, 7)
Latest run 2018-12-02 04:30:47.906755.
Total time 18703s





## Simple Convolutional Neural Network

In [3]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
import ast
import datetime as dt
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 14
import seaborn as sns
import cv2
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

start = dt.datetime.now()

In [2]:
DP_DIR = '/home/k.vincent/'
INPUT_DIR = '/home/k.vincent/apm_file/'
BASE_SIZE = 256
NCSVS = 100
NCATS = 340
np.random.seed(seed=1987)
tf.set_random_seed(seed=1987)
def f2cat(filename: str) -> str:
    return filename.split('.')[0]

def list_all_categories():
    files = os.listdir(os.path.join(INPUT_DIR, 'train_simplified'))
    return sorted([f2cat(f) for f in files], key=str.lower)

In [3]:
def apk(actual, predicted, k=3):
    """
    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])


def preds2catids(predictions):
    return pd.DataFrame(np.argsort(-predictions, axis=1)[:, :3], columns=['a', 'b', 'c'])

In [4]:
def custom_single_cnn(size, conv_layers=(8, 16, 32, 64), dense_layers=(512, 256), conv_dropout=0.2,
                      dense_dropout=0.2):
    model = Sequential()
    
    model.add( Conv2D(conv_layers[0], kernel_size=(3, 3), padding='same', activation='relu', input_shape=(size, size, 3)) )
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    
    
    for conv_layer_size in conv_layers[1:]:
        model.add(Conv2D(conv_layer_size, kernel_size=(3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
        if conv_dropout:
            model.add(Dropout(conv_dropout))

    model.add(Flatten())
    if dense_dropout:
        model.add(Dropout(dense_dropout))

    for dense_layer_size in dense_layers:
        model.add(Dense(dense_layer_size, activation='relu'))
        model.add(Activation('relu'))
        if dense_dropout:
            model.add(Dropout(dense_dropout))

    model.add(Dense(NCATS, activation='softmax'))
    return model

def top_3_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=3)

def top3_acc( tgt, pred ):
    sc = np.mean( (pred[:,0]==tgt) | (pred[:,1]==tgt) | (pred[:,2]==tgt) )
    return sc

In [11]:
bi_factor = 4
STEPS = 500*bi_factor
size = 82
batchsize = 2048/bi_factor

model = custom_single_cnn(size=size,
                          conv_layers=[128, 128, 256],
                          dense_layers=[2048],
                          conv_dropout=False,
                          dense_dropout=0.10 )
model.compile(optimizer=Adam(lr=0.002), loss='categorical_crossentropy',
              metrics=[categorical_crossentropy, categorical_accuracy, top_3_accuracy])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 82, 82, 128)       3584      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 41, 41, 128)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 39, 39, 128)       147584    
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 19, 19, 128)       0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 17, 17, 256)       295168    
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 8, 8, 256)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 16384)             0         
__________

In [12]:
colors = [(255, 0, 0) , (255, 255, 0),  (128, 255, 0),  (0, 255, 0), (0, 255, 128), (0, 255, 255), 
          (0, 128, 255), (0, 0, 255), (128, 0, 255), (255, 0, 255)]
def draw_cv2(raw_strokes, size=256, lw=6):
    img = np.zeros((BASE_SIZE, BASE_SIZE,3), np.uint8)
    for t, stroke in enumerate(raw_strokes):
        for i in range(len(stroke[0]) - 1):
            color = colors[min(t, len(colors)-1)]
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]), (stroke[0][i + 1], stroke[1][i + 1]),/
                         color, lw, lineType=cv2.LINE_AA)
    if np.random.rand()>0.5:
        img = np.fliplr(img)
    if np.random.rand()>0.75:
        if np.random.rand()>0.50:
            img = img[ 4:, 4: ,:]
        else:
            img = img[ :-4, :-4 ,:]
    if np.random.rand()>0.50:
        img2 = cv2.resize(img, (200, 200))
        img = np.zeros((BASE_SIZE, BASE_SIZE,3), np.uint8)
        img[18:218,18:218, :] = img2

    if size != BASE_SIZE:
        return cv2.resize(img, (size, size))
    else:
        return img

def image_generator(size, batchsize, ks, lw=6):
    while True:
        for k in np.random.permutation(ks):
            filename = os.path.join(DP_DIR, 'train_k{}.csv.gz'.format(k))
            for df in pd.read_csv(filename, chunksize=batchsize):
                df['drawing'] = df['drawing'].apply(ast.literal_eval)
                x = np.zeros((len(df), size, size,3))
                for i, raw_strokes in enumerate(df.drawing.values):
                    #print(df.drawing.values)
                    #
                    x[i, :, :, :] = draw_cv2(raw_strokes, size=size, lw=lw)
                x = x / 255.
                x = x.reshape((len(df), size, size, 3)).astype(np.float32)
                y = keras.utils.to_categorical(df.y, num_classes=NCATS)
                yield x, y

def df_to_image_array(df, size, lw=6):
    df['drawing'] = df['drawing'].apply(ast.literal_eval)
    x = np.zeros((len(df), size, size,3))
    for i, raw_strokes in enumerate(df.drawing.values):
        #
        x[i, :, : ,:] = draw_cv2(raw_strokes, size=size, lw=lw)
    x = x / 255.
    x = x.reshape((len(df), size, size, 3)).astype(np.float32)
    return x

In [13]:
valid_df = pd.read_csv(os.path.join(DP_DIR, 'train_k{}.csv.gz'.format(NCSVS - 1)), nrows=30000)
x_valid = df_to_image_array(valid_df, size)
y_valid = keras.utils.to_categorical(valid_df.y, num_classes=NCATS)
print(x_valid.shape, y_valid.shape)
print('Validation array memory {:.2f} GB'.format(x_valid.nbytes / 1024.**3 ))

(30000, 82, 82, 3) (30000, 340)
Validation array memory 2.25 GB


In [14]:
train_datagen = image_generator(size=size, batchsize=batchsize, ks=range(NCSVS - 1))

Note that the model was trained by 2000 steps a epochs, 10 epochs a time. The model was trained for 10 additional epochs before the below training started. 

In [16]:
callbacks = [
    #EarlyStopping(monitor='val_top_3_accuracy', patience=15, min_delta=0.001, mode='max'),
    ReduceLROnPlateau(monitor='val_top_3_accuracy', factor=0.5, patience=5, min_delta=0.005, mode='max', cooldown=3),
    ModelCheckpoint("./RGBsim-1.model",monitor='val_top_3_accuracy', mode = 'max', save_best_only=True, verbose=1)
]
hist = model.fit_generator(
    train_datagen, steps_per_epoch=STEPS, epochs=10, verbose=1,
    validation_data=(x_valid, y_valid),
    callbacks = callbacks
)

model.save('Autosaved_CNN')

Epoch 1/10
Epoch 00001: val_top_3_accuracy improved from -inf to 0.79060, saving model to ./black-white-7.model
Epoch 2/10
Epoch 00002: val_top_3_accuracy improved from 0.79060 to 0.81837, saving model to ./black-white-7.model
Epoch 3/10
Epoch 00003: val_top_3_accuracy improved from 0.81837 to 0.82953, saving model to ./black-white-7.model
Epoch 4/10
Epoch 00004: val_top_3_accuracy improved from 0.82953 to 0.83480, saving model to ./black-white-7.model
Epoch 5/10
Epoch 00005: val_top_3_accuracy improved from 0.83480 to 0.84077, saving model to ./black-white-7.model
Epoch 6/10
Epoch 00006: val_top_3_accuracy improved from 0.84077 to 0.84730, saving model to ./black-white-7.model
Epoch 7/10
Epoch 00007: val_top_3_accuracy did not improve from 0.84730
Epoch 8/10
Epoch 00008: val_top_3_accuracy improved from 0.84730 to 0.84870, saving model to ./black-white-7.model
Epoch 9/10
Epoch 00009: val_top_3_accuracy improved from 0.84870 to 0.85303, saving model to ./black-white-7.model
Epoch 10/10

## Validate the accuracy and create submission

In [17]:
#Accuracy? 
valid_predictions1 = model.predict(x_valid, batch_size=128, verbose=1)
map3 = mapk(valid_df[['y']].values, preds2catids(valid_predictions1).values)
top3 = top3_acc(valid_df[['y']].values.flatten(), preds2catids(valid_predictions1).values)
print('Map3: {:.3f}'.format(map3))
print('Top3: {:.3f}'.format(top3))
print()

x_valid2 = np.array( [ np.fliplr(x_valid[i]) for i in range(x_valid.shape[0])] )
valid_predictions2 = model.predict(x_valid2, batch_size=128, verbose=1)
map3 = mapk(valid_df[['y']].values, preds2catids(valid_predictions2).values)
top3 = top3_acc(valid_df[['y']].values.flatten(), preds2catids(valid_predictions2).values)
print('Map3: {:.3f}'.format(map3))
print('Top3: {:.3f}'.format(top3))
print()

map3 = mapk(valid_df[['y']].values, preds2catids(0.5*valid_predictions1+0.5*valid_predictions2).values)
top3 = top3_acc(valid_df[['y']].values.flatten(), preds2catids(0.5*valid_predictions1+0.5*valid_predictions2).values)
print('Map3: {:.3f}'.format(map3))
print('Top3: {:.3f}'.format(top3))
print()

Map3: 0.761
Top3: 0.852

Map3: 0.746
Top3: 0.843

Map3: 0.772
Top3: 0.861



In [18]:
#Remove the file if you get OOM exception
try:
    del test
except:
    pass
try:
    del x_test
except:
    pass
try:
    del x_test2
except:
    pass
try:
    del test_predictions1
except:
    pass
try:
    del test_predictions2
except:
    pass
try:
    del test_predictions
except:
    pass
try:
    del top3
except:
    pass
try:
    del submission
except:
    pass

try:
    del top3cats
except:
    pass
try:
    del x_valid2
    del valid_predictions1
    del valid_predictions2
except:
    pass

try:
    del x_valid
    del y_valid
    del valid_df
    del x
    pass
except:
    pass

In [19]:
#Create Submission
#batched prediction due to memory constrain
#This is super slow but is memory-friendly
INPUT_DIR = '/home/k.vincent/'
test = pd.read_csv(os.path.join(INPUT_DIR, 'test_simplified.csv'))

test_predictions1 = np.zeros(shape=(0,340))
test_predictions2 = np.zeros(shape=(0,340))

for index, ob in test.groupby(np.arange(len(test))//8000):
    x_test = df_to_image_array(ob, size)
    x_test2 = np.array( [ np.fliplr(x_test[i]) for i in range(x_test.shape[0])] )
    
    temp_pred = model.predict(x_test, batch_size=128, verbose=1)
    test_predictions1 = np.concatenate((test_predictions1, temp_pred))
    
    temp_pred = model.predict(x_test2, batch_size=128, verbose=1)
    test_predictions2 = np.concatenate((test_predictions2, temp_pred))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy




In [20]:
#Prediction ensembling and write file
test_predictions = 0.5*test_predictions1 + 0.5*test_predictions2
INPUT_DIR = '/home/k.vincent/apm_file/'
top3 = preds2catids(test_predictions)
cats = list_all_categories()
id2cat = {k: cat.replace(' ', '_') for k, cat in enumerate(cats)}
top3cats = top3.replace(id2cat)
top3cats.head()
top3cats.shape

test['word'] = top3cats['a'] + ' ' + top3cats['b'] + ' ' + top3cats['c']
submission = test[['key_id', 'word']]
submission.to_csv('_submit'.format(int(map3 * 10**4)), index=False)
submission.head()
submission.shape

Unnamed: 0,a,b,c
0,radio,stereo,snorkel
1,hockey_puck,bottlecap,sandwich
2,castle,The_Great_Wall_of_China,camel
3,mountain,triangle,tent
4,campfire,fireplace,leaf


(112199, 3)

Unnamed: 0,key_id,word
0,9000003627287624,radio stereo snorkel
1,9000010688666847,hockey_puck bottlecap sandwich
2,9000023642890129,castle The_Great_Wall_of_China camel
3,9000038588854897,mountain triangle tent
4,9000052667981386,campfire fireplace leaf


(112199, 2)

In [21]:
#model.save('fx_CNN')#model = load_model('rdRGB_128to128_50e_CNN', custom_objects={'top_3_accuracy': top_3_accuracy})