In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import sys

In [None]:
import keras 
from keras.models import Sequential 
from keras.models import Model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Conv2D, MaxPooling2D, Input, Merge, Activation, Dropout
from sklearn.model_selection import train_test_split
from random import shuffle
%matplotlib inline  

In [None]:
import matplotlib.pyplot as plt

### Load data

In [None]:
# Read each training file in one go and split it into one entry per line.
# Splitting on "\n" (rather than iterating over the file) keeps a trailing
# empty string when the file ends with a newline.
with open("data/xtrain_obfuscated.txt", "r") as f:
    data_train_x = f.read().split("\n")

with open("data/ytrain.txt", "r") as f:
    data_train_y = f.read().split("\n")

In [None]:
# Splitting on "\n" leaves one empty string at the end when the file ends with
# a newline. Drop it only if it is really there: an unconditional [:-1] would
# discard a genuine sample when the file has no trailing newline, and would
# trim one more row every time this cell is re-run.
if data_train_x and data_train_x[-1] == "":
    data_train_x = data_train_x[:-1]
if data_train_y and data_train_y[-1] == "":
    data_train_y = data_train_y[:-1]

### Get max length

In [None]:
# Number of training texts loaded (one entry per line of the input file).
len(data_train_x)

In [None]:
# Character count of every training text.
lengths = list(map(len, data_train_x))

In [None]:
# Length of the longest training text; shorter texts are padded up to this width.
input_width = max(lengths)

In [None]:
# Display the computed maximum text length.
input_width

### Pad short sequences

In [None]:
# Right-pad every text with "w" so that all samples share the same length
# (input_width). Texts that are already that long are left untouched.
# NOTE(review): "w" is used as the filler symbol; if "w" can also occur in the
# real texts, padding is indistinguishable from content — confirm.
data_train_x_norm = [text.ljust(input_width, "w") for text in data_train_x]

### Split data

In [None]:
# Hold out 10% of the padded samples for evaluation; the fixed random_state
# makes the split repeatable between runs.
x_train, x_eval, y_train, y_eval = train_test_split(data_train_x_norm, data_train_y, 
                                                    test_size=0.1, random_state=2017)

In [None]:
# Sanity check: sizes of the train / eval texts followed by the train / eval labels.
for split_part in (x_train, x_eval, y_train, y_eval):
    print (len(split_part))

### Get reference characters

In [None]:
# Collect every distinct character that occurs in the padded training texts,
# in sorted order, as a numpy array. The position of a character in this
# array is its index in the one-hot encoding.
characters = np.array(sorted({char for text in data_train_x_norm for char in text}))

In [None]:
# Show the sorted alphabet used for one-hot encoding.
print (characters)

In [None]:
# Alphabet size, i.e. the length of each one-hot character vector.
characters.shape

In [None]:
# Number of target classes; used as the length of the one-hot label vectors.
number_of_classes=12

### Create Model

In [None]:
def createModel(input_width=452, input_height=26, number_of_classes=12, channel=1):
    """Build and compile the character-level 1-D convolutional classifier.

    Three parallel ``Conv1D`` branches (kernel sizes 3, 4 and 5) read the same
    one-hot encoded text. They use 'same' padding so their outputs have equal
    length and can be concatenated. The concatenated features then go through
    one more convolution, max pooling, dropout and a softmax output layer.

    Parameters
    ----------
    input_width : int
        Number of characters (time steps) in every padded input text.
    input_height : int
        Length of the one-hot vector that represents a single character.
    number_of_classes : int
        Number of units in the softmax output layer.
    channel : int
        Not used by the network; kept so existing calls keep working.

    Returns
    -------
    A compiled ``Sequential`` model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    nb_filters = 32
    kernel_sizes = (3, 4, 5)

    inp = Input(shape=(input_width, input_height))

    # One convolution branch per kernel size, all reading the same input.
    # 'same' padding keeps the output length of every branch identical, which
    # is required for concatenating them.
    # NOTE(review): `border_mode`, `Merge(mode='concat')` and
    # `Model(input=..., output=...)` look like legacy Keras spellings (newer
    # releases use `padding`, `Concatenate` and `inputs=`/`outputs=`) —
    # confirm against the installed Keras version before upgrading.
    convs = [
        Conv1D(nb_filters, size, border_mode='same', activation='relu')(inp)
        for size in kernel_sizes
    ]

    # Merging needs at least two tensors; with a single branch use it directly.
    if len(convs) > 1:
        out = Merge(mode='concat')(convs)
    else:
        out = convs[0]

    conv_model = Model(input=inp, output=out)

    # Use the parallel-convolution graph as the first "layer" of a Sequential model.
    model = Sequential()
    model.add(conv_model)
    model.add(Conv1D(nb_filters, kernel_sizes[1]))
    model.add(Activation('relu'))
    model.add(MaxPooling1D(pool_size=5))
    model.add(Dropout(0.25))
    model.add(Flatten())
    # Size the output layer from the argument instead of a hard-coded 12.
    model.add(Dense(number_of_classes))
    model.add(Activation('softmax'))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Original author's note: "model5 has the highest performance" (the experiment
# it refers to is not shown in this notebook).
# Build the network from the dimensions measured on the data rather than from
# createModel's defaults, so the model always matches the encoded inputs.
model = createModel(input_width=input_width,
                    input_height=characters.shape[0],
                    number_of_classes=number_of_classes)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

### Train

In [None]:
def encodeInputText(text, characters):
    """One-hot encode a text, one row per character.

    Parameters
    ----------
    text : str
        The text to encode.
    characters : numpy.ndarray
        1-D array holding the reference alphabet; the position of a character
        in this array is the column that is set to 1 for it.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(text), len(characters))``. A character that
        does not appear in ``characters`` yields an all-zero row. An empty text
        yields an array of shape ``(0, len(characters))``.
    """
    # Allocate the whole matrix once instead of building one vector per character.
    encoded_text = np.zeros((len(text), characters.shape[0]))
    for position, char in enumerate(text):
        # Boolean mask selects the column(s) of the alphabet equal to this character.
        encoded_text[position, characters == char] = 1
    return encoded_text

In [None]:
def encodeInputLabel(raw_label, number_of_classes):
    """Turn a class id into a one-hot float vector of length ``number_of_classes``.

    ``raw_label`` may be anything ``int()`` accepts (for example the string
    read from the label file); the element at that index is set to 1.0.
    """
    one_hot = np.zeros(number_of_classes)
    class_index = int(raw_label)
    one_hot[class_index] = 1.0
    return one_hot

In [None]:
def generateBatch(x_train, y_train, characters, number_of_classes, batch_size=64):
    """Yield successive ``(x_batch, y_batch)`` pairs of encoded training data.

    Parameters
    ----------
    x_train : sequence of str
        Input texts; each is encoded with ``encodeInputText``.
    y_train : sequence
        Raw labels aligned with ``x_train``; each is encoded with
        ``encodeInputLabel``.
    characters : numpy.ndarray
        Reference alphabet passed through to ``encodeInputText``.
    number_of_classes : int
        One-hot label length passed through to ``encodeInputLabel``.
    batch_size : int
        Maximum number of samples per batch; the last batch may be smaller.

    Yields
    ------
    tuple of numpy.ndarray
        The encoded texts and the encoded labels of one batch.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1. Because this is a generator, the
        error surfaces when the first batch is requested.
    """
    # A non-positive batch size would otherwise never advance through the data.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))

    for start in range(0, len(x_train), batch_size):
        stop = start + batch_size

        # encode the raw slice to numpy tensors
        x_batch = np.array([encodeInputText(text, characters) for text in x_train[start:stop]])
        y_batch = np.array([encodeInputLabel(label, number_of_classes) for label in y_train[start:stop]])

        yield x_batch, y_batch
        

In [None]:
# Number of passes over the training set.
# NOTE(review): `epochs` was used by this loop but never assigned anywhere in
# the notebook, so the cell failed with NameError on a fresh kernel. 10 is a
# placeholder value — adjust as needed.
epochs = 10

# The evaluation split does not change between epochs, so encode it once
# up front instead of re-encoding it after every epoch.
x_eval_enc = np.array([encodeInputText(text, characters) for text in x_eval])
y_eval_enc = np.array([encodeInputLabel(label, number_of_classes) for label in y_eval])

for epoch in range(epochs):
    print ("+++++++++++++++++++++++++++++++++++++")
    print ("Epoch: {}".format(epoch))

    # Shuffle texts and labels with the same permutation at the start of each
    # epoch. The shuffled copies get their own names so x_train / y_train are
    # not overwritten and the cell can be re-run safely.
    indices = list(range(len(x_train)))
    shuffle(indices)
    x_train_shuffled = [x_train[sample_index] for sample_index in indices]
    y_train_shuffled = [y_train[sample_index] for sample_index in indices]

    # train for single epoch
    print ("Training ...")
    batches = generateBatch(x_train_shuffled, y_train_shuffled, characters, number_of_classes)
    for batch_count, (x_batch, y_batch) in enumerate(batches, start=1):
        model.train_on_batch(x=x_batch, y=y_batch)
        # Report after the update so the counter reflects batches actually finished.
        sys.stdout.write("\rFinished {} batches".format(batch_count))
        sys.stdout.flush()

    # evaluate on eval
    print ("Evaluating on eval set")
    results = model.evaluate(x_eval_enc, y_eval_enc)
    print ("")
    print ("Eval set loss: {}".format(results[0]))
    print ("Eval set accuracy: {}".format(results[1]))
    

### Predict on Test set

In [None]:
# Load the test texts: read the whole file and split it into one entry per line.
# NOTE(review): the training files are read from "data/", but this path has no
# directory prefix — confirm the test file really lives in the working directory.
with open("xtest_obfuscated.txt", "r") as f:
    data_test = f.read().split("\n")


In [None]:
# Remove the empty entry left by a trailing newline, but only if it is really
# there: an unconditional [:-1] would drop a genuine test sample when the file
# has no trailing newline, and one more sample on every re-run of this cell.
if data_test and data_test[-1] == "":
    data_test = data_test[:-1]

In [None]:
# Bring every test text to exactly input_width characters: short texts are
# right-padded with "w" (the same filler used for the training data) and texts
# longer than the longest training text are truncated, because the model only
# accepts inputs of that fixed width.
data_test_norm = [text[:input_width].ljust(input_width, "w") for text in data_test]

In [None]:
# One-hot encode every padded test text with the same reference characters used
# for training, and stack the results into a single numpy array.
x_test = np.array([encodeInputText(text, characters) for text in data_test_norm])

In [None]:
# Run the trained model on the encoded test set; the model ends in a softmax
# layer, so each row of `predictions` holds the per-class scores for one text.
predictions = model.predict(x_test)

In [None]:
# Take the index of the highest score in each row, i.e. the predicted class id
# for each test text.
novel_ids = predictions.argmax(axis=1)

In [None]:
# Peek at the first ten predicted class ids.
novel_ids[0:10]

In [None]:
# Output path for the predicted labels.
file_name = 'y_test.txt' 
def save_labels(file_name, novel_ids):
    """Write the predicted labels to a text file, one label per line.

    Parameters
    ----------
    file_name : str
        Path of the file to create (an existing file is overwritten).
    novel_ids : iterable
        Labels to write; each one is formatted with ``%s`` and followed by a
        newline.
    """
    # The context manager closes (and flushes) the file even if writing fails.
    with open(file_name, 'w') as labels:
        for item in novel_ids:
            labels.write("%s\n" % item)
# Write the predicted class ids for the test set to disk.
save_labels(file_name, novel_ids) 
