In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.utils
import random

# Example topic labels. The position of each label in this list is later used
# as its integer class id, so the order matters.
topics = [
    "change_artwork",
    "change_product",
    "change_size",
    "change_file",
    "blank",
]

Choose W2V pretrained model.

*Loading may take a while. Beware that loading a large pretrained model (such as the GoogleNews vectors) requires 16-20 GB of RAM.*

In [None]:
from gensim.models import KeyedVectors

# Alternative: 50-dimensional GloVe vectors stored as text.
# word_vectors = KeyedVectors.load_word2vec_format("../w2v/glove.6B.50d.txt", binary=False)
# EMBEDDING_DIM = 50

# Dimensionality of the vectors loaded below; keep it in sync with the file.
EMBEDDING_DIM = 300
word_vectors = KeyedVectors.load_word2vec_format(
    "../w2v/GoogleNews-vectors-negative300.bin",
    binary=True,
)

Read data and split it into sets

In [None]:
def set_blanks(x):
    """Return `x` unchanged if it is a string, otherwise the "blank" topic."""
    return x if isinstance(x, str) else "blank"


data = pd.read_csv('data/my_data.csv')

# Rows whose topic is not a string (e.g. NaN from an empty CSV field) are put
# into the explicit "blank" class, so afterwards every topic is a str and no
# further null filtering is needed.
data['topic'] = data['topic'].apply(set_blanks)
data['text'] = data['text'].astype(str)

# Seed the shuffle as well: train_test_split below is seeded, but it splits by
# position, so an unseeded shuffle here would still give a different split on
# every run.
data = sklearn.utils.shuffle(data, random_state=42)

# 80% train; the remaining 20% is halved into validation and test.
data_train, data_valid_test = train_test_split(data, test_size=0.2, random_state=42)
data_valid, data_test = train_test_split(data_valid_test, test_size=0.5, random_state=42)
del data_valid_test

## Train data preprocessing

Find topic that occurs most frequently

In [None]:
# Group the training texts by topic; every known topic starts with an empty list.
topic_dict = {name: [] for name in topics}
for label, text in zip(data_train['topic'], data_train['text']):
    # A label that is not listed in `topics` raises KeyError here.
    topic_dict[label].append(text)

# Size of the largest topic (0 when there are no examples at all).
max_count = max((len(examples) for examples in topic_dict.values()), default=0)
max_count

Generate extra sentences for the other topics so that every topic has the same number of examples.

When generating, replace about half of the words with a random synonym (one of the three nearest word-vector neighbours).

**THIS IS SLOW**

In [None]:
def _augment_text(text):
    """Return a lower-cased copy of `text` with roughly half of its words
    replaced by one of their three nearest neighbours in `word_vectors`.

    Words that are missing from the embedding vocabulary are kept as they are
    (lower-cased).
    """
    new_words = []
    for word in text.split(" "):
        word = word.lower()
        try:
            if random.uniform(0, 1) > 0.5:
                word = random.choice(word_vectors.most_similar(positive=[word])[0:3])[0]
        except KeyError:
            # Out-of-vocabulary word: no synonyms available, keep the original.
            pass
        new_words.append(word)
    return ' '.join(new_words)


# Collect all generated rows first and append them in one go; growing the
# DataFrame row by row with .loc re-allocates it on every insertion.
generated_rows = []
for topic in topics:
    print(topic)
    examples = topic_dict[topic]
    if not examples:
        # Nothing to sample from; random.choice would raise IndexError.
        print("no training examples for topic %r, skipping augmentation" % topic)
        continue
    for _ in range(max_count - len(examples)):
        # Fill columns by name so the result does not depend on column order;
        # every column other than topic/text gets the placeholder 0.
        row = dict.fromkeys(data_train.columns, 0)
        row['topic'] = topic
        row['text'] = _augment_text(random.choice(examples))
        generated_rows.append(row)

if generated_rows:
    # Continue numbering after the largest existing index label.
    first_new_index = data_train.index.max() + 1
    generated = pd.DataFrame(
        generated_rows,
        columns=data_train.columns,
        index=range(first_new_index, first_new_index + len(generated_rows)),
    )
    data_train = pd.concat([data_train, generated])

data_train = sklearn.utils.shuffle(data_train)
print(data_train.head(10))


Recombine the augmented train set with the validation and test sets, then check the number of examples per topic

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the three
# frames in the same order.
data = pd.concat([data_train, data_valid, data_test])

# Number of examples per topic over all splits (topics with no rows stay at 0).
topic_count = {topic: 0 for topic in topics}
for label in data['topic']:
    # A label that is not listed in `topics` raises KeyError here.
    topic_count[label] += 1

print(topic_count)

print(data.shape, data_train.shape, data_valid.shape, data_test.shape)

Either save or load generated data

In [None]:
import pickle

# SAVE (uncomment to persist the generated frames)
# with open("data_2.pickle","wb") as f:
#     pickle.dump(data, f)
# with open("data_train_2.pickle","wb") as f:
#     pickle.dump(data_train, f)
# with open("data_valid_2.pickle","wb") as f:
#     pickle.dump(data_valid, f)
# with open("data_test_2.pickle","wb") as f:
#     pickle.dump(data_test, f)


def _read_pickle(path):
    """Unpickle and return the object stored at `path`.

    Only use this on files you created yourself: unpickling untrusted data
    can execute arbitrary code.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# LOAD (replaces the frames built above with the previously saved ones)
data = _read_pickle("data_2.pickle")
data_train = _read_pickle("data_train_2.pickle")
data_valid = _read_pickle("data_valid_2.pickle")
data_test = _read_pickle("data_test_2.pickle")

## Process data to feed into the network

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Map every topic name to its position in `topics`; that position is the class id.
dic = {name: position for position, name in enumerate(topics)}


def _topic_to_label(name):
    """Return the integer class id of `name` (KeyError for unknown topics)."""
    return dic[name]


train_labels = data_train.topic.apply(_topic_to_label)
valid_labels = data_valid.topic.apply(_topic_to_label)
test_labels = data_test.topic.apply(_topic_to_label)
print(train_labels.shape, valid_labels.shape, test_labels.shape)

In [None]:
# Upper bound on the vocabulary size used by the tokenizer.
NUM_WORDS = 20000

# The vocabulary is fitted on the texts of all splits combined.
texts = data.text
print(texts.shape)

tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters='"#?$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n\'',
    lower=True,
)
tokenizer.fit_on_texts(texts)

# Turn each split into lists of integer word ids.
sequences_train, sequences_valid, sequences_test = (
    tokenizer.texts_to_sequences(frame.text)
    for frame in (data_train, data_valid, data_test)
)
print(sequences_valid[0:2])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Pad the training sequences to their longest length and force the other
# splits to that same length so all inputs share one shape.
X_train = pad_sequences(sequences_train)
X_val = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
X_test = pad_sequences(sequences_test, maxlen=X_train.shape[1])

# Pass num_classes explicitly: without it to_categorical sizes the one-hot
# matrix from the largest label present, so a split that happens to lack the
# last topic would get fewer columns than the model has outputs.
num_classes = len(topics)
y_train = to_categorical(np.asarray(train_labels), num_classes=num_classes)
y_val = to_categorical(np.asarray(valid_labels), num_classes=num_classes)
y_test = to_categorical(np.asarray(test_labels), num_classes=num_classes)

print('Shape of train X and label tensor:', X_train.shape,y_train.shape)
print('Shape of validation X and label tensor:', X_val.shape,y_val.shape)
print('Shape of test X and label tensor:', X_test.shape,y_test.shape)
y_train[0:10]

## Model

Some of the layers may be deleted; it is not clear whether they add any benefit. In the original paper there was no dropout after the CNN layer, ReLU was used for activation instead of LeakyReLU, and there was no batch norm.

In [None]:
from keras.layers import Dense, Input, GlobalMaxPooling1D, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding, LeakyReLU
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping, TensorBoard, ReduceLROnPlateau
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
from keras.callbacks import ModelCheckpoint, Callback
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras import backend as K
from keras.layers import Embedding

# Length of the padded input sequences fed to the network.
sequence_length = X_train.shape[1]

# One embedding row per word id, capped at NUM_WORDS (+1 because id 0 is unused by word_index).
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

for token, row in word_index.items():
    if row < NUM_WORDS:
        try:
            # Use the pretrained vector when the word is known.
            embedding_matrix[row] = word_vectors[token]
        except KeyError:
            # Unknown word: start from a random normal vector instead.
            embedding_matrix[row] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

def create_model(lr = 0.01, num_filters = 30, drop1 = 0.1, drop = 0.5, kernel_regularizer = 0.01, alpha = 0.1, filter_sizes=(3, 4, 5, 5)):
    """Build and compile the multi-branch CNN text classifier.

    The model embeds the padded word-id sequence, runs one convolution branch
    per entry of `filter_sizes`, max-pools each branch, concatenates the
    results and classifies them with a softmax layer over `topics`.

    Relies on the module-level `sequence_length`, `vocabulary_size`,
    `EMBEDDING_DIM`, `embedding_matrix` and `topics`.

    Args:
        lr: learning rate passed to the Adam optimizer.
        num_filters: number of convolution filters in every branch.
        drop1: dropout rate applied inside each convolution branch.
        drop: dropout rate applied to the concatenated features.
        kernel_regularizer: L2 factor for the convolution kernels (the final
            dense layer always uses a fixed factor of 0.01).
        alpha: `alpha` argument of the LeakyReLU activations.
        filter_sizes: sequence of convolution heights (in words), one branch
            per entry; duplicates create duplicate branches.

    Returns:
        A compiled Keras `Model` with categorical cross-entropy loss and an
        accuracy metric.
    """
    # Drop the previous graph so repeated calls (e.g. in a grid search) do not
    # accumulate models in the backend session.
    K.clear_session()
    inputs = Input(shape=(sequence_length,))
    embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix.copy()],
                            trainable=True)
    embedding = embedding_layer(inputs)
    # Add a trailing channel axis so Conv2D can be applied.
    reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

    all_maxpool = []
    for filter_size in filter_sizes:
        # Each filter spans `filter_size` words and the full embedding width.
        conv = Conv2D(num_filters, (filter_size, EMBEDDING_DIM),kernel_regularizer=regularizers.l2(kernel_regularizer))(reshape)
        activation = LeakyReLU(alpha=alpha)(conv)
        normal = BatchNormalization()(activation)
        dropout = Dropout(drop1)(normal)
        # Pool window covers sequence_length - filter_size + 1 positions.
        maxpool = MaxPooling2D((sequence_length - filter_size + 1, 1), strides=(1,1))(dropout)
        all_maxpool.append(maxpool)
    merged_tensor = concatenate(all_maxpool, axis=1)
    flatten = Flatten()(merged_tensor)
    dropout = Dropout(drop)(flatten)
    output = Dense(units=len(topics), activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

    model = Model(inputs, output)
    adam = Adam(lr=lr)
    model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])
    return model


Different callbacks, most of the code is commented out.

In [None]:
# filepath="checkpoints/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
filepath = "checkpoints/weights.best.hdf5"
# Keeps only the weights of the epoch with the best validation accuracy.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# tensorboard = TensorBoard(log_dir='./Graph', histogram_freq=5, write_grads=True, batch_size=50, write_images=True, embeddings_freq=5)
# tensorboard = TensorBoard(log_dir='./Graph', histogram_freq=5, write_grads=True, batch_size=50)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.00001)
# No trailing comma here: a trailing comma would turn this into a 1-tuple and
# `callbacks` would then contain a tuple instead of a Callback instance.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
# callbacks = [early_stopping, checkpoint, reduce_lr]
callbacks = [early_stopping]

Perform grid search for suitable hyperparams.

Each new param: double the training time, be careful.

Probably it's better to iterate over one param values and then go to the next one, it's faster but requires human supervision.

In [None]:
import itertools

param_grid = dict(
    epochs=[100],
    batch_size=[50, 500], # 554
    lr=[0.001, 0.0001],
    num_filters=[50, 100],
    drop1=[0, 0.5],
    drop=[0.5], 
    kernel_regularizer=[0.01, 3], 
    alpha=[0.1], 
    filter_sizes=[[2, 4, 5]]
)

# Order in which the grid is walked: the first name changes slowest, the last
# one fastest (the same order as nesting one loop per name).
search_order = ['epochs', 'batch_size', 'lr', 'num_filters', 'drop1', 'drop',
                'kernel_regularizer', 'alpha', 'filter_sizes']
# Hyper-parameters forwarded to create_model.
model_param_names = ['lr', 'num_filters', 'drop1', 'drop',
                     'kernel_regularizer', 'alpha', 'filter_sizes']
# Hyper-parameters recorded with each result (epochs is left out).
reported_param_names = ['lr', 'batch_size', 'num_filters', 'drop1', 'drop',
                        'kernel_regularizer', 'alpha', 'filter_sizes']

results = []
for values in itertools.product(*(param_grid[name] for name in search_order)):
    combination = dict(zip(search_order, values))

    model = create_model(**{name: combination[name] for name in model_param_names})
    current_params = {name: combination[name] for name in reported_param_names}
    print(current_params)

    model.fit(X_train, y_train,
              batch_size=combination['batch_size'],
              epochs=combination['epochs'],
              verbose=1,
              validation_data=(X_val, y_val),
              callbacks=callbacks)  # starts training
    # evaluate() returns [loss, accuracy] for each split.
    results.append({
        'params': current_params,
        'train': model.evaluate(X_train, y_train),
        'valid': model.evaluate(X_val, y_val),
    })

Sort hyper params by the end result

In [None]:
def _validation_accuracy(entry):
    """Return the second value of the entry's 'valid' evaluation (the accuracy metric)."""
    return entry['valid'][1]


# Best validation accuracy first; sorted in place.
results.sort(key=_validation_accuracy, reverse=True)
results

If you want to train without grid search and use it

In [None]:
# Hand-picked hyper-parameters for a single training run.
manual_params = dict(
    lr=0.0001,
    num_filters=20,
    drop1=0.6,
    drop=0.6,
    kernel_regularizer=3,
    alpha=0.1,
    filter_sizes=[2, 4, 5],
)
model = create_model(**manual_params)

# Note: no callbacks are passed here, so training runs for all 100 epochs.
model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=100,
    verbose=1,
    validation_data=(X_val, y_val),
)

If you want to load model from saved weights

In [None]:
# The architecture must match the one the weights were saved from.
saved_model_params = dict(
    lr=0.0001,
    num_filters=20,
    drop1=0.6,
    drop=0.6,
    kernel_regularizer=3,
    alpha=0.1,
    filter_sizes=[2, 4, 5],
)
model = create_model(**saved_model_params)
model.load_weights("checkpoints/weights.best.hdf5")

# Re-compile with a fresh optimizer (learning rate 1e-3).
adam = Adam(lr=1e-3)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

See how trained model behaves on test data

In [None]:
# Rebuild the padded test inputs so this cell does not depend on whatever
# X_test currently holds.
sequences_test = tokenizer.texts_to_sequences(data_test.text)
X_test = pad_sequences(sequences_test, maxlen=X_train.shape[1])

# scores is [loss, accuracy]; report the accuracy as a percentage.
scores = model.evaluate(X_test, y_test)
accuracy_percent = scores[1] * 100
print(accuracy_percent)

See model in action

In [None]:
def pred_to_text(predictions, dic):
    """Label every prediction vector with topic names.

    Args:
        predictions: iterable of indexable score vectors, one per sample.
        dic: mapping from topic name to its position in a score vector.

    Returns:
        A list with one dict per sample, mapping each topic name to the score
        found at that topic's position.
    """
    return [
        {name: scores[position] for name, position in dic.items()}
        for scores in predictions
    ]

In [None]:
my_text = "Please change the artwork"
# Use dedicated names for the demo input: reusing sequences_test / X_test here
# would silently replace the real test set with this single sentence.
sequences_demo = tokenizer.texts_to_sequences([my_text])
X_demo = pad_sequences(sequences_demo, maxlen=X_train.shape[1])
y_pred = model.predict(X_demo)
pred_to_text(y_pred, dic)