In [0]:
!rm -r ./cleaned_kickstarted_dataset train_dataset_with_texts.csv

# Kickstarter project
## Model optimizing and training

In [0]:
# Directory created by cloning the dataset repository; the zipped CSV and the
# saved Talos scan results are read from it.
GIT_DIR = 'cleaned_kickstarted_dataset'

# Fractions of the (shuffled) rows held out for the test and validation splits;
# the remaining rows are used for training.
TEST_SPLIT = 0.1
VALIDATION_SPLIT = 0.2

# variables for tokenizing and word embedding
# MAX_SEQUENCE_LENGTH: length every token sequence is padded to
# MAX_NUM_WORDS:       vocabulary cap passed to the Keras Tokenizer
# EMBEDDING_DIM:       width of the embedding vectors (the glove.6B.100d file is used)
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100

In [0]:
import numpy as np
np.random.seed(123)

import pandas as pd
import os

### Loading data from the GIT repo, extracting

In [0]:
# Fetch the repository holding the cleaned dataset and the saved Talos results (creates ./cleaned_kickstarted_dataset).
!git clone https://github.com/Strongkong/cleaned_kickstarted_dataset

In [0]:
zip = os.path.join(GIT_DIR, 'train_dataset_with_texts.csv.zip')

!unzip $zip

### Loading GloVe dataset

In [6]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2018-12-09 22:14:24--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2018-12-09 22:14:24--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2018-12-09 22:16:42 (5.99 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [0]:
# Extract only the 100-dimensional vectors: -p streams that single archive member to stdout.
!unzip -p 'glove.6B.zip' 'glove.6B.100d.txt' > glove.6B.100d.txt

### Import packages

In [8]:
# We use Talos for hyperparameter optimization because of its great syntax and rich built in visualization tools.
!pip install talos

Collecting talos
  Downloading https://files.pythonhosted.org/packages/16/7e/eae6dc099c48cd663f61d569208799d6628ac0843be09d28f7f84d65a8d4/talos-0.4.3.tar.gz
Collecting astetik (from talos)
  Downloading https://files.pythonhosted.org/packages/fb/4a/17c487680c9f3a507da45013e2c1256ee4157f4d67b92e7995078eec914b/astetik-1.9.5.tar.gz
Collecting chances (from talos)
[?25l  Downloading https://files.pythonhosted.org/packages/f0/4e/85014772bbf026903080beecb36681dbceb28b14f96491f42673b95ddcf6/chances-0.1.1-py3-none-any.whl (52kB)
[K    100% |████████████████████████████████| 61kB 9.8MB/s 
[?25hCollecting kerasplotlib (from talos)
  Downloading https://files.pythonhosted.org/packages/e8/2e/b8628bfef6a817da9be863f650cf67187676b10d27d94b23f248da35d2b4/kerasplotlib-0.1.4.tar.gz
Collecting wrangle (from talos)
  Downloading https://files.pythonhosted.org/packages/a3/d4/4137b26b28500399d7f921e296a2346cfd8a8a693e6a3928a305b6568e7a/wrangle-0.3.1.tar.gz
Collecting geonamescache (from astetik->talos)


In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Embedding, BatchNormalization
from keras.layers import concatenate
from keras.initializers import Constant
from keras import regularizers
from keras.activations import relu, tanh, softmax
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD, Adam

from tensorflow import set_random_seed
set_random_seed(123)

from sklearn.preprocessing import StandardScaler

import talos
from talos.model import lr_normalizer, hidden_layers

Using TensorFlow backend.


In [0]:
# To check whether we use the GPU for training...

# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

### Load the dataset

In [0]:
# Import the final version of the dataset, which is ready to train the model on.
# It turns out that the hashing-vectorized name and blurb data does not improve
# our model's accuracy, so we use the raw texts in another model later on.
# Note: the file is tab-separated despite its .csv extension.
df = pd.read_csv(filepath_or_buffer='train_dataset_with_texts.csv', sep='\t')

### Shuffling, identifying name, blurb and state columns

In [0]:
# Shuffle first, so that the positional train/validation/test split further down is random.
# No random_state is passed: the resulting order relies on NumPy's global seed
# (np.random.seed(123) above), and re-running this cell alone shuffles the
# already shuffled frame again.
df = df.sample(frac=1).reset_index(drop=True)

### Define inputs and outputs

In [0]:
# Columns describing the final state of a project (the prediction target).
state_columns = [column for column in df.columns if column.startswith('state_')]

# We would like to predict whether a Kickstarter project will be successful.
# `backers` and `usd_pledged_real` cannot be used as inputs, and neither can the
# state columns, which are the target. The raw texts are removed here as well,
# so that only numerical values remain.
non_feature_columns = state_columns + ['name', 'blurb', 'backers', 'usd_pledged_real']
X1 = df.drop(columns=non_feature_columns)

# The second kind of model is trained on the text values only.
X2 = df.loc[:, ['name', 'blurb']]

# The output is the final state of the Kickstarter project.
target_columns = df.columns.intersection(state_columns)
Y = df[target_columns]

In [14]:
# Sanity check: show the first rows of the numerical inputs, the text inputs and the targets.
print(X1.head())
print('------------------------------------------------')
print(X2.head())
print('________________________________________________')
print(Y.head())

   usd_goal_real  category_0  category_1  category_2  category_3  category_4  \
0        4.58674         0.0         0.0         0.0         0.0         0.0   
1        3.50000         0.0         0.0         0.0         0.0         0.0   
2       89.82228         0.0         0.0         0.0         0.0         0.0   
3        0.15000         0.0         0.0         0.0         0.0         0.0   
4        0.45966         0.0         0.0         0.0         0.0         0.0   

   category_5  category_6  category_7  category_8    ...     currency_5  \
0         0.0         0.0         0.0         0.0    ...            0.0   
1         0.0         0.0         0.0         0.0    ...            0.0   
2         0.0         0.0         0.0         0.0    ...            1.0   
3         0.0         0.0         0.0         0.0    ...            0.0   
4         0.0         0.0         0.0         0.0    ...            0.0   

   currency_6  currency_7  currency_8  currency_9  currency_10  curr

### Post-preprocessing :'(

In [15]:
# name fields: build a vocabulary from the project names (tokenizer capped at MAX_NUM_WORDS)
name_texts = X2['name']
name_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
name_tokenizer.fit_on_texts(name_texts)

# word -> index mapping over every token seen, used later to build the embedding matrix
name_word_index = name_tokenizer.word_index
print('Found %s unique name tokens.' % len(name_word_index))

# encode every name as a sequence of word indices and pad them to the same length
name_sequences = name_tokenizer.texts_to_sequences(name_texts)
X2_name = pad_sequences(name_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 82866 unique name tokens.


In [16]:
# blurb fields: build a vocabulary from the project blurbs (tokenizer capped at MAX_NUM_WORDS)
blurb_texts = X2['blurb']
blurb_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
blurb_tokenizer.fit_on_texts(blurb_texts)

# word -> index mapping over every token seen, used later to build the embedding matrix
blurb_word_index = blurb_tokenizer.word_index
print('Found %s unique blurb tokens.' % len(blurb_word_index))

# encode every blurb as a sequence of word indices and pad them to the same length
blurb_sequences = blurb_tokenizer.texts_to_sequences(blurb_texts)
X2_blurb = pad_sequences(blurb_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 92961 unique blurb tokens.


### Split the dataset into train, validation and test parts

In [0]:
# splitting: the rows are already shuffled, so the three parts are plain positional ranges
#   [0, v_index)        -> train
#   [v_index, t_index)  -> validation
#   [t_index, end)      -> test
v_index = int(X1.shape[0] * (1-VALIDATION_SPLIT-TEST_SPLIT))
t_index = int(X1.shape[0] * (1-TEST_SPLIT))

train_rows = slice(None, v_index)
valid_rows = slice(v_index, t_index)
test_rows = slice(t_index, None)

X1_test, X2_name_test, X2_blurb_test, Y_test = (
    X1[test_rows], X2_name[test_rows], X2_blurb[test_rows], Y[test_rows])

X1_valid, X2_name_valid, X2_blurb_valid, Y_valid = (
    X1[valid_rows], X2_name[valid_rows], X2_blurb[valid_rows], Y[valid_rows])

X1_train, X2_name_train, X2_blurb_train, Y_train = (
    X1[train_rows], X2_name[train_rows], X2_blurb[train_rows], Y[train_rows])

# standardization to prevent saturation; the scaler is fitted on the training
# part only and then applied unchanged to the validation and test parts
scaler = StandardScaler().fit(X1_train)
X1_train, X1_valid, X1_test = (
    scaler.transform(part) for part in (X1_train, X1_valid, X1_test))

# Keras is fed plain arrays instead of DataFrames
Y_train, Y_valid, Y_test = (part.values for part in (Y_train, Y_valid, Y_test))

## model1: train on numerical features

### Building up the optimization environment and run it

In [18]:
# Source: https://github.com/autonomio/talos/blob/master/talos/examples/models.py

# The talos parameters: every key is read through `params[...]` in do_training below
# (or, presumably, by talos' hidden_layers helper, which receives the same dict).
# We use the Adam optimizer to learn faster in the optimization section.
# Our loss function is categorical_crossentropy, which is why the last layer uses softmax.
# NOTE(review): 'lr' is a tuple while every other entry is a list; presumably Talos treats
# a tuple as a range specification, so confirm its meaning against the Talos 0.4.3 docs.
p = {
    'lr': (1e-2, 1, 5e-3),
    'epochs': [100],
    'first_neuron': [512, 1024, 2048],
    'hidden_layers': [0, 1, 2, 3],
    'dropout': [0.3, 0.4, 0.5],
    'activation': [relu, tanh, softmax],
    'last_activation': [softmax],
    'optimizer': [Adam],
    'batch_size': [64, 128, 256],
    'kernel_initializer': ['random_normal']
}

# Number of input features and number of output columns the models are built for.
print("X1 Shape is {} Y Shape is {}".format(X1_train.shape[1],Y_train.shape[1]))


def do_training(X_train, Y_train, X_valid, Y_valid, params):
    """Build, compile and fit one candidate network for the Talos scan.

    The network is: Dense(first_neuron) -> Dropout -> whatever talos'
    ``hidden_layers`` helper adds -> Dense(number of target columns).

    Args:
        X_train, Y_train: training inputs and targets (2-D, columns = features / classes).
        X_valid, Y_valid: data used for validation and early stopping.
        params: one hyperparameter combination; the keys read here are
            'first_neuron', 'activation', 'kernel_initializer', 'dropout',
            'last_activation', 'optimizer', 'lr', 'epochs' and 'batch_size'.

    Returns:
        (history, model): the object returned by ``model.fit`` and the fitted model.
    """
    n_features = X_train.shape[1]
    n_classes = Y_train.shape[1]

    # Small early-stopping patience and no regularization: the scan only needs to
    # show which setting is the most effective during the first epochs.
    early_stopping = EarlyStopping(monitor='val_acc', patience=5, mode='max')

    model = Sequential()
    first_layer = Dense(params['first_neuron'],
                        activation=params['activation'],
                        input_dim=n_features,
                        use_bias=True,
                        kernel_initializer=params['kernel_initializer'])
    model.add(first_layer)
    model.add(Dropout(params['dropout']))

    # talos helper that receives the model, the parameter dict and the output width
    hidden_layers(model, params, n_classes)

    output_layer = Dense(n_classes,
                         activation=params['last_activation'],
                         kernel_initializer=params['kernel_initializer'])
    model.add(output_layer)

    # lr_normalizer gets the raw 'lr' value together with the optimizer class
    optimizer_class = params['optimizer']
    learning_rate = lr_normalizer(params['lr'], optimizer_class)
    model.compile(optimizer=optimizer_class(lr=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(X_train, Y_train,
                        epochs=params['epochs'],
                        batch_size=params['batch_size'],
                        shuffle=True,
                        validation_data=(X_valid, Y_valid),
                        callbacks=[early_stopping],
                        verbose=2)

    return history, model


# Commented out, because we have results from a previous run 

# t = talos.Scan(X1_train, Y_train,
#               params=p,
#               model=do_training)

X1 Shape is 212 Y Shape is 2


### Visualize the results of the optimization

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt

In [0]:
# talos.Scan writes its results into a CSV file.
# 120518222223_.csv (read from the cloned repository) contains the 300 results of a previous scan,
# so the scan itself does not have to be repeated here.
r = talos.Reporting(os.path.join(GIT_DIR, '120518222223_.csv'))

In [21]:
# Headline numbers of the saved scan: number of rounds, best val_acc, the round
# that reached it, and the five best parameter combinations.
print("The number of trainings: ", r.rounds())
print("The best val_acc: ", r.high())
print("The index of the best round which has the highest val_acc value: ", r.rounds2high())
print("Best parameters: \n")
print(r.best_params(n=5))

The number of trainings:  300
The best val_acc:  0.765295984093522
The index of the best round which has the highest val_acc value:  138
Best parameters: 

[[0 64 '<function relu at 0x7f2babd44bf8>' 0.01 2000 0.4
  "<class 'keras.optimizers.Adam'>" 2048 'random_normal'
  '<function softmax at 0x7f2babd44950>' 0]
 [0 128 '<function relu at 0x7f2babd44bf8>' 0.01 2000 0.3
  "<class 'keras.optimizers.Adam'>" 2048 'random_normal'
  '<function softmax at 0x7f2babd44950>' 1]
 [0 64 '<function relu at 0x7f2babd44bf8>' 0.01 2000 0.5
  "<class 'keras.optimizers.Adam'>" 2048 'random_normal'
  '<function softmax at 0x7f2babd44950>' 2]
 [1 64 '<function relu at 0x7f2babd44bf8>' 0.01 2000 0.3
  "<class 'keras.optimizers.Adam'>" 1024 'random_normal'
  '<function softmax at 0x7f2babd44950>' 3]
 [0 64 '<function relu at 0x7f2babd44bf8>' 0.01 2000 0.4
  "<class 'keras.optimizers.Adam'>" 1024 'random_normal'
  '<function softmax at 0x7f2babd44950>' 4]]


In [22]:
# Correlation of each scanned hyperparameter with val_acc (displayed as the cell's output).
print("The correlation between val_acc and other params")
r.correlate('val_acc')

The correlation between val_acc and other params


hidden_layers   -1.063884e-02
batch_size      -6.059422e-04
lr              -3.336947e-16
epochs                    NaN
dropout         -1.466179e-02
first_neuron    -6.200544e-03
Name: val_acc, dtype: float64

In [0]:
# This plot shows val_acc vs val_loss.
# On Google Colab matplotlib somehow crashed here.
# On a local machine, with talos-0.4.3 and matplotlib-3.0.2, the plots are displayed correctly.
r.plot_regs()
plt.show()

In [0]:
# A heatmap which shows the correlations between val_acc and the other parameters.
# On Google Colab matplotlib somehow crashed here.
# On a local machine, with talos-0.4.3 and matplotlib-3.0.2, the plots are displayed correctly.
r.plot_corr()
plt.show()

### Training the model

In [0]:
# Based on the results of the optimization, we chose a set of parameters which could result in high accuracy

# Callbacks: early stopping on val_acc (default patience) and a checkpoint that
# keeps only the best-scoring model in m1_weights.hdf5
es = EarlyStopping(monitor='val_acc', mode='max')
mcp = ModelCheckpoint(
    filepath='m1_weights.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# One wide hidden layer; L2 weight / L1 activity regularization and dropout
# are used to prevent overfitting
model1 = Sequential([
    Dense(2048,
          activation=relu,
          input_dim=X1_train.shape[1],
          use_bias=True,
          kernel_initializer='random_normal',
          kernel_regularizer=regularizers.l2(1e-6),
          activity_regularizer=regularizers.l1(1e-6)),
    Dropout(0.4),
    Dense(Y_train.shape[1],
          activation=softmax,
          kernel_initializer='random_normal'),
])

In [25]:
# Print the layer-by-layer structure and parameter counts of model1.
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2048)              436224    
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 4098      
Total params: 440,322
Trainable params: 440,322
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Phase 1: train with the Adam optimizer for faster convergence during the first few epochs.
# `es` was created without a patience argument, so this phase stops at the first
# epoch in which val_acc does not improve.
adam = Adam(lr=1e-2, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model1.compile(loss='categorical_crossentropy',
               metrics=['accuracy'],
               optimizer=adam)

history = model1.fit(
    x=X1_train,
    y=Y_train,
    validation_data=(X1_valid, Y_valid),
    batch_size=128,
    epochs=100,
    shuffle=True,
    callbacks=[es, mcp],
    verbose=1,
)

Train on 104001 samples, validate on 29715 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.70856, saving model to m1_weights.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.70856 to 0.72845, saving model to m1_weights.hdf5
Epoch 3/100

Epoch 00003: val_acc improved from 0.72845 to 0.73273, saving model to m1_weights.hdf5
Epoch 4/100

Epoch 00004: val_acc improved from 0.73273 to 0.73697, saving model to m1_weights.hdf5
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.73697


In [27]:
# Upper bound on the number of epochs; early stopping ends the run much sooner.
epochs = 1000

# Phase 2: continue with SGD (Nesterov momentum) for more precise results.
# The model is only re-compiled, not rebuilt, so training continues from the current weights.
sgd = SGD(lr=0.01, decay=0.01/epochs, momentum=0.9, nesterov=True) # decay=lr/epochs

# We are a little bit more patient: allow 5 epochs without improvement.
# `mcp` is the checkpoint callback from the Adam phase, so m1_weights.hdf5 is only
# overwritten when val_acc beats the best value reached so far.
es = EarlyStopping(monitor='val_acc', patience=5, mode='max')
model1.compile(optimizer=sgd,
                loss='categorical_crossentropy', 
                metrics=['accuracy'])

history = model1.fit(X1_train, Y_train, 
                epochs=epochs, 
                batch_size=128,
                shuffle=True,
                validation_data=(X1_valid,Y_valid), 
                callbacks=[es, mcp],
                verbose=1)

Train on 104001 samples, validate on 29715 samples
Epoch 1/1000

Epoch 00001: val_acc improved from 0.73697 to 0.74121, saving model to m1_weights.hdf5
Epoch 2/1000

Epoch 00002: val_acc improved from 0.74121 to 0.74195, saving model to m1_weights.hdf5
Epoch 3/1000

Epoch 00003: val_acc improved from 0.74195 to 0.74397, saving model to m1_weights.hdf5
Epoch 4/1000

Epoch 00004: val_acc improved from 0.74397 to 0.74599, saving model to m1_weights.hdf5
Epoch 5/1000

Epoch 00005: val_acc improved from 0.74599 to 0.74696, saving model to m1_weights.hdf5
Epoch 6/1000

Epoch 00006: val_acc improved from 0.74696 to 0.74982, saving model to m1_weights.hdf5
Epoch 7/1000

Epoch 00007: val_acc did not improve from 0.74982
Epoch 8/1000

Epoch 00008: val_acc did not improve from 0.74982
Epoch 9/1000

Epoch 00009: val_acc did not improve from 0.74982
Epoch 10/1000

Epoch 00010: val_acc did not improve from 0.74982
Epoch 11/1000

Epoch 00011: val_acc did not improve from 0.74982


In [28]:
from keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Let's have a look at how it performs...
# Reload the best checkpoint rather than using the weights of the last epoch.
model1 = load_model('m1_weights.hdf5')
preds = model1.predict(X1_test)
# Note: this is the mean absolute error between the target columns and the
# predicted softmax outputs, not a classification error rate.
err = mean_absolute_error(Y_test, preds)

print("Error on test data: {}".format(err))

Error on test data: 0.3005053357424663


## model2 and model3: train on name and blurb variables

We were inspired by this article on the Keras blog: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [29]:
# first, build an index mapping the words in the embeddings set
# to their embedding vectors

print('Indexing word vectors.')

embeddings_index = {}
# The GloVe file is UTF-8 encoded. Passing the encoding explicitly keeps the
# parsing independent of the platform's default locale encoding (which would
# otherwise raise or mis-decode on non-ASCII tokens, e.g. on Windows).
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # each line is: <word> <EMBEDDING_DIM space-separated floats>
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [0]:
# Our model for embedded words
# This is the same for name and blurb fields
def get_model2(x):
  """Stack the shared 1D-convnet feature extractor on top of ``x``.

  Args:
    x: output tensor of an Embedding layer (the embedded word sequences).

  Returns:
    The output tensor of the final Dense(128) layer; the caller attaches the
    classification layer to it.
  """
  # Start from the argument. The previous version ignored ``x`` and read the
  # global ``embedded_sequences`` instead, which only worked because every
  # caller happened to pass exactly that global.
  x = Conv1D(128, 5, activation='relu')(x)
  x = Conv1D(128, 5, activation='relu')(x)
  x = MaxPooling1D(2)(x)
  x = Conv1D(128, 5, activation='relu')(x)
  # collapse the remaining time axis into one feature vector per sample
  x = GlobalMaxPooling1D()(x)
  x = Dense(128, activation='relu')(x)

  return x

### Model for name field

In [0]:
# Row i of the matrix holds the GloVe vector of the word stored at position i
# of the name tokenizer's word_index; rows that are never assigned stay all-zero.
num_words = 1 + min(MAX_NUM_WORDS, len(name_word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in name_word_index.items():
    if row <= MAX_NUM_WORDS:
        glove_vector = embeddings_index.get(token)
        # words not found in the embedding index keep their all-zero row
        if glove_vector is not None:
            embedding_matrix[row] = glove_vector

# Load the pre-trained word embeddings into an Embedding layer.
# trainable=False keeps the embeddings fixed during training.
# input shape=(MAX_SEQUENCE_LENGTH,) output shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    trainable=False,
    input_length=MAX_SEQUENCE_LENGTH,
    embeddings_initializer=Constant(embedding_matrix),
)

In [0]:
# Setting up the early stopping (default patience) and the model checkpoint,
# which keeps only the best model by val_acc in m2_weights.hdf5
es = EarlyStopping(monitor='val_acc', mode='max')
mcp = ModelCheckpoint(filepath='m2_weights.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# train a 1D convnet with global maxpooling
# input: padded index sequences of the project names -> frozen GloVe embedding
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# classification head on top of the shared convnet from get_model2
preds = Dense(Y_train.shape[1], activation='softmax')(get_model2(embedded_sequences))

model2 = Model(sequence_input, preds)

In [33]:
# Print the layer-by-layer structure and parameter counts of model2.
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          2000100   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 96, 128)           64128     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 92, 128)           82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 46, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 42, 128)           82048     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
__________

In [34]:
# The metric is named 'acc' so that the callbacks can monitor 'val_acc'.
model2.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# Train on the padded name sequences; `es` stops at the first epoch without a
# val_acc improvement and `mcp` keeps the best model on disk.
history = model2.fit(X2_name_train, Y_train,
                    epochs=100,
                    batch_size=128,
                    shuffle=True,
                    validation_data=(X2_name_valid, Y_valid),
                    callbacks=[es, mcp],
                    verbose=1)

Train on 104001 samples, validate on 29715 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.63059, saving model to m2_weights.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.63059 to 0.64476, saving model to m2_weights.hdf5
Epoch 3/100

Epoch 00003: val_acc improved from 0.64476 to 0.64728, saving model to m2_weights.hdf5
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.64728


In [35]:
from keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Let's have a look at how it performs...
# Reload the best checkpoint rather than using the weights of the last epoch.
model2 = load_model('m2_weights.hdf5')
preds = model2.predict(X2_name_test)
# Note: this is the mean absolute error between the target columns and the
# predicted softmax outputs, not a classification error rate.
err = mean_absolute_error(Y_test, preds)

print("Error on test data: {}".format(err))

Error on test data: 0.4188419323731339


### Model for blurb field

In [0]:
# Row i of the matrix holds the GloVe vector of the word stored at position i
# of the blurb tokenizer's word_index; rows that are never assigned stay all-zero.
num_words = 1 + min(MAX_NUM_WORDS, len(blurb_word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in blurb_word_index.items():
    if row <= MAX_NUM_WORDS:
        glove_vector = embeddings_index.get(token)
        # words not found in the embedding index keep their all-zero row
        if glove_vector is not None:
            embedding_matrix[row] = glove_vector

# Load the pre-trained word embeddings into an Embedding layer.
# trainable=False keeps the embeddings fixed during training.
# input shape=(MAX_SEQUENCE_LENGTH,) output shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    trainable=False,
    input_length=MAX_SEQUENCE_LENGTH,
    embeddings_initializer=Constant(embedding_matrix),
)

In [0]:
# Setting up the early stopping (default patience) and the model checkpoint,
# which keeps only the best model by val_acc in m3_weights.hdf5
es = EarlyStopping(monitor='val_acc', mode='max')
mcp = ModelCheckpoint(filepath='m3_weights.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# train a 1D convnet with global maxpooling
# input: padded index sequences of the project blurbs -> frozen GloVe embedding
# (`embedding_layer` is now the one built from the blurb vocabulary)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# classification head on top of the shared convnet from get_model2
preds = Dense(Y_train.shape[1], activation='softmax')(get_model2(embedded_sequences))

model3 = Model(sequence_input, preds)

In [38]:
# The metric is named 'acc' so that the callbacks can monitor 'val_acc'.
model3.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# Train on the padded blurb sequences; `es` stops at the first epoch without a
# val_acc improvement and `mcp` keeps the best model on disk.
history = model3.fit(X2_blurb_train, Y_train,
                    epochs=100,
                    batch_size=128,
                    shuffle=True,
                    validation_data=(X2_blurb_valid, Y_valid),
                    callbacks=[es, mcp],
                    verbose=1)

Train on 104001 samples, validate on 29715 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.65209, saving model to m3_weights.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.65209 to 0.67845, saving model to m3_weights.hdf5
Epoch 3/100

Epoch 00003: val_acc improved from 0.67845 to 0.68373, saving model to m3_weights.hdf5
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.68373


In [39]:
from keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Let's have a look at how it performs...
# Reload the best checkpoint rather than using the weights of the last epoch.
model3 = load_model('m3_weights.hdf5')
preds = model3.predict(X2_blurb_test)
# Note: this is the mean absolute error between the target columns and the
# predicted softmax outputs, not a classification error rate.
err = mean_absolute_error(Y_test, preds)

print("Error on test data: {}".format(err))

Error on test data: 0.3993674789354539


## model4: connecting the above 3 models with a 4th one

In [0]:
from keras.models import load_model

# Load the best checkpoint of each base model and make predictions on each split
# of the dataset; these predictions are the inputs of model4.
# NOTE(review): the *_train predictions come from models that were fitted on that
# same training split, so they may be more confident than the validation/test ones.

# model1: numerical features
model1 = load_model('m1_weights.hdf5')
model1_train = model1.predict(X1_train)
model1_valid = model1.predict(X1_valid)
model1_test = model1.predict(X1_test)

# model2: project names
model2 = load_model('m2_weights.hdf5')
model2_train = model2.predict(X2_name_train)
model2_valid = model2.predict(X2_name_valid)
model2_test = model2.predict(X2_name_test)

# model3: project blurbs
model3 = load_model('m3_weights.hdf5')
model3_train = model3.predict(X2_blurb_train)
model3_valid = model3.predict(X2_blurb_valid)
model3_test = model3.predict(X2_blurb_test)

In [0]:
# Setting up the early stopping (default patience) and the model checkpoint,
# which keeps only the best model by val_acc in m4_weights.hdf5
es = EarlyStopping(monitor='val_acc', mode='max')
mcp = ModelCheckpoint(filepath='m4_weights.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# Creating the model
# One input per base model. The width of each input is taken from the
# corresponding prediction array instead of being hard-coded to 2, so the
# stacked model keeps working if the number of state columns changes
# (the output layer below already follows Y_train.shape[1]).
inp1 = Input(shape=(model1_train.shape[1], ))
inp2 = Input(shape=(model2_train.shape[1], ))
inp3 = Input(shape=(model3_train.shape[1], ))

# Concat the input tensors into one feature vector per sample
cc = concatenate([inp1, inp2, inp3])

# Connect a dense layer to the input; same regularization and dropout as model1
x = Dense(512,
    activation=relu,
    use_bias=True,
    kernel_initializer='random_normal',
    kernel_regularizer=regularizers.l2(1e-6),
    activity_regularizer=regularizers.l1(1e-6))(cc)
x = Dropout(0.4)(x)

# Final prediction over the state columns
x = Dense(Y_train.shape[1],
    activation=softmax,
    kernel_initializer='random_normal')(x)

model4 = Model(inputs=[inp1, inp2, inp3], outputs=x)

In [42]:
# Print the layer-by-layer structure and parameter counts of model4.
model4.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 2)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 2)            0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 2)            0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 6)            0           input_3[0][0]                    
                                                                 input_4[0][0]                    
          

In [43]:
# Phase 1: train with the Adam optimizer for faster convergence during the first few epochs.
# `es` has the default patience, so this phase stops at the first epoch without a val_acc improvement.
adam = Adam(lr=1e-2, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model4.compile(optimizer=adam,
                loss='categorical_crossentropy', 
                metrics=['accuracy'])

# Inputs are the three base models' predictions, in the order of model4's inputs.
history = model4.fit([model1_train, model2_train, model3_train], Y_train, 
                epochs=100, 
                batch_size=128,
                shuffle=True,
                validation_data=([model1_valid, model2_valid, model3_valid],Y_valid), 
                callbacks=[es, mcp],
                verbose=1)

Train on 104001 samples, validate on 29715 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.76820, saving model to m4_weights.hdf5
Epoch 2/100

Epoch 00002: val_acc did not improve from 0.76820


In [44]:
# Upper bound on the number of epochs; early stopping ends the run much sooner.
epochs = 1000

# Phase 2: continue with SGD (Nesterov momentum) for more precise results.
# The model is only re-compiled, not rebuilt, so training continues from the current weights.
sgd = SGD(lr=0.01, decay=0.01/epochs, momentum=0.9, nesterov=True) # decay=lr/epochs

# More patient than in the Adam phase: allow 5 epochs without improvement.
# `mcp` is reused, so m4_weights.hdf5 is only overwritten when val_acc beats the best value so far.
es = EarlyStopping(monitor='val_acc', patience=5, mode='max')
model4.compile(optimizer=sgd,
                loss='categorical_crossentropy', 
                metrics=['accuracy'])

history = model4.fit([model1_train, model2_train, model3_train], Y_train, 
                epochs=epochs, 
                batch_size=128,
                shuffle=True,
                validation_data=([model1_valid, model2_valid, model3_valid],Y_valid), 
                callbacks=[es, mcp],
                verbose=1)

Train on 104001 samples, validate on 29715 samples
Epoch 1/1000

Epoch 00001: val_acc did not improve from 0.76820
Epoch 2/1000

Epoch 00002: val_acc did not improve from 0.76820
Epoch 3/1000

Epoch 00003: val_acc improved from 0.76820 to 0.76887, saving model to m4_weights.hdf5
Epoch 4/1000

Epoch 00004: val_acc did not improve from 0.76887
Epoch 5/1000

Epoch 00005: val_acc did not improve from 0.76887
Epoch 6/1000

Epoch 00006: val_acc did not improve from 0.76887
Epoch 7/1000

Epoch 00007: val_acc did not improve from 0.76887
Epoch 8/1000

Epoch 00008: val_acc did not improve from 0.76887


In [45]:
from keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error

# loading the best checkpoint of every model
model1 = load_model('m1_weights.hdf5')
model2 = load_model('m2_weights.hdf5')
model3 = load_model('m3_weights.hdf5')
model4 = load_model('m4_weights.hdf5')

# make predictions on the test split with the three base models
preds1 = model1.predict(X1_test)
preds2 = model2.predict(X2_name_test)
preds3 = model3.predict(X2_blurb_test)

# Let's have a look at how it performs: model4 combines the three base predictions.
preds = model4.predict([preds1, preds2, preds3])
# Note: this is the mean absolute error between the target columns and the
# predicted softmax outputs, not a classification error rate.
err = mean_absolute_error(Y_test, preds)

print("Error on test data: {}".format(err))

Error on test data: 0.28958811900189785
