In [1]:
!rm -r ./cleaned_kickstarted_dataset train_dataset.csv

rm: cannot remove './cleaned_kickstarted_dataset': No such file or directory
rm: cannot remove 'train_dataset.csv': No such file or directory


# Kickstarter project
## Model optimization and training

In [0]:
import numpy as np
import pandas as pd
import os

### Loading data

In [0]:
# Name of the directory that holds the cloned dataset repository;
# paths to files inside it are built with os.path.join(GIT_DIR, ...).
GIT_DIR = 'cleaned_kickstarted_dataset'

In [4]:
# Fetch the dataset repository into ./cleaned_kickstarted_dataset
!git clone https://github.com/Strongkong/cleaned_kickstarted_dataset

Cloning into 'cleaned_kickstarted_dataset'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 20 (delta 0), reused 3 (delta 0), pack-reused 16[K
Unpacking objects: 100% (20/20), done.


In [5]:
zip = os.path.join(GIT_DIR, 'train_dataset.csv.zip')

!unzip $zip

Archive:  cleaned_kickstarted_dataset/train_dataset.csv.zip
  inflating: train_dataset.csv       


### Import dataset

In [0]:
# Load the final version of the dataset: it contains only numeric values and is ready for training.
# The first CSV column becomes the row index; ',' is already the default separator.
df = pd.read_csv('train_dataset.csv', index_col=0)

### Import packages

In [7]:
# We use Talos for hyperparameter optimization because of its great syntax and rich built in visualization tools.
!pip install talos

Collecting talos
  Downloading https://files.pythonhosted.org/packages/16/7e/eae6dc099c48cd663f61d569208799d6628ac0843be09d28f7f84d65a8d4/talos-0.4.3.tar.gz
Collecting astetik (from talos)
  Downloading https://files.pythonhosted.org/packages/fb/4a/17c487680c9f3a507da45013e2c1256ee4157f4d67b92e7995078eec914b/astetik-1.9.5.tar.gz
Collecting chances (from talos)
[?25l  Downloading https://files.pythonhosted.org/packages/f0/4e/85014772bbf026903080beecb36681dbceb28b14f96491f42673b95ddcf6/chances-0.1.1-py3-none-any.whl (52kB)
[K    100% |████████████████████████████████| 61kB 5.8MB/s 
[?25hCollecting kerasplotlib (from talos)
  Downloading https://files.pythonhosted.org/packages/e8/2e/b8628bfef6a817da9be863f650cf67187676b10d27d94b23f248da35d2b4/kerasplotlib-0.1.4.tar.gz
Collecting wrangle (from talos)
  Downloading https://files.pythonhosted.org/packages/a3/d4/4137b26b28500399d7f921e296a2346cfd8a8a693e6a3928a305b6568e7a/wrangle-0.3.1.tar.gz
Collecting geonamescache (from astetik->talos)


In [8]:
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import regularizers
from keras.activations import relu, tanh, softmax
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD, Adam

from tensorflow import set_random_seed

from sklearn.preprocessing import StandardScaler

import talos
from talos.model import lr_normalizer, hidden_layers

# Seed TensorFlow and NumPy with the same fixed value to make runs repeatable.
SEED = 123
set_random_seed(SEED)
np.random.seed(SEED)

Using TensorFlow backend.


In [0]:
# To check whether we use the GPU for training...

# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

### Shuffle, and define inputs and outputs

In [0]:
# Shuffle the rows, then renumber the index from zero
df = df.sample(frac=1).reset_index(drop=True)

# We would like to predict whether a Kickstarter project will be successful.
# The 'state_*' columns hold the final state, so they are the target and must not be inputs.
is_state_column = df.columns.str.startswith('state_')
state_columns = df.columns[is_state_column].values.tolist()

# 'backers' and 'usd_pledged_real' are removed from the inputs as well
# (we can't even know them when the prediction is made).
unknown_at_prediction = ['backers', 'usd_pledged_real']
X = df.drop(state_columns + unknown_at_prediction, axis=1)

# The output is the final state of the Kickstarter project
target_columns = df.columns.intersection(state_columns)
Y = df[target_columns]

In [11]:
# Show the last rows of the inputs and of the targets, separated by a ruler line
ruler = '------------------------------------------------------------------------'
print(X.tail(), ruler, Y.tail(), sep='\n')

        usd_goal_real  category_0  category_1  category_2  category_3  \
148569        5.00000         0.0         0.0         0.0         0.0   
148570       49.50876         0.0         0.0         0.0         0.0   
148571       15.00000         0.0         0.0         0.0         0.0   
148572       38.00000         0.0         0.0         0.0         0.0   
148573        0.50000         0.0         0.0         0.0         0.0   

        category_4  category_5  category_6  category_7  category_8    ...      \
148569         0.0         0.0         0.0         0.0         0.0    ...       
148570         0.0         0.0         0.0         0.0         0.0    ...       
148571         0.0         0.0         0.0         0.0         0.0    ...       
148572         0.0         0.0         0.0         0.0         0.0    ...       
148573         0.0         0.0         0.0         0.0         0.0    ...       

        blurb_246  blurb_247  blurb_248  blurb_249  blurb_250  blurb_251  

### Split the dataset into train, validation, and test parts

In [0]:
# Fractions of the rows reserved for the test and the validation part
test_split = 0.1
valid_split = 0.2

# Row positions at which the validation part and the test part begin
v_index = int(X.shape[0] * (1-valid_split-test_split))
t_index = int(X.shape[0] * (1-test_split))

# Positional slices: [0, v_index) -> train, [v_index, t_index) -> valid, [t_index, end) -> test
X_train, Y_train = X.iloc[:v_index], Y.iloc[:v_index]
X_valid, Y_valid = X.iloc[v_index:t_index], Y.iloc[v_index:t_index]
X_test, Y_test = X.iloc[t_index:], Y.iloc[t_index:]

# Every row must land in exactly one of the three parts
assert len(X_train) + len(X_valid) + len(X_test) == len(X)

# standardization to prevent saturation
# The scaler is fitted on the training part only and then applied to all three parts.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# convert the targets to ndarray for talos
# (`.values` replaces DataFrame.as_matrix(), which is deprecated and was removed in pandas 1.0)
Y_test = Y_test.values
Y_valid = Y_valid.values
Y_train = Y_train.values

### Build the optimization environment and run it

In [13]:
# Source: https://github.com/autonomio/talos/blob/master/talos/examples/models.py

# The Talos parameters: the hyperparameter values to try; do_training() receives
# one combination of them as its `params` dict.
# We use the Adam optimizer to learn faster in the optimization section.
# Our loss function is categorical_crossentropy, which is why the last layer uses softmax.
p = {
    # NOTE(review): a tuple, unlike the lists below -- presumably a Talos range spec
    # (min, max, steps); confirm that 5e-3 is the intended third value.
    'lr': (1e-2, 1, 5e-3),
    'epochs': [100],                      # passed to model.fit(epochs=...)
    'first_neuron': [512, 1024, 2048],    # units of the first Dense layer
    'hidden_layers': [0, 1, 2, 3],        # not read directly in do_training; presumably used by talos' hidden_layers()
    'dropout': [0.3, 0.4, 0.5],           # rate of the Dropout layer after the first Dense layer
    'activation': [relu, tanh, softmax],  # activation of the first Dense layer
    'last_activation': [softmax],         # activation of the output layer
    'optimizer': [Adam],                  # optimizer class, instantiated with the normalized lr
    'batch_size': [64, 128, 256],         # passed to model.fit(batch_size=...)
    'kernel_initializer': ['random_normal']
}

# Size of the second dimension of the inputs and of the targets
print("X Shape is {} Y Shape is {}".format(X_train.shape[1],Y_train.shape[1]))


def do_training(X_train, Y_train, X_valid, Y_valid, params):
    """Build, compile and fit one candidate network for the hyperparameter scan.

    Args:
        X_train, Y_train: training inputs and targets; their second dimension
            sizes the input and the output layer respectively.
        X_valid, Y_valid: data handed to ``fit`` as ``validation_data``.
        params: dict holding one hyperparameter combination. Keys read here:
            'first_neuron', 'activation', 'kernel_initializer', 'dropout',
            'last_activation', 'optimizer', 'lr', 'epochs', 'batch_size'.
            The whole dict is also passed on to ``hidden_layers``.

    Returns:
        (history, model): the value returned by ``model.fit`` and the fitted model.
    """
    # Short early-stopping patience and no regularization: we are only interested
    # in which setting is most effective in the first epochs.
    early_stop = EarlyStopping(monitor='val_acc', patience=5, mode='max')

    n_inputs = X_train.shape[1]
    n_outputs = Y_train.shape[1]
    initializer = params['kernel_initializer']

    model = Sequential()
    model.add(Dense(params['first_neuron'],
                    input_dim=n_inputs,
                    activation=params['activation'],
                    kernel_initializer=initializer,
                    use_bias=True))
    model.add(Dropout(params['dropout']))

    # Talos helper; receives the model, the params and the output width
    # (presumably inserts the requested number of hidden layers).
    hidden_layers(model, params, n_outputs)

    model.add(Dense(n_outputs,
                    activation=params['last_activation'],
                    kernel_initializer=initializer))

    # 'optimizer' is a class; instantiate it with the learning rate returned by lr_normalizer
    optimizer_cls = params['optimizer']
    learning_rate = lr_normalizer(params['lr'], optimizer_cls)
    model.compile(optimizer=optimizer_cls(lr=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(X_train, Y_train,
                        validation_data=(X_valid, Y_valid),
                        epochs=params['epochs'],
                        batch_size=params['batch_size'],
                        shuffle=True,
                        callbacks=[early_stop],
                        verbose=2)

    return history, model


# t = talos.Scan(X_train, Y_train,
#               params=p,
#               model=do_training)

X Shape is 724 Y Shape is 2


### Visualize the results of the optimization

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt

In [0]:
# talos.Scan writes its results to a CSV file.
# 120518222223_.csv (shipped in the cloned repository) contains the results of 300 trainings.
scan_log = os.path.join(GIT_DIR, '120518222223_.csv')
r = talos.Reporting(scan_log)

In [16]:
# (label, accessor) pairs; each accessor is called and printed in this order
headline_stats = [
    ("The number of trainings: ", r.rounds),
    ("The best val_acc: ", r.high),
    ("The index of the best round which has the highest val_acc value: ", r.rounds2high),
]
for label, accessor in headline_stats:
    print(label, accessor())

print("Best parameters: ")
print("\t", r.best_params(n=1))

The number of trainings:  300
The best val_acc:  0.765295984093522
The index of the best round which has the highest val_acc value:  138
Best parameters: 
	 [[0 64 '<function relu at 0x7f2babd44bf8>' 0.01 2000 0.4
  "<class 'keras.optimizers.Adam'>" 2048 'random_normal'
  '<function softmax at 0x7f2babd44950>' 0]]


In [17]:
# Print a caption, then let the notebook display the value returned by r.correlate('val_acc')
print("The correlation between val_acc and other params")
r.correlate('val_acc')

The correlation between val_acc and other params


hidden_layers   -1.063884e-02
batch_size      -6.059422e-04
lr              -3.336947e-16
epochs                    NaN
dropout         -1.466179e-02
first_neuron    -6.200544e-03
Name: val_acc, dtype: float64

In [0]:
# Talos report plot of val_acc against val_loss
r.plot_regs()
plt.show()  # render the figure in the notebook

In [0]:
# Talos report heatmap of the correlations between val_acc and the other parameters
r.plot_corr()
plt.show()  # render the figure in the notebook

### Training the model

In [0]:
# Based on the results of the optimization, we chose a set of parameters which could result in high accuracy:
# a single Dense layer of 2048 relu units followed by dropout 0.4 and a softmax output layer.

# Setting up the early stopping and model checkpoint.
# Both watch 'val_acc' in 'max' mode; the checkpoint writes to 'weights.hdf5' with save_best_only=True.
es = EarlyStopping(monitor='val_acc' ,patience=5, mode='max')
mcp = ModelCheckpoint(filepath='weights.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# We use regularization and dropout to prevent overfitting:
# an L2 penalty on the first layer's kernel and an L1 penalty on its activations (both 1e-6).
model = Sequential()
model.add(Dense(2048, 
                activation=relu, 
                input_dim=X_train.shape[1], 
                use_bias=True, 
                kernel_initializer='random_normal',
                kernel_regularizer=regularizers.l2(1e-6),
                activity_regularizer=regularizers.l1(1e-6)))
model.add(Dropout(0.4))

# Output layer: as wide as the target array's second dimension, softmax to match categorical_crossentropy
model.add(Dense(Y_train.shape[1], 
                activation=softmax, 
                kernel_initializer='random_normal'))


# Two optimizers are prepared here; the training cells compile the model with `adam` first and `sgd` afterwards.
sgd = SGD(lr=0.01, decay=0.01/500, momentum=0.9, nesterov=True) # decay=lr/epochs
adam = Adam(lr=1e-2, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

In [25]:
# Phase 1: train with the Adam optimizer for faster convergence during the first few epochs
model.compile(optimizer=adam,
                loss='categorical_crossentropy', 
                metrics=['accuracy'])

# At most 10 epochs; `es` and `mcp` are both passed as callbacks for this run.
history = model.fit(X_train, Y_train, 
                epochs=10, 
                batch_size=128,
                shuffle=True,
                validation_data=(X_valid,Y_valid), 
                callbacks=[es, mcp],
                verbose=1)

Train on 104001 samples, validate on 29715 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.70624, saving model to weights.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.70624 to 0.72728, saving model to weights.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.72728 to 0.73549, saving model to weights.hdf5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.73549
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.73549
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.73549
Epoch 7/10

Epoch 00007: val_acc improved from 0.73549 to 0.73623, saving model to weights.hdf5
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.73623
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.73623
Epoch 10/10

Epoch 00010: val_acc improved from 0.73623 to 0.73754, saving model to weights.hdf5


In [26]:
# Phase 2: continue with SGD for more precise results.
# The same `model` object is compiled again, this time with `sgd`; the `es` and `mcp`
# callback objects from the first phase are reused.
model.compile(optimizer=sgd,
                loss='categorical_crossentropy', 
                metrics=['accuracy'])

# At most 500 epochs, matching the 500 used in the SGD decay term.
# NOTE: `history` is reassigned here, so the History object of the Adam phase
# is no longer reachable under that name.
history = model.fit(X_train, Y_train, 
                epochs=500, 
                batch_size=128,
                shuffle=True,
                validation_data=(X_valid,Y_valid), 
                callbacks=[es, mcp],
                verbose=1)

Train on 104001 samples, validate on 29715 samples
Epoch 1/500

Epoch 00001: val_acc improved from 0.73754 to 0.74481, saving model to weights.hdf5
Epoch 2/500

Epoch 00002: val_acc improved from 0.74481 to 0.74585, saving model to weights.hdf5
Epoch 3/500

Epoch 00003: val_acc did not improve from 0.74585
Epoch 4/500

Epoch 00004: val_acc improved from 0.74585 to 0.74915, saving model to weights.hdf5
Epoch 5/500

Epoch 00005: val_acc did not improve from 0.74915
Epoch 6/500

Epoch 00006: val_acc did not improve from 0.74915
Epoch 7/500

Epoch 00007: val_acc improved from 0.74915 to 0.75003, saving model to weights.hdf5
Epoch 8/500

Epoch 00008: val_acc improved from 0.75003 to 0.75033, saving model to weights.hdf5
Epoch 9/500

Epoch 00009: val_acc did not improve from 0.75033
Epoch 10/500

Epoch 00010: val_acc improved from 0.75033 to 0.75164, saving model to weights.hdf5
Epoch 11/500

Epoch 00011: val_acc did not improve from 0.75164
Epoch 12/500

Epoch 00012: val_acc did not improve