# Deep Neural Network with Categorical Features
In this tutorial, we implement a neural network that uses both continuous and categorical features to classify passengers in the Kaggle Titanic dataset.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.contrib import learn

## The Data
We are going to use the Kaggle Titanic data to predict whether or not a passenger survived, based on attributes such as age, gender, passenger class and the fare they paid. For more information on this data set, see its page on [Kaggle](https://www.kaggle.com/c/titanic/data).
First off we’re going to define all of our columns as 'continuous' or 'categorical'.
* <b>Continuous columns</b> — any numerical value in a continuous range, i.e. a quantity such as an amount of money or an age.
* <b>Categorical columns</b> — a value drawn from a finite set, such as male or female, or the country someone is from.

In [2]:
# Column roles: the continuous columns are fed to the network as numbers,
# the categorical ones are later converted to integer ids and embedded.
continuous_vars = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_vars = ['Pclass', 'Gender', 'Embarked']

# Read the passenger table and split it into features and label.
# pop() removes the 'Survived' column from `data`, so `data` no longer
# contains the label once this cell has run.
data = pd.read_csv('data/titanic.csv')
x = data[continuous_vars + categorical_vars]
y = np.asarray(data.pop('Survived'))

In [3]:
# Model hyper-parameters.
n_classes = 2   # number of output logits (survived / did not survive)
dim_h = 100     # width of each hidden layer
dim_emb = 8     # embedding size used for every categorical variable

# Input width of the first layer: the continuous columns as-is plus one
# dim_emb-wide embedding per categorical column.
dim_in = len(continuous_vars) + len(categorical_vars) * dim_emb

In [4]:
# Hold out 20% of the rows as the test split; the fixed random_state keeps
# the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [5]:
# Replace every categorical column with an integer-id column named '<var>_id'.
# The new columns are appended in categorical_vars order, after the continuous
# columns. Each encoder is fitted on the training split only and kept in
# categorical_var_encoders so its class count can be looked up later.
# NOTE(review): transform() presumably raises if the test split contains a
# label that never occurs in the training split -- confirm for new data.
x_train, x_test = x_train.copy(), x_test.copy()
categorical_var_encoders = {}
for var in categorical_vars:
    id_column = var + '_id'
    encoder = LabelEncoder()
    encoder.fit(x_train[var])
    categorical_var_encoders[var] = encoder
    for split in (x_train, x_test):
        split[id_column] = encoder.transform(split[var])
        del split[var]

In [6]:
def fully_connected(x, dim_in, dim_out, name):
    """Apply an affine layer, x * w + b, whose variables live in scope `name`.

    Args:
        x: input tensor that is matrix-multiplied with a [dim_in, dim_out] weight.
        dim_in: number of rows of the weight matrix.
        dim_out: number of columns of the weight matrix and length of the bias.
        name: variable scope under which 'w' and 'b' are created.

    Returns:
        The tensor matmul(x, w) + b; no activation is applied here.
    """
    # weights start uniformly in [-0.1, 0.1]; the bias uses get_variable's default
    weight_init = tf.random_uniform_initializer(minval=-0.1, maxval=0.1)
    with tf.variable_scope(name):
        w = tf.get_variable('w', shape=[dim_in, dim_out], initializer=weight_init)
        b = tf.get_variable('b', shape=[dim_out])
        return tf.matmul(x, w) + b


In [7]:
# Create model
def neural_network(x, dim_in=dim_in, dim_h=dim_h, dim_out=n_classes):
    """Build the classifier graph and return its output logits.

    The first len(continuous_vars) columns of `x` are used unchanged; each of
    the following columns is read as the integer id of one categorical
    variable (in categorical_vars order) and embedded into dim_emb dimensions.
    The concatenated features go through two ReLU hidden layers and a linear
    output layer.

    Args:
        x: 2-D tensor laid out as [continuous columns | categorical id columns].
        dim_in: width of the concatenated feature vector fed to the first layer.
        dim_h: width of both hidden layers.
        dim_out: number of output units.

    Returns:
        The output of the final linear layer (no softmax applied).

    Note:
        The default values are captured when this definition is executed, so
        re-run this cell after changing the hyper-parameters.
    """
    # Derive the column offset from the column list instead of hard-coding it,
    # so it stays in step with dim_in when continuous_vars changes.
    n_continuous = len(continuous_vars)

    # append continuous variables
    final_features = [x[:, :n_continuous]]

    # embed categorical variables into distributed representation
    for i, var in enumerate(categorical_vars):
        ids = tf.cast(x[:, n_continuous + i], tf.int64)
        feature = learn.ops.categorical_variable(ids,
                                                 len(categorical_var_encoders[var].classes_),
                                                 embedding_size=dim_emb,
                                                 name=var)
        final_features.append(feature)

    final_features = tf.concat(1, final_features)

    # 1st hidden layer with ReLU
    h1 = fully_connected(final_features, dim_in, dim_h, 'h1')
    h1 = tf.nn.relu(h1)

    # 2nd hidden layer with ReLU
    h2 = fully_connected(h1, dim_h, dim_h, 'h2')
    h2 = tf.nn.relu(h2)

    # output layer with linear
    out = fully_connected(h2, dim_h, dim_out, 'out')

    return out

In [8]:
# Graph inputs: one column per feature (continuous values first, then the
# categorical ids) and one integer class label per row. The width is derived
# from the column lists so it cannot drift from them.
# NOTE: this rebinds `x` and `y`, which until now held the raw features and
# labels; the cells that build the train/test split cannot be re-run after this
# one without first reloading the data.
x = tf.placeholder(tf.float32, [None, len(continuous_vars) + len(categorical_vars)])
y = tf.placeholder(tf.int64, [None])

In [9]:
# Construct model with default value
# `out` is the network's final linear-layer output; the loss and the
# prediction ops are both derived from it.
# NOTE(review): re-running this cell in the same kernel presumably fails
# because the layer variables already exist in their scopes -- confirm.
out = neural_network(x)

In [10]:
# Convert both splits to plain arrays; the training loop slices them by row
# range and passes them through feed_dict.
x_train, x_test = np.asarray(x_train), np.asarray(x_test)

In [11]:
# List every variable in the graph with its shape, as a check on layer sizes.
# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
for var in tf.all_variables():
    print("%s %s" % (var.op.name, var.get_shape()))

Pclass/Pclass_embeddings (3, 8)
Gender/Gender_embeddings (2, 8)
Embarked/Embarked_embeddings (3, 8)
h1/w (28, 100)
h1/b (100,)
h2/w (100, 100)
h2/b (100,)
out/w (100, 2)
out/b (2,)


In [12]:
# loss and optimizer
# Mean cross-entropy between the network output and the integer class labels.
# logits/labels are passed by keyword so the call does not depend on the
# positional order of the two arguments.
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=out, labels=y))
train_op = tf.train.RMSPropOptimizer(learning_rate=0.003).minimize(loss)

# Test model
# predicted class = index of the largest output along axis 1
pred = tf.argmax(out, 1)

correct_pred = tf.equal(pred, y)
incorrect_pred = tf.not_equal(pred, y)
# Calculate accuracy
# fraction of rows whose predicted class equals the label
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [13]:
batch_size = 100
num_epochs = 500
# NOTE(review): the output saved under this cell lists epochs up to 901, so it
# was produced with a larger num_epochs than the value above; re-run the cell
# to refresh it.

# launch the graph
with tf.Session() as sess:
    # initialize tensor variables
    tf.initialize_all_variables().run()

    n_train = x_train.shape[0]
    # training cycle
    for epoch in range(num_epochs):
        avg_loss = 0.
        # Walk the training set in order. The last slice may be shorter than
        # batch_size, so the rows left over after the final full batch are
        # trained on as well instead of being skipped in every epoch.
        for start in range(0, n_train, batch_size):
            x_batch = x_train[start:start + batch_size]
            y_batch = y_train[start:start + batch_size]
            # run optimization op (backprop) and loss op (to get loss value)
            _, c = sess.run([train_op, loss], feed_dict={x: x_batch, y: y_batch})
            # weight each batch by its size so avg_loss is the per-example mean
            avg_loss += c * len(x_batch) / float(n_train)

        # report the first epoch and then every 100th one
        if epoch % 100 == 0:
            print("Epoch %d, Loss: %.3f" % (epoch + 1, avg_loss))
    print("Finished training!")

    # Evaluate inside the session, while the trained variables are still alive.
    print("\nTrain accuracy: %s" % sess.run(accuracy, {x: x_train, y: y_train}))

    print("\nTest accuracy: %s" % sess.run(accuracy, {x: x_test, y: y_test}))

Epoch 1, Loss: 0.769
Epoch 101, Loss: 0.314
Epoch 201, Loss: 0.265
Epoch 301, Loss: 0.244
Epoch 401, Loss: 0.241
Epoch 501, Loss: 0.253
Epoch 601, Loss: 0.263
Epoch 701, Loss: 0.201
Epoch 801, Loss: 0.231
Epoch 901, Loss: 0.192
Finished training!

Train accuracy: 0.914326

Test accuracy: 0.804469
