# Deep Neural Network with Categorical Features
In this tutorial, we are going to implement a neural network that uses both continuous and categorical features to classify passengers in the Kaggle Titanic dataset.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.contrib import learn

## Data preprocessing
We are going to use the Kaggle Titanic data to predict whether or not a passenger will survive based on attributes such as age, gender, passenger class and the fare they paid. For more information on this data set, see its page on [Kaggle](https://www.kaggle.com/c/titanic/data).
First, we are going to define each of our features as either 'continuous' or 'categorical'.

In [2]:
# Load the raw table and declare which columns are numeric inputs and which
# are categories that will later be turned into embeddings.
data = pd.read_csv('data/titanic.csv')
continuous_features = ['Age', 'Parch', 'SibSp', 'Fare']
categorical_features = ['Pclass', 'Gender', 'Embarked']

# Model inputs: continuous columns first, categorical columns after them.
feature_columns = continuous_features + categorical_features
x = data[feature_columns]

# Labels: `pop` also removes the 'Survived' column from `data`.
y = np.asarray(data.pop('Survived'))

In [3]:
# Length of the embedding vector learned for each categorical value.
dim_emb = 8
# Width of the network input: the raw continuous columns plus one
# dim_emb-wide embedding for every categorical feature.
dim_in = len(continuous_features) + dim_emb * len(categorical_features)
# Number of units in each hidden layer.
dim_h = 64
# Number of output classes (size of the logits vector).
n_classes = 2

In [4]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split
# the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.1,
    random_state=42,
)

In [5]:
# Swap every categorical column for an integer-id column named '<col>_id'.
# Work on copies so the frames produced by the split are left untouched.
x_train = x_train.copy()
x_test = x_test.copy()

# Number of distinct values per categorical feature (sizes the embeddings).
num_class = {}
for var in categorical_features:
    id_col = var + '_id'

    # The encoder is fitted on the training split only and then applied
    # to both splits.
    le = LabelEncoder()
    le.fit(x_train[var])
    x_train[id_col] = le.transform(x_train[var])
    x_test[id_col] = le.transform(x_test[var])

    # Drop the original column; the id column has been appended at the end,
    # so the categorical ids keep the order of `categorical_features`.
    del x_train[var]
    del x_test[var]

    num_class[var] = len(le.classes_)

## Define and build model
We will develop a feed-forward neural network with embedding features.

In [6]:
def fully_connected(x, dim_in, dim_out, name):
    """Apply an affine layer ``x * w + b`` under the variable scope ``name``.

    Args:
        x: 2-D input tensor whose second dimension is ``dim_in``.
        dim_in: number of input units (rows of the weight matrix).
        dim_out: number of output units (columns of the weight matrix).
        name: variable scope that owns the layer's ``w`` and ``b``.

    Returns:
        The pre-activation output tensor; no non-linearity is applied here.
    """
    weight_init = tf.random_uniform_initializer(minval=-0.1, maxval=0.1)

    with tf.variable_scope(name):
        # Weights start uniformly in [-0.1, 0.1]; the bias is given no
        # explicit initializer, so TensorFlow's default one is used.
        w = tf.get_variable('w', shape=[dim_in, dim_out], initializer=weight_init)
        b = tf.get_variable('b', shape=[dim_out])

        return tf.matmul(x, w) + b


In [7]:
def embedding(x, num_class, dim_emb, name):
    """Look up a learned embedding vector for every id in ``x``.

    Args:
        x: tensor of category ids; it is cast to int64 before the lookup.
        num_class: number of distinct ids (rows of the embedding matrix).
        dim_emb: length of each embedding vector.
        name: variable scope that owns the embedding matrix ``w``.

    Returns:
        Tensor of embeddings with one ``dim_emb``-long vector per id.
    """
    # The cast happens outside the variable scope, as ids may arrive as floats.
    ids = tf.cast(x, tf.int64)

    with tf.variable_scope(name):
        # One row per category, initialised uniformly in [-0.1, 0.1].
        table = tf.get_variable(
            'w',
            shape=[num_class, dim_emb],
            initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1),
        )
        return tf.nn.embedding_lookup(table, ids)

In [8]:
# Create model
def neural_network(x, dim_in=dim_in, dim_h=dim_h, dim_out=n_classes):
    """Build a two-hidden-layer network over continuous and embedded inputs.

    The leading ``len(continuous_features)`` columns of ``x`` are used as they
    are; each following column is treated as the id of one categorical feature
    (in ``categorical_features`` order) and replaced by its embedding.

    Args:
        x: 2-D tensor, continuous columns first, categorical ids after them.
        dim_in: width of the concatenated feature vector fed to the first layer.
        dim_h: number of units in each hidden layer.
        dim_out: number of output units.

    Returns:
        The linear output of the last layer (logits), no softmax applied.

    Note:
        The defaults are taken from the module-level ``dim_in``, ``dim_h`` and
        ``n_classes`` at the time this cell is executed.
    """
    n_cont = len(continuous_features)

    # continuous features are passed through unchanged
    parts = [x[:, :n_cont]]

    # embed each categorical id column into a continuous vector space
    parts.extend(
        embedding(x[:, n_cont + i], num_class[var], dim_emb, name=var)
        for i, var in enumerate(categorical_features)
    )

    all_features = tf.concat(1, parts)

    # 1st and 2nd hidden layers, both with ReLU
    hidden = tf.nn.relu(fully_connected(all_features, dim_in, dim_h, 'h1'))
    hidden = tf.nn.relu(fully_connected(hidden, dim_h, dim_h, 'h2'))

    # linear output layer
    return fully_connected(hidden, dim_h, dim_out, 'out')

In [9]:
# Graph inputs. Each row of `x` holds the continuous columns followed by the
# categorical id columns, so its width is derived from the two feature lists
# instead of being hard-coded; it then stays in sync if a feature is added.
# NOTE: from this cell on, `x` and `y` refer to these placeholders rather
# than to the DataFrame / label array created when the data was loaded.
x = tf.placeholder(tf.float32, [None, len(continuous_features) + len(categorical_features)])
y = tf.placeholder(tf.int64, [None])

# Construct model with default value
out = neural_network(x)

In [10]:
# Loss: mean softmax cross-entropy between the logits and the integer labels.
xent = tf.nn.sparse_softmax_cross_entropy_with_logits(out, y)
loss = tf.reduce_mean(xent)

# Optimizer: RMSProp step that minimizes the loss.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.002)
train_op = optimizer.minimize(loss)

# Evaluation: predicted class is the arg-max logit; accuracy is the fraction
# of predictions equal to the label.
pred = tf.argmax(out, 1)
correct_pred = tf.equal(pred, y)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

## Check variables

In [None]:
# List every trainable variable with its shape as a sanity check on the graph.
# A single-argument print(...) produces the same "name shape" line on
# Python 2 and Python 3, whereas the print statement only parses on Python 2.
for var in tf.trainable_variables():
    print("%s %s" % (var.op.name, var.get_shape()))

Pclass/w (3, 8)
Gender/w (2, 8)
Embarked/w (3, 8)
h1/w (28, 64)
h1/b (64,)
h2/w (64, 64)
h2/b (64,)
out/w (64, 2)
out/b (2,)


## Train and test the model

In [None]:
batch_size = 100
num_epoch = 500
num_train = x_train.shape[0]
# Ceil division: the trailing partial batch is kept, so no training rows are
# skipped and a training set smaller than batch_size still gets one batch.
num_ter_per_epoch = (num_train + batch_size - 1) // batch_size

# convert pandas dataframe to numpy array
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)

# launch the graph
with tf.Session() as sess:
    # initialize tensor variables
    tf.initialize_all_variables().run()

    # training cycle
    for e in range(num_epoch):
        avg_loss = 0.

        # loop over all batches (slicing past the end just yields a short batch)
        for i in range(num_ter_per_epoch):
            start = i * batch_size
            x_batch = x_train[start:start + batch_size]
            y_batch = y_train[start:start + batch_size]

            # run optimization op (backprop) and loss op (to get loss value)
            _, c = sess.run([train_op, loss], feed_dict={x: x_batch, y: y_batch})

            # Weight each batch loss by its row count so the shorter final
            # batch does not skew the per-epoch average.
            avg_loss += c * len(x_batch) / float(num_train)

        # report the loss every 100 epochs
        if e % 100 == 0:
            print("Epoch %d, Loss: %.3f" % (e+1, avg_loss))
    print("Finished training!")

    # Single-argument print(...) keeps the output identical on Python 2 and 3.
    print("\nTrain accuracy: %s" % sess.run(accuracy, {x: x_train, y: y_train}))

    print("\nTest accuracy: %s" % sess.run(accuracy, {x: x_test, y: y_test}))

Epoch 1, Loss: 0.886
Epoch 101, Loss: 0.344
Epoch 201, Loss: 0.296
Epoch 301, Loss: 0.279
Epoch 401, Loss: 0.261
