In [1]:
import pandas as pd
import numpy as np
import csv as csv
import tensorflow as tf

In [2]:
# Data cleanup
# TRAIN DATA
train_df = pd.read_csv('data/train.csv', header=0)        # Load the train file into a dataframe

# I need to convert all strings to integer classifiers.
# I need to fill in the missing values of the data and make it complete.

# female = 0, Male = 1
train_df['Gender'] = train_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

# Embarked from 'C', 'Q', 'S'
# Note this is not ideal: in translating categories to numbers, Port "2" is not 2 times greater than Port "1", etc.

# All missing Embarked -> just make them embark from most common place
if len(train_df.Embarked[ train_df.Embarked.isnull() ]) > 0:
    train_df.Embarked[ train_df.Embarked.isnull() ] = train_df.Embarked.dropna().mode().values

Ports = list(enumerate(np.unique(train_df['Embarked'])))    # determine all values of Embarked,
Ports_dict = { name : i for i, name in Ports }              # set up a dictionary in the form  Ports : index
train_df.Embarked = train_df.Embarked.map( lambda x: Ports_dict[x]).astype(int)     # Convert all Embark strings to int

# All the ages with no data -> make the median of all Ages
median_age = train_df['Age'].dropna().median()
if len(train_df.Age[ train_df.Age.isnull() ]) > 0:
    train_df.loc[ (train_df.Age.isnull()), 'Age'] = median_age

# Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender)
train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

# print train_df.head()

labels = train_df['Survived']
train_df = train_df.drop('Survived',axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
def normalize_data(dataframe):
    std = dataframe.std(0)
    mean = dataframe.mean(0)
    return (dataframe - mean ) / std

train_df = normalize_data(train_df)
# print normalize_data(train_df).head

In [4]:
input_size = 7
num_labels = 2
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
print labels.shape

(891, 2)


In [5]:
""" Divide into training set and validation set"""
train_size = 600

train_df = train_df.values.astype(np.float32)

def randomize(dataset, labels):
  permutation = np.random.permutation(labels.shape[0])
  shuffled_dataset = dataset[permutation,:]
  shuffled_labels = labels[permutation]
  return shuffled_dataset, shuffled_labels

train_df, labels = randomize(train_df, labels)

valid_dataset = train_df[train_size::]
train_dataset = train_df[0:train_size]

valid_labels = labels[train_size::]
train_labels = labels[0:train_size]

In [6]:
# print train_dataset.shape, train_labels.shape
# print valid_dataset.shape, valid_labels.shape

print train_dataset[0:5]
print train_labels[0:5]

[[-1.56522787  2.7372694  -0.47427881  0.76719898  0.59917361 -1.94121289
   0.73728102]
 [-1.56522787 -0.10457867 -0.47427881 -0.47340772 -0.12484967  0.58562523
   0.73728102]
 [-0.36915749  1.20113528 -0.47427881 -0.47340772 -0.37639198  0.58562523
  -1.35481262]
 [ 0.82691282 -0.64222562 -0.47427881 -0.47340772 -0.49411377  0.58562523
  -1.35481262]
 [-1.56522787  0.12584145 -0.47427881 -0.47340772  0.36808875  0.58562523
   0.73728102]]
[[ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]]


In [7]:
print train_dataset.dtype
print train_labels.dtype

float32
float32


In [8]:
# TEST DATA
test_df = pd.read_csv('data/test.csv', header=0)        # Load the test file into a dataframe

# I need to do the same with the test data now, so that the columns are the same as the training data
# I need to convert all strings to integer classifiers:
# female = 0, Male = 1
test_df['Gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

# Embarked from 'C', 'Q', 'S'
# All missing Embarked -> just make them embark from most common place
if len(test_df.Embarked[ test_df.Embarked.isnull() ]) > 0:
    test_df.Embarked[ test_df.Embarked.isnull() ] = test_df.Embarked.dropna().mode().values
# Again convert all Embarked strings to int
test_df.Embarked = test_df.Embarked.map( lambda x: Ports_dict[x]).astype(int)


# All the ages with no data -> make the median of all Ages
median_age = test_df['Age'].dropna().median()
if len(test_df.Age[ test_df.Age.isnull() ]) > 0:
    test_df.loc[ (test_df.Age.isnull()), 'Age'] = median_age

# All the missing Fares -> assume median of their respective class
if len(test_df.Fare[ test_df.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0,3):                                              # loop 0 to 2
        median_fare[f] = test_df[ test_df.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0,3):                                              # loop 0 to 2
        test_df.loc[ (test_df.Fare.isnull()) & (test_df.Pclass == f+1 ), 'Fare'] = median_fare[f]

# Collect the test data's PassengerIds before dropping it
ids = test_df['PassengerId'].values
# Remove the Name column, Cabin, Ticket, and Sex (since I copied and filled it to Gender)
test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 


# The data is now ready to go. So lets fit to the train, then predict to the test!
# Convert back to a numpy array
test_data = test_df.values.astype(np.float32)

In [9]:
graph = tf.Graph() 
num_hidden = 20

with graph.as_default():
    
    # Input data.
    # Load the training, validation data into constants that are
    # attached to the graph.
    tf_train_dataset = tf.constant(train_dataset)
    tf_train_labels = tf.constant(train_labels)
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_valid_labels = tf.constant(valid_labels)
    tf_test_dataset = tf.constant(test_data)
    
    # Variables
    # Parameters which are going to be trained.
    weights = tf.Variable(tf.truncated_normal([input_size, num_hidden]))
    biases = tf.Variable(tf.zeros([num_hidden]))
    weights2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation.
    hiddenLayer = tf.nn.relu(tf.matmul(tf_train_dataset, weights) + biases)
    secondLayer = tf.matmul(hiddenLayer, weights2) + biases2
  
    loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(secondLayer, tf_train_labels))
  
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for training and validation sets.
    
    train_prediction = tf.nn.softmax(secondLayer)
    valid_prediction = tf.nn.softmax(
        tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights) + biases), weights2) + biases2 )
    test_prediction = tf.nn.softmax(
        tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights) + biases), weights2) + biases2 )

In [10]:
num_steps = 401

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

with tf.Session(graph=graph) as session:
    # This is a one-time coperation which ensures the parameters get initialized as
    # we described in the graph: random weights for the matrix, zeros for the
    # biases. 
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        # Run the computations. We tell .run() that we want to run the optimizer,
        # and get the loss value and the training predictions returned as numpy
        # arrays.
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        if (step % 100 == 0):
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(
                predictions, train_labels))
            # Calling .eval() on valid_prediction is basically like calling run(), but
            # just to get that one numpy array. Note that it recomputes all its graph
            # dependencies.
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
    
    print 'Predicting...'
    output = np.argmax(test_prediction.eval(),1)
    
    predictions_file = open("deeplearning.csv", "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))
    predictions_file.close()
    print 'Done.'

Initialized
Loss at step 0: 3.824332
Training accuracy: 38.7%
Validation accuracy: 68.7%


Loss at step 100: 0.403742
Training accuracy: 82.2%
Validation accuracy: 81.4%


Loss at step 200: 0.375527
Training accuracy: 83.2%
Validation accuracy: 79.7%


Loss at step 300: 0.361495
Training accuracy: 84.3%
Validation accuracy: 78.7%


Loss at step 400: 0.352226
Training accuracy: 85.2%
Validation accuracy: 79.7%
Predicting...
Done.


In [11]:
batch_size = 25
regularization = 0.001
hiddenLayer1 = 15
hiddenLayer2 = 15

graph = tf.Graph() 
with graph.as_default():
    
    # Input data.
    # Load the training, validation data into constants that are
    # attached to the graph.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, input_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_valid_labels = tf.constant(valid_labels)
    tf_test_dataset = tf.constant(test_data)
    
    # Variables
    # Parameters which are going to be trained.
  
    layer1_weights = tf.Variable(tf.truncated_normal([input_size, hiddenLayer1]))
    layer1_biases = tf.Variable(tf.zeros([hiddenLayer1]))
    layer2_weigths = tf.Variable(tf.truncated_normal([hiddenLayer1, hiddenLayer2]))
    layer2_biases = tf.Variable(tf.zeros([hiddenLayer2]))
    layer3_weights = tf.Variable(tf.truncated_normal([hiddenLayer2, num_labels]))
    layer3_biases = tf.Variable(tf.zeros([num_labels]))
    
    # Model
    def model(data,training = False):
        # Computing first layer
        layer1 = tf.nn.relu(tf.matmul(data, layer1_weights) + layer1_biases)
        if training:
            layer1 = tf.nn.dropout(layer1, 0.75)
        # Computing second layer
        layer2 = tf.nn.relu(tf.matmul(layer1, layer2_weigths) + layer2_biases)
        if training:
            layer2 = tf.nn.dropout(layer2, 0.75)
        
        return tf.matmul(layer2, layer3_weights) + layer3_biases
  
    # Training computations
    logits = model(tf_train_dataset,True)
    loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + regularization * ( tf.nn.l2_loss(layer1_weights) + tf.nn.l2_loss(layer2_weigths) + tf.nn.l2_loss(layer3_weights))
    
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for training and validation sets.
    
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [12]:
num_steps = 1051

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 50 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            # print('Learning rate at step %d %f' % (step, learning_rate.eval() ))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
            print 'Predicting...'
    
    output = np.argmax(test_prediction.eval(),1)
    
    predictions_file = open("doubleHiddenLayer.csv", "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))
    predictions_file.close()
    print 'Done.'

Initialized
Minibatch loss at step 0: 9.081197
Minibatch accuracy: 40.0%
Validation accuracy: 65.6%
Predicting...


Minibatch loss at step 50: 0.549266
Minibatch accuracy: 88.0%
Validation accuracy: 79.7%
Predicting...


Minibatch loss at step 100: 0.748935
Minibatch accuracy: 64.0%
Validation accuracy: 78.7%
Predicting...


Minibatch loss at step 150: 0.652652
Minibatch accuracy: 72.0%
Validation accuracy: 78.7%
Predicting...


Minibatch loss at step 200: 0.573830
Minibatch accuracy: 80.0%
Validation accuracy: 82.8%
Predicting...


Minibatch loss at step 250: 0.462683
Minibatch accuracy: 92.0%
Validation accuracy: 79.7%
Predicting...


Minibatch loss at step 300: 0.633374
Minibatch accuracy: 68.0%
Validation accuracy: 82.5%
Predicting...


Minibatch loss at step 350: 0.556082
Minibatch accuracy: 76.0%
Validation accuracy: 79.0%
Predicting...


Minibatch loss at step 400: 0.477148
Minibatch accuracy: 84.0%
Validation accuracy: 81.4%
Predicting...


Minibatch loss at step 450: 0.440259
Minibatch accuracy: 88.0%
Validation accuracy: 80.8%
Predicting...


Minibatch loss at step 500: 0.351321
Minibatch accuracy: 92.0%
Validation accuracy: 82.5%
Predicting...


Minibatch loss at step 550: 0.459708
Minibatch accuracy: 76.0%
Validation accuracy: 81.1%
Predicting...


Minibatch loss at step 600: 0.695222
Minibatch accuracy: 72.0%
Validation accuracy: 80.8%
Predicting...


Minibatch loss at step 650: 0.436098
Minibatch accuracy: 84.0%
Validation accuracy: 81.8%
Predicting...


Minibatch loss at step 700: 0.375798
Minibatch accuracy: 84.0%
Validation accuracy: 82.8%
Predicting...


Minibatch loss at step 750: 0.789337
Minibatch accuracy: 68.0%
Validation accuracy: 81.1%
Predicting...
Minibatch loss at step 800: 0.757821
Minibatch accuracy: 64.0%
Validation accuracy: 81.4%
Predicting...


Minibatch loss at step 850: 0.585755
Minibatch accuracy: 84.0%
Validation accuracy: 82.1%
Predicting...


Minibatch loss at step 900: 0.484738
Minibatch accuracy: 84.0%
Validation accuracy: 79.7%
Predicting...


Minibatch loss at step 950: 0.685635
Minibatch accuracy: 76.0%
Validation accuracy: 82.1%
Predicting...


Minibatch loss at step 1000: 0.551824
Minibatch accuracy: 76.0%
Validation accuracy: 80.1%
Predicting...


Minibatch loss at step 1050: 0.526847
Minibatch accuracy: 80.0%
Validation accuracy: 81.1%
Predicting...
Done.
