In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

data_loc = "data/titanic/"
titanic = pd.read_csv(data_loc + "titanic.csv")

titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Convert sex to a binary value of either 1 for male or 0 for female
def filter_sex(df):
    sex_series = pd.Series( np.where( df.Sex == 'male' , 1 , 0 ) , name = 'Sex' )
    return sex_series

In [6]:
print(filter_sex(titanic).head())

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64


In [7]:
# Fill missing values of Age with the average of Age (mean)
def filter_age(df):
    age_df = pd.DataFrame() # Create a new dataframe to store our results
    age_df[ 'Age' ] = df.Age.fillna(df.Age.mean())
    return age_df

In [8]:
print(filter_age(titanic).head())

    Age
0  22.0
1  38.0
2  26.0
3  35.0
4  35.0


In [9]:
# Fill missing values of fare with the average of the fare (mean)
def filter_fare(df):
    fare_df = pd.DataFrame() # Create a new dataframe to store our results
    fare_df['fare'] = df.Fare.fillna(df.Fare.mean())
    return fare_df

In [10]:
print(filter_fare(titanic).head())

      fare
0   7.2500
1  71.2833
2   7.9250
3  53.1000
4   8.0500


In [11]:
def filter_title(df):
    title_df = df[ 'Name' ].map( lambda name: name.split( ',' )[1].split( '.' )[0].strip() )

    # a map of more aggregated titles
    Title_Dictionary = {
                        "Capt":       "Officer",
                        "Col":        "Officer",
                        "Major":      "Officer",
                        "Jonkheer":   "Royalty",
                        "Don":        "Royalty",
                        "Sir" :       "Royalty",
                        "Dr":         "Officer",
                        "Rev":        "Officer",
                        "the Countess":"Royalty",
                        "Dona":       "Royalty",
                        "Mme":        "Mrs",
                        "Mlle":       "Miss",
                        "Ms":         "Mrs",
                        "Mr" :        "Mr",
                        "Mrs" :       "Mrs",
                        "Miss" :      "Miss",
                        "Master" :    "Master",
                        "Lady" :      "Royalty"

                        }

    # we map each title
    title_df = title_df.map( Title_Dictionary )
    title_df = pd.get_dummies( title_df )
    return title_df

In [12]:
print(filter_title(titanic).head())

   Master  Miss  Mr  Mrs  Officer  Royalty
0       0     0   1    0        0        0
1       0     0   0    1        0        0
2       0     1   0    0        0        0
3       0     0   0    1        0        0
4       0     0   1    0        0        0


In [13]:
def filter_cabin(df):
    cabin_df = pd.DataFrame()

    # replacing missing cabins with U (for Uknown)
    cabin_df[ 'Cabin' ] = df.Cabin.fillna( 'U' )

    # mapping each Cabin value with the cabin letter
    cabin_df[ 'Cabin' ] = cabin_df[ 'Cabin' ].map( lambda c : c[0] )

    # dummy encoding ...
    cabin_df = pd.get_dummies( cabin_df['Cabin'] , prefix = 'Cabin' )
    return cabin_df

In [14]:
print(filter_cabin(titanic).head())

   Cabin_A  Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  \
0        0        0        0        0        0        0        0        0   
1        0        0        1        0        0        0        0        0   
2        0        0        0        0        0        0        0        0   
3        0        0        1        0        0        0        0        0   
4        0        0        0        0        0        0        0        0   

   Cabin_U  
0        1  
1        0  
2        1  
3        0  
4        1  


In [15]:
def cleanTicket( ticket ):
    ticket = ticket.replace( '.' , '' )
    ticket = ticket.replace( '/' , '' )
    ticket = ticket.split()
    ticket = map( lambda t : t.strip() , ticket )
    ticket = list(filter( lambda t : not t.isdigit() , ticket ))
    if len( ticket ) > 0:
        return ticket[0]
    else: 
        return 'XXX'

def filter_ticket(df):
    ticket_df = pd.DataFrame()

    # Extracting dummy variables from tickets:
    ticket_df[ 'Ticket' ] = df[ 'Ticket' ].map( cleanTicket )
    ticket_df = pd.get_dummies( ticket_df[ 'Ticket' ] , prefix = 'Ticket' )
    
    return ticket_df

In [16]:
print(filter_ticket(titanic).head())

   Ticket_A4  Ticket_A5  Ticket_AS  Ticket_C  Ticket_CA  Ticket_CASOTON  \
0          0          1          0         0          0               0   
1          0          0          0         0          0               0   
2          0          0          0         0          0               0   
3          0          0          0         0          0               0   
4          0          0          0         0          0               0   

   Ticket_FC  Ticket_FCC  Ticket_Fa  Ticket_LINE     ...      Ticket_SOPP  \
0          0           0          0            0     ...                0   
1          0           0          0            0     ...                0   
2          0           0          0            0     ...                0   
3          0           0          0            0     ...                0   
4          0           0          0            0     ...                0   

   Ticket_SOTONO2  Ticket_SOTONOQ  Ticket_SP  Ticket_STONO  Ticket_STONO2  \
0        

In [17]:
def filter_family(df):
    family_df = pd.DataFrame()

    # introducing a new feature : the size of families (including the passenger)
    family_df[ 'FamilySize' ] = df[ 'Parch' ] + df[ 'SibSp' ] + 1

    # introducing other features based on the family size
    family_df[ 'Family_Single' ] = family_df[ 'FamilySize' ].map( lambda s : 1 if s == 1 else 0 )
    family_df[ 'Family_Small' ]  = family_df[ 'FamilySize' ].map( lambda s : 1 if 2 <= s <= 4 else 0 )
    family_df[ 'Family_Large' ]  = family_df[ 'FamilySize' ].map( lambda s : 1 if 5 <= s else 0 )
    return family_df

In [18]:
print(filter_family(titanic).head())

   FamilySize  Family_Single  Family_Small  Family_Large
0           2              0             1             0
1           2              0             1             0
2           1              1             0             0
3           2              0             1             0
4           1              1             0             0


In [19]:
def filter_survived(df):
    df[ 'Survived' ].map( lambda s : 1 if s == 1 else 0 )

In [58]:
# Now mash it all together into one big dataframe
def format_data(df):
    sex_series = filter_sex(df)
    age_df = filter_age(df)
    fare_df = filter_fare(df)
    title_df = filter_title(df)
    cabin_df = filter_cabin(df)
    ticket_df = filter_ticket(df)
    family_df = filter_family(df)
    
    x_data = pd.concat( [age_df, fare_df, title_df, cabin_df, ticket_df, family_df, sex_series ] , axis=1 )
    if 'Survived' in df:
        y_data = df.Survived
    else:
        y_data = None
    return x_data, y_data

In [59]:
print(format_data(titanic)[1].head())

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [60]:
import tensorflow as tf

data_x, data_y = format_data(titanic)
train_x , test_x , train_y , test_y = train_test_split( data_x , data_y , train_size = .8 )

learning_rate = 0.001
epochs = 1000

input_nodes = 53
output_nodes = 1
batch_size = 10

x = tf.placeholder('float', [None, input_nodes])
y = tf.placeholder('float', [None, output_nodes])

hidden_1_nodes = 100

hlw_1 = tf.Variable(tf.random_normal([input_nodes, hidden_1_nodes])) # The weights of the first hidden layer
hlb_1 = tf.Variable(tf.random_normal([hidden_1_nodes])) # The biases of the first hidden layer

outw = tf.Variable(tf.random_normal([hidden_1_nodes, output_nodes])) # The weights of the output layer
outb = tf.Variable(tf.random_normal([output_nodes])) # The biases of the output layer

In [61]:
def forward_propagation(x):
    hlout_1 = tf.nn.sigmoid(tf.matmul(x, hlw_1) + hlb_1) # input multiplied by hidden layer 1 weights. Add bias. Apply sigmoid
    output = tf.nn.sigmoid(tf.matmul(hlout_1, outw) + outb) # hidden layer 2 output multiplied by output layer weights. add bias. DON'T add sigmoid
    return output

In [62]:
def backward_propagation(y_, y, learning_rate):
    cost = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(logits=y_, labels=y)) #Error/loss function. Cost is the total error over the batch
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost) #Adam optimizer is a type of gradient descent. We want it to use the learning rate and minimize that cost
    return optimizer, cost

In [63]:
def calculate_accuracy(y_, y):
    correct = tf.equal(tf.round(y_), tf.round(y)) # argmax finds the index of the largest value in a tensor. So we're comparing the two using tf.equal
    accuracy = tf.reduce_mean(tf.cast(correct, 'float')) # We've got a tensor of true and false statements now. Let's find the average amount
    return accuracy, correct

In [64]:
y_ = forward_propagation(x)
optimizer, cost = backward_propagation(y_, y, learning_rate)
accuracy = calculate_accuracy(y_, y)

In [None]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

cost_per_epoch = [] 
accuracy_per_epoch = []

# Loop over the dataset 'epochs' amount of times
epoch_cost = 0
for epoch in range(epochs):
    # Go through the dataset in batches
    for i in range(int(len(train_x)/batch_size)):
        batch_x = train_x[i*batch_size:i*batch_size + batch_size]
        batch_y = train_y[i*batch_size:i*batch_size + batch_size]
        
        batch_x = np.reshape(batch_x, [len(batch_x), input_nodes])
        batch_y = np.reshape(batch_y, [len(batch_y), output_nodes])
        
        _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y}) # Run our forward prop and backward prop over this batch of training data
        epoch_cost += c
    if (epoch != 0 and epoch % 100 == 0 or epoch == epochs):
        # Test the accuracy of our model against the training set
        epoch_accuracy, _ = sess.run(accuracy, feed_dict={x: np.reshape(train_x, [len(train_x), input_nodes]), y: np.reshape(train_y, [len(train_y), output_nodes])}) # Pass all the labels through to calculate accuracy
        print('Epoch', epoch, 'completed out of', epochs, '\ncost:', epoch_cost, '\ntraining accuracy:', epoch_accuracy, '\n')
        # Store the results of this epoch so we can make nice graphs later
        cost_per_epoch.append(epoch_cost)
        accuracy_per_epoch.append(epoch_accuracy) 
        epoch_cost = 0

  return getattr(obj, method)(*args, **kwds)


Epoch 100 completed out of 1000 
cost: 4474.04432097 
training accuracy: 0.856742 

Epoch 200 completed out of 1000 
cost: 4276.47513393 
training accuracy: 0.879214 

Epoch 300 completed out of 1000 
cost: 4222.00954205 
training accuracy: 0.894663 

Epoch 400 completed out of 1000 
cost: 4191.18669003 
training accuracy: 0.91573 

Epoch 500 completed out of 1000 
cost: 4172.57460147 
training accuracy: 0.912921 

Epoch 600 completed out of 1000 
cost: 4159.51380223 
training accuracy: 0.911517 

Epoch 700 completed out of 1000 
cost: 4147.29245582 
training accuracy: 0.917135 

Epoch 800 completed out of 1000 
cost: 4135.77127883 
training accuracy: 0.919944 

Epoch 900 completed out of 1000 
cost: 4128.69613525 
training accuracy: 0.919944 



In [None]:
plt.plot(cost_per_epoch)
plt.show()
print("Cost each 100 iterations")
plt.plot(accuracy_per_epoch)
plt.show()
print("Accuracy each 100 iteration")

In [None]:
(test_accuracy, test_correct_list), outputs = sess.run([accuracy, y_], feed_dict={x: np.reshape(test_x, [len(test_x), input_nodes]), y: np.reshape(test_y, [len(test_y), output_nodes])})
print("training accuracy:", accuracy_per_epoch[-1])
print("testing accuracy:", test_accuracy)

In [None]:
def predict_passenger_outcome(passengers):
    
    data, _ = format_data(passengers)
    result = sess.run(y_, feed_dict={x: data[:].reshape(len(data, input_nodes))})
    return result

def setup_passenger(Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked):
    passenger = {
        'Pclass':Pclass,
        'Name':Name,
        'Sex':Sex,
        'Age':Age,
        'SibSp':SibSp,
        'Parch':Parch,
        'Ticket':Ticket,
        'Fare': Fare,
        'Cabin': Cabin,
        'Embarked':Embarked
    }
    return passenger

In [None]:
jack = setup_passenger(3, 'Braund, Mr. Owen Harris', 'male', 22.0, 1, 0, 'A/5 21171', 7.2500, 'NaN', 'S')
rose = setup_passenger(3, 'Braund, Mr. Owen Harris', 'female', 22.0, 1, 0, 'A/5 21171', 7.2500, 'NaN', 'S')

passengers = pd.DataFrame()
passengers.append(jack)
passengers.append(rose)

print(predict_passenger_outcome(passengers))
