In [1]:
#import needed libraries
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import tensorflow as tf
import warnings

# Suppress every warning for the remainder of the session.
warnings.simplefilter('ignore')

# Read the Titanic test CSV into a DataFrame.
dataset = pd.read_csv("csv-data-files/titanic/test.csv")

# Preview the first ten rows to get a feel for the columns and their values.
dataset.head(n=10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [2]:
# Discard the columns that are not used as model inputs.
X = dataset.drop(['SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin'], axis=1)

# Encode Sex as a single 0/1 indicator: 'male' -> 1, anything else -> 0.
# A direct comparison (instead of building both dummy columns and dropping
# 'Sex_female') keeps working when one of the two categories is absent from
# the data, and it yields a numeric dtype on every pandas version.
sex_is_male = (X['Sex'] == 'male').astype('uint8')

# Replace the text column with the indicator. Assigning after the drop appends
# 'Sex' as the last column, which keeps the column order the later cells rely on.
X = X.drop(['Sex'], axis=1)
X['Sex'] = sex_is_male

# Fill missing (NaN) Embarked values with 'S'. First report how many there are.
print("Number of missing values in Embarked is: " + str(X['Embarked'].isnull().sum()))

row_index = X.Embarked.isnull()
X.loc[row_index, 'Embarked'] = 'S'

# One-hot encode Embarked with 'S' as the baseline (all zeros).
# Re-indexing to the two fixed columns guarantees the feature layout does not
# depend on which ports happen to appear in this file: a port that is absent
# still gets its (all-zero) column, and the 'S' column is dropped implicitly.
embarked_dummies = (
    pd.get_dummies(X['Embarked'], prefix="Embarked")
    .reindex(columns=['Embarked_C', 'Embarked_Q'], fill_value=0)
    .astype('uint8')
)

# Swap the text column for the indicator columns (appended on the right).
X = pd.concat([X.drop(['Embarked'], axis=1), embarked_dummies], axis=1)

# Report how many Age values are missing before imputing them.
print("Number of missing values in Age :" + str(X['Age'].isnull().sum()))

# Reduce each name to its title token (e.g. "Kelly, Mr. James" -> "Mr."):
# take the text after the comma, then the first whitespace-separated word.
titles = X['Name'].str.split(',').str[1]
titles = titles.str.split(r'\s+').str[1]

# Overwrite Name by label rather than by a hard-coded column position.
X['Name'] = titles

# Mean of the known ages for each title, also exposed in the
# [title index, mean ages] layout under the name title_mean_ages.
age_by_title = X.groupby('Name')['Age'].mean()
title_mean_ages = [age_by_title.index, age_by_title]

n_training_samples = X.shape[0]  # number of rows in X; retained for later code that uses this name
n_titles = len(title_mean_ages[1])

# Mean of all known ages, captured before imputation so it is not skewed by
# the filled-in values. Used when a title has no known age at all.
overall_mean_age = X['Age'].mean()

# Fill each missing Age with the mean age of the passenger's title, falling
# back to the overall mean so that no NaN is left for the later steps.
# This is done column-wise instead of with per-row `X.Age[i] = ...`
# assignments, which are chained assignments and are not guaranteed to write
# back into X.
X['Age'] = X['Age'].fillna(X['Name'].map(age_by_title)).fillna(overall_mean_age)


# The Name column is no longer needed, so drop it.
X = X.drop(['Name'], axis=1)

# Min-max scale the Age column so that its values lie between 0 and 1.
# NOTE(review): the bounds are computed from this file's own ages; confirm that
# the model was trained on ages scaled the same way.
age_min = X['Age'].min()
age_max = X['Age'].max()
range_ = age_max - age_min

if range_ > 0:
    # Whole-column arithmetic: no per-row chained assignment.
    X['Age'] = (X['Age'] - age_min) / range_
else:
    # All ages identical (or none known): avoid dividing by zero.
    # Known ages become 0.0; missing ones stay NaN.
    X['Age'] = X['Age'] - age_min

Number of missing values in Embarked is: 0
Number of missing values in Age :86


In [3]:
# Keep the ids aside for the submission file; they are not a model input.
passengers = X['PassengerId']
X_withoult_passengerId = X.drop(['PassengerId'], axis=1)

# Convert to a NumPy ndarray. The explicit float32 cast matches the dtype of
# the TensorFlow placeholder the rows are fed into and avoids ending up with
# an object array when the columns have mixed dtypes.
X_numpy = X_withoult_passengerId.values.astype(np.float32)
print("type of X_numpy is : "+ str(type(X_numpy)) + " and shape of it is : " + str(X_numpy.shape))
print(len(X_numpy))

# Show the last feature row, whatever the number of rows is.
X_numpy[-1:]

type of X_numpy is : <type 'numpy.ndarray'> and shape of it is : (418, 5)
418


array([[3.        , 0.09543018, 1.        , 1.        , 0.        ]])

In [4]:
# Input placeholder: float32 rows of 5 features each, any number of rows.
x = tf.placeholder(tf.float32, shape=[None, 5])

# Width of each of the three hidden layers, and of the output layer.
h1_nodes = h2_nodes = h3_nodes = 4
output_nodes = 1

batch_size = 10
import math

# Collects the 0/1 predictions, one entry per input row.
predict_array = []

# Create the model
def model(data, input_nodes=5, relu_on_h3=False):
    """Build a fully connected network with three hidden layers and one output layer.

    Args:
        data: 2-D float tensor whose second dimension is `input_nodes`.
        input_nodes: number of input features (defaults to 5).
        relu_on_h3: whether to apply ReLU to the third hidden layer.
            The default (False) reproduces the existing behaviour: the original
            code assigned the ReLU result to a misspelled name, so the output
            layer has always consumed the un-activated third layer.
            NOTE(review): this looks like a typo rather than a design choice.
            Enable the flag only after confirming that the graph used to train
            the restored checkpoint applied this ReLU; otherwise inference
            would no longer match training.

    Returns:
        The un-activated output tensor of shape [rows, output_nodes].
    """
    # Create the variables layer by layer, weights before bias.
    # The creation order is kept exactly as before because the variables are
    # unnamed: their auto-generated names follow creation order, and restoring
    # a checkpoint with a default Saver matches variables by name.
    layer_sizes = [input_nodes, h1_nodes, h2_nodes, h3_nodes, output_nodes]
    layers = []
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        weights = tf.Variable(tf.random_normal([fan_in, fan_out]))
        bias = tf.Variable(tf.random_normal([fan_out]))
        layers.append({'weights': weights, 'bias': bias})

    hidden_layer_1, hidden_layer_2, hidden_layer_3, output_layer = layers

    # Hidden layers 1 and 2: affine transform followed by ReLU.
    operation_h1 = tf.add(tf.matmul(data, hidden_layer_1['weights']), hidden_layer_1['bias'])
    operation_h1 = tf.nn.relu(operation_h1)

    operation_h2 = tf.add(tf.matmul(operation_h1, hidden_layer_2['weights']), hidden_layer_2['bias'])
    operation_h2 = tf.nn.relu(operation_h2)

    # Hidden layer 3: affine transform; ReLU only when explicitly requested.
    operation_h3 = tf.add(tf.matmul(operation_h2, hidden_layer_3['weights']), hidden_layer_3['bias'])
    if relu_on_h3:
        operation_h3 = tf.nn.relu(operation_h3)

    # Output layer: affine transform only; the caller applies the sigmoid.
    output = tf.add(tf.matmul(operation_h3, output_layer['weights']), output_layer['bias'])

    return output

def training(features):
    """Restore the saved model and append a 0/1 prediction for every row of X_numpy.

    Despite its name, this function performs inference only: it builds the
    network, restores its variables from './train_model.ckpt', evaluates all
    rows of the module-level `X_numpy`, and appends 1 to the module-level
    `predict_array` when the sigmoid of the network output is >= 0.5,
    otherwise 0.

    Args:
        features: tensor handed to `model` as the network input. The rows are
            fed through the module-level placeholder `x`.
    """
    prediction = model(features)

    # Add the sigmoid to the graph once, up front. Creating a new op for every
    # row inside the loop would grow the graph and cost an extra session call
    # per row.
    probability = tf.sigmoid(prediction)

    # Save file location and Saver object (created after the model so that the
    # model's variables exist when the Saver collects them).
    save_file = './train_model.ckpt'
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, save_file)
        # Evaluate every row in a single run; result has one column per output node.
        probabilities = sess.run(probability, feed_dict={x: X_numpy})

    # Threshold at 0.5. A NaN probability compares False and therefore maps to 0.
    for row in probabilities:
        if row[0] >= 0.5:
            predict_array.append(1)
        else:
            predict_array.append(0)

# Run inference over every row of X_numpy; this fills the module-level predict_array.
training(x)
# Show the first stored 0/1 prediction as the cell output.
predict_array[0]

INFO:tensorflow:Restoring parameters from ./train_model.ckpt


1

In [35]:
# Wrap the list of 0/1 predictions in a pandas Series.
series1 = pd.Series(predict_array)

# Sanity checks: number of predictions, and the type of the wrapper.
print(len(predict_array))
type(series1)

418


pandas.core.series.Series

In [36]:
# Sanity check: print the number of passenger ids, which must equal the number
# of predictions for the two to be paired row by row.
print(len(passengers))
# Show the container type of the ids as the cell output.
type(passengers)

418


pandas.core.series.Series

In [40]:
# Assemble the two-column submission table.
# The columns are paired by position from the raw values, so a length mismatch
# raises an error instead of being silently padded with NaN (which would also
# turn the integer ids into floats), and the result does not depend on the two
# Series sharing the same index labels.
df1 = pd.DataFrame(
    {'PassengerId': passengers.values, 'Survived': series1.values},
    columns=['PassengerId', 'Survived']
)
df1.head()

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,0
2,894,0
3,895,0
4,896,0


In [43]:
# Write the PassengerId/Survived table to disk; index=False leaves the row index out of the CSV.
df1.to_csv('solution_file.csv', index=False)