# Starter code to approach the Titanic dataset with a different perspective

In [1]:
import pandas as pd
import numpy as np
from keras import Sequential
from keras.layers import Dense
from sklearn.preprocessing import  MinMaxScaler
from keras.wrappers.scikit_learn import KerasClassifier # keras in-built classifier function 

Using TensorFlow backend.


In [2]:
df = pd.read_csv('../input/train.csv')
df1 = pd.read_csv('../input/test.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
X = df.drop(columns = ['PassengerId', 'Ticket', 'Survived', 'Cabin'])# Id and Ticket are least likely to show any trend which is useful for the prediction. Cabin is missing a lot of values and can't contribute efficiently in predicting 'Survived'.
y = df['Survived'] # Label which is to be predicted using features X.

# Preprocessing

In [5]:
def preprocessing(X, y):
    title = []
    for i in range(X.shape[0]):
        title.append(X['Name'][i].split(', ')[1].split('.')[0]) # extracting designations from name column.
    X['Title'] = title
    # Following 4 lines are to classify the designations into most common and rare ones.
    X['Title'] = X['Title'].replace(['Lady', 'the Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    X['Title'] = X['Title'].replace('Mlle', 'Miss')
    X['Title'] = X['Title'].replace('Ms', 'Miss')
    X['Title'] = X['Title'].replace('Mme', 'Mrs')
    X = X.drop(columns = ['Name'])
    # Following for loop replaces the missing values of a column by the mean of existing values or their mode value depending on their datatypes(int and float => mean, object => mode).
    for i in X:
        if X[i].dtype == 'int64':
            X[i].fillna(X[i].mean(), inplace = True)
        elif X[i].dtype == 'float64':
            X[i].fillna(X[i].mean(), inplace = True)
        else:
            X[i].fillna(X[i].mode()[0], inplace = True)
    X = pd.get_dummies(X) # One-Hot Encoding of X,i.e., columns with object data types will be converted into an array of 1s and 0s, representing their presence along a row.
    sc= MinMaxScaler() # Scaling inputs is always an advantageous thing to do as it can increase the accuracy of the model by a considerable extent. 
    # MinMaxScaler scales using the logic: X = (X - X.min())/(X.max() - X.min()).
    X= sc.fit_transform(X)
    y= y.values.reshape(-1,1)
    y=sc.fit_transform(y)
    return X, y            

In [6]:
X, y = preprocessing(X, y)

In [7]:
X.shape

(891, 15)

# Defining the model 

In [8]:
# building the constituents of our classifier model,defining the number of hidden layers, number of nodes and activation function in each of them.
def build_classifier():
    classifier = Sequential()
    classifier.add(Dense(units=15, input_dim=15))
    classifier.add(Dense(units=50))
    classifier.add(Dense(units = 50))
    classifier.add(Dense(units = 50))
    classifier.add(Dense(units=1, activation = 'relu'))
    classifier.compile(optimizer='adam', loss='mean_absolute_error',  metrics=['mae','accuracy']) # optimizer: adam works on minizing our loss function.
    return classifier

In [9]:
classifier = KerasClassifier(build_fn=build_classifier, batch_size=60,epochs=35) # defining the model.

In [10]:
classifier.fit(X,y) # fitting the input using the model.

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f21468ba630>

In [11]:
df1.head() # analyzing the test data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [12]:
X_test = df1.drop(columns = ['PassengerId', 'Ticket','Cabin'])

In [13]:
X_test, y_test = preprocessing(X_test, y = pd.Series(0)) # Pre-preocessing the test data. y = pd.Series(0) is used here just to ensure that there are 2 inputs to the funcion, it has no significance in our model.

In [14]:
y_pred = classifier.predict(np.array(X_test)) # predicting the required label,i.e., 'Survived'.
y_pred = y_pred.reshape(418, ).astype('int64')

In [15]:
submission = pd.DataFrame({'PassengerId':df1['PassengerId'],'Survived':y_pred}) # preparing a submission dataframe.

In [16]:
submission.head() 

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [17]:
filename = 'submission.csv'
submission.to_csv(filename, index = False) # converting the submission file to .csv format.

In [18]:
df2 = pd.read_csv('../input/gender_submission.csv')
df2.head()
y_test = df2['Survived']

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       266
           1       1.00      0.98      0.99       152

    accuracy                           0.99       418
   macro avg       0.99      0.99      0.99       418
weighted avg       0.99      0.99      0.99       418

