In [2]:
# Case study on employing Machine Learning to predict Titanic survival

"""Unfortunately, there were not enough lifeboats for everyone on board, resulting 
in the deaths of 1,502 out of 2,224 passengers and crew. 
While there was an element of luck in survival, it appears that certain groups of 
people were more likely to survive than others.

Here, your challenge is to build a predictive model that can give a solution to the 
question, “What types of people were more likely to survive?” using passenger data 
(i.e. name, age, sex, socio-economic class, etc.)."""

'Unfortunately, there were not enough lifeboats for everyone on board, resulting \nin the deaths of 1,502 out of 2,224 passengers and crew. \nWhile there was an element of luck in survival, it appears that certain groups of \npeople were more likely to survive than others.\n\nHere, your challenge is to build a predictive model that can give a solution to the \nquestion, “What types of people were more likely to survive?” using passenger data \n(i.e. name, age, sex, socio-economic class, etc.).'

In [3]:
# I am going to use a classic dataset of Titanic passenger survival records, available here: "https://github.com/amankharwal/Website-data/blob/master/titanic.rar"

In [4]:
# Load the training and test datasets with pandas, then preview the first four training rows:

import pandas as pd
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
train[:4]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [5]:
# Count the missing values in each column of the training data:

train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Count the missing values in each column of the test data:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
# Age looks like a useful predictor, but some of its values are missing.
# Missing-data imputation can be done in several ways; this takes a simple
# route: the median Age of the training set replaces the nulls in both the
# training and the test table.

impute_value = train['Age'].median()
train['Age'], test['Age'] = (
    table['Age'].fillna(impute_value) for table in (train, test)
)

In [8]:
# Before specifying the models, add an IsFemale column as an integer encoding of 'Sex': 1 where Sex is 'female', 0 otherwise

train['IsFemale'] = train['Sex'].eq('female').astype(int)
test['IsFemale'] = test['Sex'].eq('female').astype(int)

In [9]:
# Choose the model variables and pull them out of the DataFrames as NumPy arrays

predictors = ['Pclass', 'IsFemale', 'Age']
y_train = train['Survived'].to_numpy()
X_train = train[predictors].to_numpy()
X_test = test[predictors].to_numpy()
X_train[:5]

array([[ 3.,  0., 22.],
       [ 1.,  1., 38.],
       [ 3.,  1., 26.],
       [ 1.,  1., 35.],
       [ 3.,  0., 35.]])

In [10]:
# Model setup: import the LogisticRegression class from scikit-learn and
# create a model instance with the default constructor arguments:

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [11]:
# Fit the model to the training data with scikit-learn's fit method:

model.fit(X=X_train, y=y_train)

LogisticRegression()

In [12]:
# Predict on the test dataset with model.predict and show the first ten predictions:

y_predict = model.predict(X=X_test)
y_predict[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)