In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the passenger records from the CSV file and preview the first rows
passengers = pd.read_csv(filepath_or_buffer='passengers.csv')
passengers.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S
4,5,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S


In [3]:
# Count the missing values in each column
passengers.isnull().sum()

PassengerId      0
Survived         1
Pclass           1
Name             1
Sex              1
Age            178
SibSp            1
Parch            1
Ticket           1
Fare             1
Cabin          688
Embarked         3
dtype: int64

In [4]:
# Update sex column to numerical: male -> 0, female -> 1, missing -> 1.
# Only encode while the column still holds the raw labels. Re-running the
# mapping on already-encoded values would turn every 0/1 into NaN and then
# fill the whole column with 1.
if not pd.api.types.is_numeric_dtype(passengers['Sex']):
    passengers['Sex'] = (
        passengers['Sex']
        .map({'male': 0, 'female': 1})
        .fillna(1)
        .astype(int)
    )

In [5]:
# Fill the NaN values in the Age and Survived columns with their rounded means,
# then store both columns as integers.
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on the attribute-accessed column: with pandas
# Copy-on-Write that call only modifies a temporary Series, the NaNs stay in
# the frame, and the astype(int) conversion then fails.
rounded_mean_age = int(round(passengers['Age'].mean()))
rounded_mean_survived = int(round(passengers['Survived'].mean()))

passengers['Age'] = passengers['Age'].fillna(rounded_mean_age).astype(int)
passengers['Survived'] = passengers['Survived'].fillna(rounded_mean_survived).astype(int)

In [6]:
# Add one 0/1 indicator column per passenger class.
# A row whose Pclass is missing compares unequal to every class, so it
# receives 0 in all three columns.
passengers['FirstClass'] = passengers['Pclass'].eq(1).astype('int64')
passengers['SecondClass'] = passengers['Pclass'].eq(2).astype('int64')
passengers['ThirdClass'] = passengers['Pclass'].eq(3).astype('int64')
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass,ThirdClass
0,1,0,3.0,"Braund, Mr. Owen Harris",0,22,1.0,0.0,A/5 21171,7.25,,S,0,0,1
1,2,1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1.0,0.0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3.0,"Heikkinen, Miss. Laina",1,26,0.0,0.0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1.0,0.0,113803,53.1,C123,S,1,0,0
4,5,0,3.0,"Allen, Mr. William Henry",0,35,0.0,0.0,373450,8.05,,S,0,0,1


In [7]:
# Model inputs: encoded sex, age and the three class indicators.
# Model target: the survival flag.
features = passengers.loc[:, ["Sex", "Age", "FirstClass", "SecondClass", "ThirdClass"]]
outcomes = passengers["Survived"]

In [8]:
# Perform train, test, split.
# random_state pins the shuffle so the split, and every score and
# coefficient derived from it, is the same on each Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, outcomes, random_state=42
)

In [10]:
# Standardize the features to mean = 0 and standard deviation = 1.
# The scaler learns its statistics from the training split only and then
# applies those same statistics to both splits.
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

In [11]:
# Build a logistic regression with default settings and train it on the
# scaled training split; the trailing expression displays the fitted model.
model = LogisticRegression().fit(train_features, train_labels)
model

LogisticRegression()

In [12]:
# Score the model against the same data it was trained on
model.score(X=train_features, y=train_labels)

0.796711509715994

In [14]:
# Score the model against the held-out test split
model.score(X=test_features, y=test_labels)

0.820627802690583

In [34]:
# Analyze the coefficients: pair each feature name with its learned weight.
# The names are read from `features`, whose column order is the order the
# model was trained on.
print(dict(zip(features.columns, model.coef_[0])))


{'Sex': 1.1164746919049529, 'Age': -0.4704494619593107, 'FirstClass': 0.8917745702994773, 'SecondClass': 0.46122382596527084, 'ThirdClass': -0.023876919769469977}


In [65]:
# Sample passenger features, in the same column order as the training features:
# [Sex, Age, FirstClass, SecondClass, ThirdClass]   (Sex: 0 = male, 1 = female)
Jack = np.array([0, 20, 0, 0, 1])  # male, 20, third class
Rose = np.array([1, 17, 1, 0, 0])  # female, 17, first class
You = np.array([0, 20, 0, 1, 0])   # male, 20, second class

In [66]:
# Stack the three passengers into one 2-D array, one row per passenger
sample_passengers = np.vstack((Jack, Rose, You))

In [67]:
# Scale the sample passenger features with the scaler fitted on the training data.
# The scaler was fitted on a DataFrame, so the samples are labelled with the
# same column names; passing a bare array makes scikit-learn warn that X has
# no valid feature names.
# NOTE: this overwrites sample_passengers, so re-running this cell alone would
# scale already-scaled values; re-run the cell that builds the array first.
sample_passengers = scaler.transform(
    pd.DataFrame(sample_passengers, columns=features.columns)
)
sample_passengers



array([[-0.73584961, -0.72111666, -0.53529287, -0.49813083,  0.85624799],
       [ 1.35897333, -0.95541828,  1.86813621, -0.49813083, -1.16788595],
       [-0.73584961, -0.72111666, -0.53529287,  2.00750472, -1.16788595]])

In [68]:
# Make survival predictions!
# `predictions` holds the predicted class per passenger; `predictions_proba`
# holds one probability per class for each passenger.
passenger_names = ["Jack","Rose","You"]
predictions = model.predict(sample_passengers)
predictions_proba = model.predict_proba(sample_passengers)
print(dict(zip(passenger_names, predictions)))
print(dict(zip(passenger_names, predictions_proba)))

{'Jack': 0, 'Rose': 1, 'You': 0}
{'Jack': array([0.86365707, 0.13634293]), 'Rose': array([0.05761188, 0.94238812]), 'You': array([0.65520849, 0.34479151])}
