In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', None)

# Load the Data

In [5]:
passengers= pd.read_csv('passengers.csv')
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Update sex column to numerical

In [7]:
passengers['Sex'] = np.where(passengers.Sex=='female',1,0)
print(passengers.head())

# Fill the nan values in the age column
passengers['Age'] = passengers.Age.fillna(passengers.Age.mean())
print(passengers['Age'].values)
print(passengers.Age.isna().sum())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare Cabin Embarked  
0         A/5 21171   7.2500   NaN        S  
1          PC 17599  71.2833   C85        C  
2  STON/O2. 3101282   7.9250   NaN        S  
3            113803  53.1000  C123        S  
4            373450   8.0500   NaN        S  
[22.         38.    

## Create a first and second class column

In [8]:
print(passengers.Pclass.value_counts())
passengers['FirstClass'] = np.where(passengers.Pclass ==1,1,0)

3    491
1    216
2    184
Name: Pclass, dtype: int64


In [12]:
passengers['SecondClass'] = np.where(passengers.Pclass==2,1,0)
print(passengers.SecondClass.value_counts())

0    707
1    184
Name: SecondClass, dtype: int64


In [13]:
# Select the desired features
features = passengers[['Sex','Age','FirstClass','SecondClass']]
survival = passengers['Survived']

## Perform train, test, split

In [15]:
x_train,x_test,y_train,y_test = train_test_split(features,survival,train_size = 0.8,test_size = 0.2)
# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## Create and train and score the model

In [17]:
regr = LogisticRegression()
regr.fit(x_train,y_train)

# Score the model on the train data
print('score of model: ', regr.score(x_train,y_train))

# Score the model on the test data
print('score of test data: ',regr.score(x_test,y_test))

score of model:  0.6853932584269663
score of test data:  0.7206703910614525


In [18]:
# Analyze the coefficients
print('feature coefficients:',regr.coef_)

feature coefficients: [[ 0.         -0.43881306  0.89266663  0.51517908]]


## Sample passenger features

In [21]:
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
Seyram = np.array([0.5,25.0,0.0,1.0])

# Combine passenger arrays
sample_passengers = np.array([Jack,Rose,Seyram])
print(sample_passengers)




[[ 0.  20.   0.   0. ]
 [ 1.  17.   1.   0. ]
 [ 0.5 25.   0.   1. ]]
[0 0 0]
[[9.99907881e-01 9.21190417e-05]
 [9.99161646e-01 8.38354390e-04]
 [9.99982811e-01 1.71890125e-05]]


In [22]:
# Scale the sample passenger features
sample_passengers = scaler.transform(sample_passengers)
print(sample_passengers)

[[ 0.         -0.72154913 -0.58383755 -0.50350881]
 [ 1.         -0.95188092  1.7128052  -0.50350881]
 [ 0.5        -0.33766283 -0.58383755  1.98606255]]


In [23]:
# Make survival predictions!
print(regr.predict(sample_passengers))

print(regr.predict_proba(sample_passengers))

[0 1 1]
[[0.72711215 0.27288785]
 [0.23664011 0.76335989]
 [0.46652491 0.53347509]]
