# Titanic classification with logistic regression
## Aim: find out the importance of features.

### Load data

In [2]:
import pandas as pd
data = pd.read_csv('../data/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Preprocessing

In [3]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# select columns
data_clean = data[['Survived','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
# remove rows with NAs
data_clean = data_clean.dropna(axis='index')
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [5]:
y = data_clean.Survived
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [6]:
X = data_clean[['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']]

X['female'] = X['Sex'].map({'female': 1, 'male': 0})
X = X.drop('Sex', 1)

X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Pclass,Age,SibSp,Fare,female
0,3,22.0,1,7.25,0
1,1,38.0,1,71.2833,1
2,3,26.0,0,7.925,1
3,1,35.0,1,53.1,1
4,3,35.0,0,8.05,0


### Apply logistic regression model

In [7]:
from sklearn.linear_model import LogisticRegression
import numpy as np

In [8]:
lr = LogisticRegression()
lr.fit(y=y, X=X)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# examine model parameters
print('intercept:')
print(lr.intercept_)
print('features:')
print(X.columns)
print('coefs:')
print(lr.coef_)
print('classes:')
print(lr.classes_)

intercept:
[ 2.07612481]
features:
Index(['Pclass', 'Age', 'SibSp', 'Fare', 'female'], dtype='object')
coefs:
[[-1.05396875 -0.03614654 -0.3522106   0.00338278  2.48039438]]
classes:
[0 1]


In [10]:
coefficients = pd.concat([pd.DataFrame(X.columns), 
                          pd.DataFrame(np.transpose(lr.coef_)),
                          pd.DataFrame(np.exp(np.transpose(lr.coef_)))],
                         axis=1)
coefficients.columns = ['feature', 'regression_parameter', 'odds ratio']
coefficients

Unnamed: 0,feature,logit,odds ratio
0,Pclass,-1.053969,0.348552
1,Age,-0.036147,0.964499
2,SibSp,-0.352211,0.703132
3,Fare,0.003383,1.003389
4,female,2.480394,11.945975


In [11]:
# evaluate model accurracy in cross validation
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(lr, X, y, cv=20)
mean_accuracy = np.mean(accuracy) * 100
print(u"Mean accuracy: %.2f %%"% mean_accuracy)

Mean accuracy: 79.02 %


In [12]:
from statsmodels.discrete.discrete_model import Logit

In [13]:
logit = Logit(y, X)
logit = logit.fit()
logit.summary()

Optimization terminated successfully.
         Current function value: 0.464048
         Iterations 6


0,1,2,3
Dep. Variable:,Survived,No. Observations:,714.0
Model:,Logit,Df Residuals:,709.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 05 Sep 2017",Pseudo R-squ.:,0.313
Time:,17:05:29,Log-Likelihood:,-331.33
converged:,True,LL-Null:,-482.26
,,LLR p-value:,4.312e-64

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Pclass,-0.5406,0.078,-6.955,0.000,-0.693,-0.388
Age,-0.0148,0.005,-2.693,0.007,-0.026,-0.004
SibSp,-0.3159,0.121,-2.604,0.009,-0.554,-0.078
Fare,0.0091,0.003,3.287,0.001,0.004,0.015
female,2.6118,0.203,12.852,0.000,2.213,3.010


In [15]:
# show logit and oddsratio
regression_parameter = pd.DataFrame(logit.params, columns=['regression_parameter'])
odds_ratios = pd.DataFrame(np.exp(logit.params), columns=['odds_ratio'])
pd.concat([regression_parameter, odds_ratios], axis = 'columns')


Unnamed: 0,regression_parameter,odds_ratio
Pclass,-0.540614,0.582391
Age,-0.014771,0.985337
SibSp,-0.315924,0.729115
Fare,0.0091,1.009142
female,2.611775,13.623208
