# Titanic classification with logistic regression
## Aim: find out how important each feature is for predicting survival.

### Load data

In [3]:
import pandas as pd

# Read the training set from the data directory next to this notebook and
# preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/train.csv')
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Preprocessing

In [4]:
# list every column of the loaded frame before choosing which ones to keep
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Keep only the target (Survived) and the candidate predictors, then discard
# every row that has a missing value in any of those columns.
data_clean = (
    data
    .loc[:, ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
    .dropna(axis=0, how='any')
)
data_clean.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [6]:
# target vector: the Survived column of the cleaned frame
y = data_clean['Survived']
y.head(5)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [7]:
# Build the feature matrix as a new frame instead of assigning into a slice of
# data_clean: `assign` returns a fresh DataFrame, so no SettingWithCopyWarning
# is raised and re-running the cell always gives the same result.
X = (
    data_clean[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
    # encode sex as a 0/1 indicator (1 = female) so the model gets a numeric input
    .assign(female=lambda frame: frame['Sex'].map({'female': 1, 'male': 0}))
    # pass the axis by keyword: the positional form `drop('Sex', 1)` is
    # rejected by pandas >= 2.0
    .drop('Sex', axis='columns')
)

X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female
0,3,22.0,1,0,7.25,0
1,1,38.0,1,0,71.2833,1
2,3,26.0,0,0,7.925,1
3,1,35.0,1,0,53.1,1
4,3,35.0,0,0,8.05,0


### Apply logistic regression model

In [8]:
from sklearn.linear_model import LogisticRegression
import numpy as np

In [9]:
# Pin the solver explicitly: scikit-learn's default solver differs between
# releases, and the coefficients and accuracy reported in this notebook were
# obtained with liblinear. All other hyper-parameters stay at their defaults.
lr = LogisticRegression(solver='liblinear')
lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Show the fitted parameters: each label is followed, on its own line, by the
# corresponding value.
print('intercept:', lr.intercept_, sep='\n')
print('features:', X.columns, sep='\n')
print('coefs:', lr.coef_, sep='\n')
print('classes:', lr.classes_, sep='\n')

intercept:
[ 2.05771716]
features:
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female'], dtype='object')
coefs:
[[-1.04281833 -0.03626434 -0.33615341 -0.06279599  0.00370982  2.5018024 ]]
classes:
[0 1]


In [25]:
# One row per feature: its coefficient on the log-odds (logit) scale and the
# matching odds ratio, i.e. exp(coefficient). lr.coef_ holds a single row for
# this two-class model, so row 0 carries all the coefficients.
coefficients = pd.DataFrame(
    {
        'feature': X.columns,
        'logit': lr.coef_[0],
        'odds ratio': np.exp(lr.coef_[0]),
    },
    columns=['feature', 'logit', 'odds ratio'],
)
coefficients

Unnamed: 0,feature,logit,odds ratio
0,Pclass,-1.042818,0.35246
1,Age,-0.036264,0.964385
2,SibSp,-0.336153,0.714513
3,Parch,-0.062796,0.939135
4,Fare,0.00371,1.003717
5,female,2.501802,12.204471


In [24]:
# Estimate the model's accuracy with cross-validation (cv=20 splits) and
# report the mean score as a percentage.
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(estimator=lr, X=X, y=y, cv=20)
mean_accuracy = accuracy.mean() * 100
print(u"Mean accuracy: %.2f %%" % mean_accuracy)

Mean accuracy: 79.16 %
