In [14]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV


In [15]:
# Seed NumPy's global random number generator so repeated runs start from the same state.
np.random.seed(0)

In [16]:
# Location of the Titanic CSV; the URL is pinned to a specific commit (091d371) of the
# source repository, so the file contents should not change between runs.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')

In [17]:
# Load the CSV at titanic_url into a DataFrame (requires network access).
data = pd.read_csv(titanic_url)

In [18]:
# Report the (rows, columns) size of the loaded frame.
print(data.shape)

(1309, 14)


In [19]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## PreProcess
    Data using sklearn's Column Transformer Approach
    Let's Create preprocessing PipeLines for both numeric and Categorical Data

In [27]:
# Numeric branch: fill missing values with the column median, then
# standardize each column (zero mean, unit variance).
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes categories that were not
# seen during fit encode as all zeros instead of raising at transform time.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Final preprocessor: route each column group through its own branch and
# concatenate the results (numeric columns first, then the one-hot columns).
preprocess = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

## Target = Survived

In [28]:
# Target labels: recode the 0/1 'survived' column as readable class names.
y = data['survived'].map({0: 'died', 1: 'survived'})

In [29]:
# Feature matrix: discard the target and every column that is not used as a
# model input, leaving pclass, sex, age, fare and embarked.
X = data.drop(
    columns=[
        'survived',
        'sibsp',
        'parch',
        'ticket',
        'name',
        'cabin',
        'boat',
        'body',
        'home.dest',
    ]
)

In [30]:
# Hold out 20% of the rows for testing. random_state is pinned so that
# re-running this cell (or running cells out of order) always produces the
# same split, instead of depending on how much of NumPy's global random
# stream has already been consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Learn imputation values, scaling statistics and category lists from the
# training rows only, so nothing from the test set leaks into preprocessing.
preprocess = preprocess.fit(X_train)

## Function
    To Transform Data with PreProcessor 

In [34]:
def preprocessor(data):
    """Run ``data`` through the module-level ``preprocess`` transformer.

    Parameters
    ----------
    data
        The input to transform; it is handed unchanged to
        ``preprocess.transform``.

    Returns
    -------
    Whatever ``preprocess.transform(data)`` returns.
    """
    return preprocess.transform(data)

In [35]:
# Size of the training feature frame as (rows, columns), before preprocessing.
X_train.shape

(1047, 5)

### The important thing to notice here is that the categorical feature columns have been one-hot encoded

In [36]:
# Apply the fitted preprocessor to the training features and display the result.
preprocessor(X_train)

array([[ 1.88524568, -0.1454686 ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.11201029,  0.76432759,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.11201029, -0.52368059,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.72655059, -0.49591248,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.11201029, -0.4901675 ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.11201029, -0.53760642,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

# Building Own Model Using SkLearn

In [37]:
# Sanity check: print the shapes of the four train/test pieces on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1047, 5) (262, 5) (1047,) (262,)


## Penalized Logit

In [40]:
# Search grid: 100 values of C (the inverse of the regularization strength),
# log-spaced from 1e1 to 1e10, with an L2 penalty.
# NOTE(review): the recorded best C (10.0) is the smallest value in this grid,
# so the optimum may lie below it; consider extending the range downward.
hyperparameters = {'C': np.logspace(1, 10, 100), 'penalty': ['l2']}

# 10-fold cross-validated grid search over the preprocessed training data.
# NOTE(review): the preprocessor was fit on all of X_train before this search,
# so each fold's validation rows contributed to the imputation and scaling
# statistics. Wrapping preprocess and the classifier in a single Pipeline
# would keep preprocessing inside each fold.
logit = LogisticRegression()
logit_cv = GridSearchCV(logit, hyperparameters, cv=10)
logit_cv.fit(preprocessor(X_train), y_train)

# best_params_ is a dict, so it is printed as-is; the previous "{:.3f}"
# placeholder was never formatted and appeared literally in the output.
print("Best ParaMeters:", logit_cv.best_params_)

Best ParaMeters {:.3f} {'C': 10.0, 'penalty': 'l2'}


In [41]:
# Display the estimator selected by the grid search.
logit_cv.best_estimator_

In [42]:
# Build the final model from the parameters the grid search selected, rather
# than hand-copying them, so this cell stays correct if the grid changes.
model = LogisticRegression(**logit_cv.best_params_)
model.fit(preprocessor(X_train), y_train)

# Mean accuracy on the training data itself (not a held-out estimate).
model.score(preprocessor(X_train), y_train)

0.7841451766953199

In [43]:
# Predict labels for the held-out test features, run through the same fitted preprocessor.
y_pred = model.predict(preprocessor(X_test))

In [44]:
# Inspect the predicted labels for the test set.
y_pred

array(['died', 'died', 'died', 'died', 'died', 'died', 'died', 'died',
       'died', 'died', 'died', 'died', 'died', 'died', 'died', 'survived',
       'survived', 'died', 'died', 'survived', 'survived', 'survived',
       'survived', 'survived', 'died', 'survived', 'died', 'survived',
       'died', 'survived', 'died', 'survived', 'died', 'died', 'died',
       'died', 'died', 'died', 'survived', 'survived', 'died', 'survived',
       'survived', 'died', 'died', 'died', 'survived', 'died', 'died',
       'died', 'died', 'died', 'died', 'survived', 'died', 'died', 'died',
       'died', 'died', 'survived', 'died', 'died', 'survived', 'died',
       'survived', 'survived', 'survived', 'survived', 'died', 'survived',
       'died', 'died', 'died', 'died', 'died', 'died', 'died', 'survived',
       'survived', 'died', 'died', 'survived', 'died', 'survived', 'died',
       'died', 'died', 'survived', 'died', 'died', 'died', 'died', 'died',
       'survived', 'died', 'died', 'died', 'died'

In [45]:
from sklearn.metrics import accuracy_score

In [47]:
# Test-set accuracy, reported as a percentage with two decimals.
print(
    "Accuracy: {:.2f}%".format(
        100 * accuracy_score(y_test, y_pred)
    )
)

Accuracy: 79.39%
