In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

# Explore Data

In [2]:
# Load the training table from a CSV file resolved relative to the working directory.
# NOTE(review): the column names used below suggest the Kaggle Titanic set — confirm the source.
df = pd.read_csv('train.csv')

### Remove 'PassengerId' and 'Ticket' ('Cabin' is kept)

In [3]:
# Drop the two columns that are not used as features.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['PassengerId', 'Ticket'], errors='ignore')

### Convert Name & Cabin to their character length

In [4]:
# Replace each text column with the character count of its entries.
# Run once per fresh load: a second run would apply .str to already-numeric columns.
for text_col in ('Name', 'Cabin'):
    df[text_col] = df[text_col].str.len()

### Select target and features

In [5]:
# Select the target by name and treat every other column as a feature, so the
# selection does not rely on 'Survived' happening to sit in the first position
# (a positional slice would leak the target into X if the column order changed).
y = df['Survived']
X = df.drop(columns=['Survived'])

### Split data into train and test sets

In [6]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [7]:
# Display the (rows, columns) shape of each split as a quick sanity check.
X_train.shape, X_test.shape

((534, 9), (357, 9))

### Categorizing numeric and categorical features

In [8]:
# Numeric columns: fill missing values with the column mean, then apply RobustScaler.
numeric_features = ["Name", "Fare", "Age", "Cabin"]
numeric_transformer = make_pipeline(SimpleImputer(strategy="mean"), RobustScaler())

In [9]:
# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored rather than raising.
categorical_features = ["Embarked", "Sex", "Pclass"]
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
)

### Pre-Process transformations

In [10]:
# Route each column group through its own transformer; 'Parch' and 'SibSp'
# are forwarded to the model unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("do_nothing", "passthrough", ["Parch", "SibSp"]),
    ]
)

In [11]:
# Chain preprocessing and the classifier into one estimator so both are fit together.
pipeline_lr = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=100),
)

### Fit the train data with 'LogisticRegression'

In [12]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline_lr.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('robustscaler',
                                                                   RobustScaler())]),
                                                  ['Name', 'Fare', 'Age',
                                                   'Cabin']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='

### Cross-Val vs. Shuffle-Split

In [39]:
# 5-fold cross-validated accuracy of the full pipeline on the training split.
# (cross_val_score is imported in the notebook's import cell.)
cv_scores = cross_val_score(
    pipeline_lr,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
# Keep the per-fold scores under their own name instead of rebinding `cv` from an
# array to a DataFrame; `cv` still ends up as the same one-column DataFrame.
cv = pd.DataFrame(cv_scores)
cv.mean()

0    0.805325
dtype: float64

In [40]:
# Five random shuffles of the training data, each holding out 30% for validation;
# the fixed seed makes the splits repeatable.
# (ShuffleSplit and cross_val_score are imported in the notebook's import cell.)
ss = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
# Name the metric explicitly so this run is scored the same way as the 5-fold run.
ss_cv = cross_val_score(pipeline_lr, X_train, y_train, cv=ss, scoring='accuracy')
ss_cv.mean()

0.8099378881987578