In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


In [58]:
df = pd.read_csv('titanic.csv')

In [59]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [60]:
target = df.pop('Survived')
df['Survived'] = target

In [61]:
df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


In [62]:
df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Survived         0
dtype: int64

In [63]:
# Drop irrelevent Columns
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

In [64]:
df.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

In [65]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']), df['Survived'], test_size=0.2, random_state=42)

### # Create Column transformers

In [66]:
# Imputation missing values

trans_1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough')

In [67]:
# Encoding Categorical Columns

trans_2 = ColumnTransformer([
    ('ohe_sex_&_enbarked', OneHotEncoder(handle_unknown='ignore', sparse=False),[1,6])
], remainder='passthrough')

In [68]:
# Scaling

trans_3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,7))
])

In [69]:
# Feature selection

trans_4 = SelectKBest(score_func=chi2, k=5)

In [70]:
# Train The Model

trans_5 = RandomForestClassifier()

### Creating Pipeline

In [71]:
pipe = Pipeline([
    ('Imputation', trans_1),
    ('Encoding', trans_2),
    ('Scaling', trans_3),
    ('Feature_selection', trans_4),
    ('Train_model', trans_5)
])

In [72]:
# Displaying Pipeline
from sklearn import set_config
set_config(display='diagram')

### Train Model

In [73]:
pipe.fit(x_train,y_train)



In [74]:
pipe.named_steps

{'Imputation': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'Encoding': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_&_enbarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'Scaling': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 7, None))]),
 'Feature_selection': SelectKBest(k=5, score_func=<function chi2 at 0x0000023C6114F420>),
 'Train_model': RandomForestClassifier()}

### Prediction

In [75]:
y_pred =  pipe.predict(x_test)
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

### Matrix Evalutation

In [76]:
from sklearn.metrics import accuracy_score

In [77]:
accuracy_score(y_test, y_pred)

0.6256983240223464

### Cross Validation

In [78]:
from sklearn.model_selection import cross_val_score

In [79]:
cross_val_score(pipe, x_train, y_train, cv=5, scoring='accuracy').mean()





0.6391214419383433

### Hyperparameter Tuning with Grid Search

In [80]:
from sklearn.model_selection import GridSearchCV

In [102]:
# param_grid = {
#     'trans_5__n_estimators' : [100, 200, 300],
#     'trans_5__criterion' : ['gini', 'entropy'],
#     'trans_5__max_depth' : [3, 4, 5, 6, 7],
#     # # 'max_features' : ['sqrt', 'log2'],
#     # 'trans_5__random_state' : [0, 42, 123]
# }

param_grid = {
                 "Train_model__n_estimators": [10, 100, 1000],
                 "Train_model__max_depth":[5,8,15,25,30,None],
                 "Train_model__min_samples_leaf":[1,2,5,10,15,100],
                 "Train_model__max_leaf_nodes": [2, 5,10]
}

In [103]:
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

#### Fit the model

In [104]:
grid.fit(x_train, y_train)



In [105]:
grid.best_score_

0.6433172461341475

In [106]:
grid.best_params_

{'Train_model__max_depth': 15,
 'Train_model__max_leaf_nodes': 5,
 'Train_model__min_samples_leaf': 15,
 'Train_model__n_estimators': 10}

In [107]:
grid.best_estimator_