In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Read the Titanic CSV (path is relative to the notebook's working directory)
# and preview five randomly chosen rows.
df = pd.read_csv(filepath_or_buffer='titanic_dataset.csv')
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
222,1114,1,2,"Cook, Mrs. (Selena Rogers)",female,22.0,0,0,W./C. 14266,10.5,F33,S
56,948,0,3,"Cor, Mr. Bartol",male,35.0,0,0,349230,7.8958,,S
360,1252,0,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S
164,1056,0,2,"Peruschitz, Rev. Joseph Maria",male,41.0,0,0,237393,13.0,,S
225,1117,1,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C


In [6]:
# Discard identifier / free-text columns that are not used as model features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time is a no-op rather than a KeyError
# on columns that were already removed.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [7]:
# Spot-check five random rows after the column drop.
df.sample(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
381,0,3,male,26.0,0,0,7.8792,Q
320,0,3,male,26.0,0,0,7.775,S
298,0,1,male,30.0,0,0,45.5,S
211,0,3,male,,0,0,7.05,S
273,1,3,female,,1,0,15.5,Q


In [8]:
# Summarise each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    str    
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Embarked  418 non-null    str    
dtypes: float64(2), int64(4), str(2)
memory usage: 26.3 KB


In [9]:
# Number of missing values per column (summing the boolean mask down the rows).
df.isnull().sum(axis=0)

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# Features are every column except the target, 'Survived'.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),
    df['Survived'],
    test_size=0.2,
    random_state=0,
)

In [11]:
# Peek at five random training rows to confirm the feature columns.
X_train.sample(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
103,3,male,26.0,0,0,7.775,S
86,3,female,27.0,0,0,7.8792,Q
355,1,male,50.0,0,0,26.0,S
392,3,male,13.0,0,2,20.25,S
136,3,male,26.5,0,0,7.225,C


In [12]:
# (rows, columns) of the training split.
X_train.shape

(334, 7)

In [13]:
# (rows, columns) of the held-out test split.
X_test.shape

(84, 7)

In [14]:
# Step 1 - impute missing values.
# X_train columns by position:
#   0 Pclass, 1 Sex, 2 Age, 3 SibSp, 4 Parch, 5 Fare, 6 Embarked
# Age (2) and Fare (5) are the columns with nulls; both are filled with the
# column mean, which is SimpleImputer's default strategy.
trf1 = ColumnTransformer(
    transformers=[
        ('impute_age', SimpleImputer(strategy='mean'), [2]),
        ('impute_fare', SimpleImputer(strategy='mean'), [5]),
        # for a string column:
        # ('impute_embarked', SimpleImputer(strategy='most_frequent'), [-1])
    ],
    remainder='passthrough',
)
# Output order: the transformed columns first, then the passthrough columns:
#   0 Age, 1 Fare, 2 Pclass, 3 Sex, 4 SibSp, 5 Parch, 6 Embarked

In [15]:
#One Hot Encoding
# Column positions here refer to the OUTPUT of trf1, not to X_train.
# trf1 emits its imputed columns first and the passthrough columns after them:
#   0 Age, 1 Fare, 2 Pclass, 3 Sex, 4 SibSp, 5 Parch, 6 Embarked
# so the two string columns to encode are Sex (3) and Embarked (6).
# Index 2 would encode Pclass and leave Sex as raw strings, which the
# MinMaxScaler in the following step cannot convert to float.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
trf2 = ColumnTransformer([
    ('ohe_sex_enbarked', OneHotEncoder(sparse_output = False, handle_unknown= 'ignore'),[3,6] )
], remainder = 'passthrough')

In [16]:
# Step 3 - min-max scale every feature.
# slice(0, 10) selects column positions 0 through 9 (the stop index is exclusive).
# Expected width at this point, per the original note: the 7 input columns become
# 5 numeric + 2 one-hot columns for Sex + 3 for Embarked = 10.
# No remainder is given, so any column beyond position 9 would be dropped
# (ColumnTransformer's default remainder is 'drop').
trf3 = ColumnTransformer(
    transformers=[
        ('scale', MinMaxScaler(), slice(0, 10)),
    ]
)

In [17]:
# Step 4 - feature selection using the chi-squared score.
# k=10 equals the number of columns the scaling step is expected to emit,
# so with this setting every feature is retained; lower k to actually prune.
trf4 = SelectKBest(chi2, k=10)

In [18]:
# Final estimator: a decision tree classifier (fitted later, as the last pipeline step).
# random_state is pinned to the same seed used for the train/test split because
# the tree permutes features at every split; without a fixed seed, repeated
# fits on identical data can produce different trees.
trf5 = DecisionTreeClassifier(random_state=0)

### Create Pipeline

In [19]:
# Chain the preprocessing steps and the classifier into a single estimator.
# Explicit step names are the recommended form: each step stays addressable by name.
pipe = Pipeline(steps=[
    ('trf1', trf1),  # impute missing values
    ('trf2', trf2),  # one-hot encode categoricals
    ('trf3', trf3),  # min-max scale
    ('trf4', trf4),  # select k best features
    ('trf5', trf5),  # decision tree classifier
])

# Alternative: make_pipeline builds the same chain but auto-generates the step names.
# make_pipeline(trf1,trf2,trf3,trf4,trf5)