In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

In [2]:
# Load the training CSV from the current working directory.
df = pd.read_csv(filepath_or_buffer='./train.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Drop the identifier / free-text columns that are not used as features.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already
# gone is a no-op rather than a KeyError.
df = df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)

In [4]:
# Preview the frame after the column drop.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
# Count missing values per column (sum of the boolean null mask down each column).
df.isnull().sum(axis=0)

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
# Separate features from the target by column NAME rather than by position,
# so a change in column order fails loudly (KeyError) instead of silently
# training on the wrong target.
X = df.drop(columns=['Survived'])
y = df['Survived']

In [7]:
# Preview the feature matrix.
X.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [159]:
# transformer 1 -> impute age, embarked
#
# Age is filled with the column mean, Embarked with its most frequent value;
# every other column is passed through untouched.
# Columns are selected by name (the input is a DataFrame), so the step does
# not depend on the positional layout of X.
# The output is a pandas DataFrame whose columns are ordered: the imputed
# columns first (Age, Embarked), then the passthrough columns.

t1 = ColumnTransformer(transformers=[
    ('impute_age', SimpleImputer(strategy='mean'), ['Age']),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), ['Embarked']),
], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

In [160]:
# Fit t1 on the feature frame and list its output columns (one per row)
# to see the column order produced by the imputation step.
t1.fit(X).get_feature_names_out().reshape(-1, 1)

array([['Age'],
       ['Embarked'],
       ['Pclass'],
       ['Sex'],
       ['SibSp'],
       ['Parch'],
       ['Fare']], dtype=object)

In [163]:
# transformer 2 -> ohe sex, embarked
#
# The previous step emits a named DataFrame, so the categorical columns are
# selected by name instead of by hand-derived positions. Listing Embarked
# before Sex preserves the original output column order
# (Embarked_* first, then Sex_*, then the passthrough columns).
# handle_unknown='ignore' encodes a category not seen during fit as all
# zeros instead of raising at transform/predict time.

t2 = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['Embarked', 'Sex']),
], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

In [164]:
# Fit t2 on t1's output and list the resulting columns (one per row)
# to see the column order after one-hot encoding.
t2.fit(t1.fit_transform(X)).get_feature_names_out().reshape(-1, 1)

array([['Embarked_C'],
       ['Embarked_Q'],
       ['Embarked_S'],
       ['Sex_female'],
       ['Sex_male'],
       ['Age'],
       ['Pclass'],
       ['SibSp'],
       ['Parch'],
       ['Fare']], dtype=object)

In [165]:
# transformer 3 -> scale age, fare
#
# Min-max scale the two continuous columns; everything else passes through.
# Columns are selected by name (the previous step emits a named DataFrame),
# so this step no longer depends on positions derived by inspecting the
# output of the earlier transformers.

t3 = ColumnTransformer([
    ('scaler', MinMaxScaler(), ['Age', 'Fare'])
], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

In [166]:
# Run the three preprocessing steps in sequence on X and preview the result.
(
    X
    .pipe(t1.fit_transform)
    .pipe(t2.fit_transform)
    .pipe(t3.fit_transform)
    .head()
)

Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Pclass,SibSp,Parch
0,0.271174,0.014151,0.0,0.0,1.0,0.0,1.0,3,1,0
1,0.472229,0.139136,1.0,0.0,0.0,1.0,0.0,1,1,0
2,0.321438,0.015469,0.0,0.0,1.0,1.0,0.0,3,0,0
3,0.434531,0.103644,0.0,0.0,1.0,1.0,0.0,1,1,0
4,0.434531,0.015713,0.0,0.0,1.0,0.0,1.0,3,0,0


In [209]:
# transformer 4 -> feature selection
# Keep the 5 features with the highest chi-squared score against the target.

t4 = SelectKBest(chi2, k=5)

In [210]:
# transformer 5 -> decision tree
#
# Final estimator of the pipeline. random_state is fixed (same seed as the
# train/test split) so that repeated runs of the notebook build the same
# tree and report the same predictions and accuracy.

t5 = DecisionTreeClassifier(random_state=42)

In [211]:
# Chain the preprocessing steps, the feature selector and the classifier
# into a single estimator; steps run in the order listed.
pipe = Pipeline(
    steps=[
        ('transformer_1', t1),  # imputation
        ('transformer_2', t2),  # one-hot encoding
        ('transformer_3', t3),  # scaling
        ('transformer_4', t4),  # feature selection
        ('transformer_5', t5),  # decision tree
    ]
)

In [212]:
# splitting data
# Hold out 30% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the whole pipeline on the training rows, then predict the held-out rows.
y_pred = pipe.fit(X_train, y_train).predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0])

In [213]:
# Fraction of held-out rows predicted correctly.
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, so
# the value is unchanged, but the conventional order avoids wrong results if
# this call is later swapped for an asymmetric metric.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8022388059701493