In [391]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [393]:
df = pd.read_csv('Titanic-Dataset.csv')

In [395]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [397]:
df.shape

(891, 12)

In [399]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [401]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [403]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [405]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [407]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [409]:
df.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [411]:
df.Sex.unique()

array(['male', 'female'], dtype=object)

In [413]:
X=df.drop(columns=['Survived'])
y=df['Survived']

In [415]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [417]:
X_train,X_test,y_train,y_test = train_test_split(X,
                                                 y,
                                                 test_size=0.2,
                                                random_state=42)

In [419]:
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: Survived, dtype: int64

In [421]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


## Making transformers

### trf1

In [423]:
#imputation

trf1=ColumnTransformer([
    ('Impute_age',SimpleImputer(),[2]),
    ('Impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')
# no.col=7
#after applyinh this the order may change and hence we need to work according to it

In [447]:
X1 = trf1.fit_transform(X_train)
print("After trf1:", type(X1), X1[:5])
#notice the change in order of columns

After trf1: <class 'numpy.ndarray'> [[45.5 'S' 1 'male' 0 0 28.5]
 [23.0 'S' 2 'male' 0 0 13.0]
 [32.0 'S' 3 'male' 0 0 7.925]
 [26.0 'S' 3 'male' 1 0 7.8542]
 [6.0 'S' 3 'female' 4 2 31.275]]


In [164]:
# ohe=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
# print(ohe.fit_transform(X_train.iloc[:, [1, 6]]))

In [425]:
# print(X_train.iloc[:, [1, 6]].head())
# X_train.iloc[:, [1, 6]] = X_train.iloc[:, [1, 6]].astype(str)

### trf2

In [429]:
#encoding
trf2=ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,3])
],remainder='passthrough')
# no.col=10---->age=2,embarked=3

### trf3

In [431]:
#scaling
trf3=ColumnTransformer([
    ('scaled_age',MinMaxScaler(),slice(0,10))
],remainder='passthrough')

my_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

print(my_list[slice(0, 10)])  # Equivalent to my_list[0:10]

Output: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

### trf4

In [433]:
#featureSelection
trf4=SelectKBest(score_func=chi2,k=8)

### trf5

In [435]:
#training
trf5 = DecisionTreeClassifier()

## Creating pipeline

In [367]:
pipe=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5),
])

## Pipeline Vs make_pipeline
Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [437]:
pipe=make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [439]:
pipe.fit(X_train,y_train)

In [441]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('Impute_age', SimpleImputer(), [2]),
                                 ('Impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 3])]),
 'columntransformer-3': ColumnTransformer(remainder='passthrough',
                   transformers=[('scaled_age', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x00000273D0956B60>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [343]:
# X_transformed = trf2.fit_transform(X_train)
# # X_transformed
# type(X_transformed)

numpy.ndarray

In [180]:
# pipeline = Pipeline([
#     ('preprocessor', trf2)
# ])

# # Fit and transform data (assuming `df` is a Pandas DataFrame)
# transformed_data = pipeline.fit_transform(X_train)

In [147]:
# transformed_data

array([[  0.    ,   1.    ,   0.    , ...,   0.    ,   0.    ,  28.5   ],
       [  0.    ,   1.    ,   0.    , ...,   0.    ,   0.    ,  13.    ],
       [  0.    ,   1.    ,   0.    , ...,   0.    ,   0.    ,   7.925 ],
       ...,
       [  0.    ,   1.    ,   0.    , ...,   2.    ,   0.    ,  14.1083],
       [  1.    ,   0.    ,   0.    , ...,   1.    ,   2.    , 120.    ],
       [  0.    ,   1.    ,   0.    , ...,   0.    ,   1.    ,  77.2875]])

In [445]:
X1 = trf1.fit_transform(X_train)
print("After trf1:", type(X1), X1[:5])

X2 = trf2.fit_transform(X1)
print("After trf2:", type(X2), X2[:5])

X3 = trf3.fit_transform(X2)
print("After trf3:", type(X3), X3[:5])




After trf1: <class 'numpy.ndarray'> [[45.5 'S' 1 'male' 0 0 28.5]
 [23.0 'S' 2 'male' 0 0 13.0]
 [32.0 'S' 3 'male' 0 0 7.925]
 [26.0 'S' 3 'male' 1 0 7.8542]
 [6.0 'S' 3 'female' 4 2 31.275]]
After trf2: <class 'numpy.ndarray'> [[0.0 0.0 1.0 0.0 1.0 45.5 1 0 0 28.5]
 [0.0 0.0 1.0 0.0 1.0 23.0 2 0 0 13.0]
 [0.0 0.0 1.0 0.0 1.0 32.0 3 0 0 7.925]
 [0.0 0.0 1.0 0.0 1.0 26.0 3 1 0 7.8542]
 [0.0 0.0 1.0 1.0 0.0 6.0 3 4 2 31.275]]
After trf3: <class 'numpy.ndarray'> [[0.         0.         1.         0.         1.         0.56647399
  0.         0.         0.         0.0556283 ]
 [0.         0.         1.         0.         1.         0.28373963
  0.5        0.         0.         0.02537431]
 [0.         0.         1.         0.         1.         0.39683338
  1.         0.         0.         0.01546857]
 [0.         0.         1.         0.         1.         0.32143755
  1.         0.125      0.         0.01533038]
 [0.         0.         1.         1.         0.         0.07011812
  1.   

In [451]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [453]:
X1 = trf1.fit_transform(X_train)
print("After trf1 (Imputation):", type(X1))
pd.DataFrame(X1).head()  # Convert to DataFrame for better readability


After trf1 (Imputation): <class 'numpy.ndarray'>


Unnamed: 0,0,1,2,3,4,5,6
0,45.5,S,1,male,0,0,28.5
1,23.0,S,2,male,0,0,13.0
2,32.0,S,3,male,0,0,7.925
3,26.0,S,3,male,1,0,7.8542
4,6.0,S,3,female,4,2,31.275


### using pipeline to predict

In [375]:
#using pipeline to predict
y_pred = pipe.predict(X_test)

In [482]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8044692737430168

## Cross Validation using Pipeline

In [484]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.7922781443908204

## GridSearch using Pipeline

In [486]:
# gridsearchcv
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [385]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [488]:
grid.best_score_

0.8033093666896484

In [490]:
grid.best_params_

{'trf5__max_depth': 3}