<a href="https://colab.research.google.com/github/Shashank975/Practice-Projects-ML/blob/main/column_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import the required libraries

In [None]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.pipeline import Pipeline,make_pipeline


import seaborn as sns
from sklearn.tree import DecisionTreeClassifier

# Load the data

In [None]:
# Load the Titanic passenger dataset that ships with seaborn and preview
# its first five rows.
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

(891, 15)

In [None]:
# List every column name, one per line.
print(*df.columns, sep="\n")

survived
pclass
sex
age
sibsp
parch
fare
embarked
class
who
adult_male
deck
embark_town
alive
alone


In [None]:
# NOTE(review): this drop is disabled, so df keeps `alive` (a yes/no text copy
# of the `survived` target in the preview rows) along with columns that appear
# to restate others (`class` ~ `pclass`, `embark_town` ~ `embarked`).
# Any feature matrix built from df has to leave these out explicitly.
# df.drop(columns=["embark_town","alive","alone","deck","who","adult_male","class"],inplace=True)


In [None]:
# Preview again: all 15 columns are still present because the drop in the
# previous cell is commented out.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Frequency table for each low-cardinality column, in the same order as before.
for column in ["pclass", "sex", "survived", "sibsp", "parch", "embarked"]:
    print(df[column].value_counts())

pclass
3    491
1    216
2    184
Name: count, dtype: int64
sex
male      577
female    314
Name: count, dtype: int64
survived
0    549
1    342
Name: count, dtype: int64
sibsp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64
parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64
embarked
S    644
C    168
Q     77
Name: count, dtype: int64


In [None]:
# Dtype and non-null count per column; age, embarked, deck and embark_town
# are the columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [None]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [None]:
# Collapse the two family-count columns into a single `members` feature
# (siblings/spouses + parents/children) and retire the originals.
df = (
    df
    .assign(members=df["sibsp"] + df["parch"])
    .drop(columns=["sibsp", "parch"])
)

# Train/test split

In [None]:
# Separate the target from the features, then hold out 20% of the rows for
# testing. The fixed seed keeps the split identical between runs.
features = df.drop(columns=["survived"])
target = df["survived"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Report the size of each split, then display the training features.
for split in (X_train, X_test):
    print(split.shape)
X_train

(712, 13)
(179, 13)


Unnamed: 0,pclass,sex,age,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,members
331,1,male,45.5,28.5000,S,First,man,True,C,Southampton,no,True,0
733,2,male,23.0,13.0000,S,Second,man,True,,Southampton,no,True,0
382,3,male,32.0,7.9250,S,Third,man,True,,Southampton,no,True,0
704,3,male,26.0,7.8542,S,Third,man,True,,Southampton,no,False,1
813,3,female,6.0,31.2750,S,Third,child,False,,Southampton,no,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,3,female,21.0,7.6500,S,Third,woman,False,,Southampton,yes,True,0
270,1,male,,31.0000,S,First,man,True,,Southampton,no,True,0
860,3,male,41.0,14.1083,S,Third,man,True,,Southampton,no,False,2
435,1,female,14.0,120.0000,S,First,child,False,B,Southampton,yes,False,3


**Where Does ColumnTransformer Get the Data From?**

ColumnTransformer does not execute immediately. It is just defined as part of your pipeline.

It is applied inside the pipeline during pipe.fit(X_train, y_train).

At that moment, it takes X_train as input and applies the transformations.

How Data Flows in ColumnTransformer
When trf1 is applied, X_train is passed through it.

The transformer extracts columns [2] (`age`) and [4] (`embarked`) from `X_train` and applies the respective transformations:

Column 2 → Missing values filled with default mean (SimpleImputer).
Column 4 → Missing values filled with most frequent value.

All other columns are passed through unchanged (remainder='passthrough').

In [None]:
# Step 1: impute missing values and keep only the model's input features.
#
# Columns are selected by name and emitted in the order
#     pclass, sex, age, fare, embarked, members
# which is the same relative order they have in X_train. A ColumnTransformer
# returns its outputs in transformer order, with any passed-through remainder
# appended last. Imputing age/embarked first and forwarding everything else as
# a remainder would therefore move them to positions 0 and 1, and positional
# column indices used by later steps would point at different features than
# they do in X_train.
#
# remainder='drop' discards every other column of X_train: `alive` is a text
# copy of the target, and class / who / adult_male / deck / embark_town / alone
# are non-numeric or derived from columns that are kept.
trf1 = ColumnTransformer([
    ('keep_pclass_sex', 'passthrough', ['pclass', 'sex']),
    ('age_impute', SimpleImputer(), ['age']),  # default strategy: column mean
    ('keep_fare', 'passthrough', ['fare']),
    ('embarked', SimpleImputer(strategy='most_frequent'), ['embarked']),
    ('keep_members', 'passthrough', ['members']),
], remainder='drop')

In [None]:
# Step 2: one-hot encode the columns at positions 1 and 4 of the array handed
# over by the previous step (the step name states the intent: sex and embarked).
# The indices address the previous step's *output* order, not X_train's
# original column order. Output is dense; a category not seen during fit is
# encoded as all zeros. Every other column is forwarded unchanged, after the
# encoded block.
trf2 = ColumnTransformer(
    transformers=[
        (
            "oeh_sex_embarked",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            [1, 4],
        ),
    ],
    remainder="passthrough",
)



In [None]:
# Step 3: min-max scale the first ten columns of the incoming array.
# remainder is left at its default ('drop'), spelled out here, so any column
# beyond index 9 is discarded rather than passed on.
trf3 = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), slice(0, 10))],
    remainder="drop",
)


In [None]:
# Step 4: keep the 8 features with the highest chi-squared score against the
# target.
trf4 = SelectKBest(chi2, k=8)

In [None]:
# Step 5: the classifier. DecisionTreeClassifier permutes candidate features
# at each split, so without a fixed random_state the fitted tree, and the
# accuracy reported below, can differ from run to run.
trf5 = DecisionTreeClassifier(random_state=42)

In [None]:
# Chain the five steps; each step's output becomes the next step's input.
pipe = Pipeline(
    steps=[
        ("transformation1", trf1),  # imputation
        ("transformation2", trf2),  # one-hot encoding
        ("transformation3", trf3),  # scaling
        ("transformation4", trf4),  # feature selection
        ("transformation5", trf5),  # decision tree
    ]
)

In [None]:
# Fit every step, in order, on the training split only.
pipe.fit(X_train,y_train)

In [None]:
# Ask scikit-learn to render estimators as diagrams when they are displayed
# in notebook output.
from sklearn import set_config
set_config(display='diagram')

In [None]:
# Run the held-out rows through the fitted pipeline to get class predictions.
y_pred = pipe.predict(X_test)

In [None]:
# Hold-out accuracy as a percentage. accuracy_score is already imported in the
# notebook's import cell, so it is not imported a second time here.
accuracy_score(y_test, y_pred) * 100

62.56983240223464

In [None]:
from sklearn.model_selection import cross_val_score

# Mean accuracy over 20-fold cross-validation on the training split; the whole
# pipeline is refit inside every fold.
fold_scores = cross_val_score(pipe, X_train, y_train, scoring="accuracy", cv=20)
fold_scores.mean()

0.639126984126984