In [None]:
# 💡 What is a ColumnTransformer?
# A ColumnTransformer allows you to selectively apply transformations (like scaling, encoding, or imputing) to specific columns of a dataset.
# This is especially useful when your dataset contains mixed data types — like numeric and categorical features.


In [1]:
import numpy as np 
import pandas as pd 

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [6]:
df['Pclass'].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

## Aam zindagi

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['PassengerId','Survived','Name','SibSp','Parch','Ticket','Fare']),df['Survived'],test_size=0.2)

In [9]:
x_train

Unnamed: 0,Pclass,Sex,Age,Cabin,Embarked
475,1,male,,A14,S
848,2,male,28.0,,S
498,1,female,25.0,C22 C26,S
615,2,female,24.0,,S
882,3,female,22.0,,S
...,...,...,...,...,...
648,3,male,,,S
627,1,female,21.0,D9,S
540,1,female,36.0,B22,S
743,3,male,24.0,,S


In [10]:
y_train

475    0
848    0
498    0
615    1
882    0
      ..
648    0
627    1
540    1
743    0
407    1
Name: Survived, Length: 712, dtype: int64

In [11]:
x_train.isnull().sum()

Pclass        0
Sex           0
Age         150
Cabin       547
Embarked      1
dtype: int64

In [12]:
df['Cabin'].value_counts()

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

## applying SimpleImputer for missing values 

In [13]:
## adding simpleimputer to the >>> Age, Cabin, EMbarked 
si = SimpleImputer(strategy='most_frequent')
x_train_Cabin_Embarked_Age = si.fit_transform(x_train[['Age','Cabin','Embarked']])
## also for test data 
x_test_Cabin_Embarked_Age = si.fit_transform(x_test[['Age','Cabin','Embarked']])

In [26]:
x_train_Cabin_Embarked_Age.shape

(712, 3)

## OrdinalEncoding 

In [15]:
## applying ordinalencoding on column pclass because it have only ordered data 

In [16]:
oe = OrdinalEncoder(categories=[[1, 2, 3]])
x_train_Pclass = oe.fit_transform(x_train[['Pclass']])

In [27]:
x_train_Pclass.shape

(712, 1)

## OneHotEncoding on >>> sex >> embarked >> cabin

In [38]:
# .astype(str)
# This converts all values in the 'Cabin' column to strings.

# This is useful if the column contains a mix of data types or missing values (NaN) that you want to handle as strings.

# .str.strip()
# This removes any leading or trailing whitespace from the string values.
# For example, ' C123 ' becomes 'C123'.

x_train['Cabin'] = x_train['Cabin'].astype(str).str.strip()
x_test['Cabin'] = x_test['Cabin'].astype(str).str.strip()


In [19]:
x_train['Cabin'].nunique()

130

In [39]:
# sparse_output=False
# By default, OneHotEncoder returns a sparse matrix (memory-efficient).

# Setting sparse_output=False tells it to return a dense NumPy array instead — useful for small datasets or when debugging.
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
x_train_sex_embarked_cabin = ohe.fit_transform(x_train[['Sex', 'Embarked', 'Cabin']])

# Transform test using the already-fitted encoder
x_test_sex_embarked_cabin = ohe.transform(x_test[['Sex', 'Embarked', 'Cabin']])




In [40]:
x_train_sex_embarked_cabin.shape

(712, 133)

In [22]:
x_train_transformed = np.concatenate((
    x_train_Cabin_Embarked_Age,
    x_train_Pclass,
    x_train_sex_embarked_cabin   
),axis=1)



In [24]:
x_train_transformed.shape

(712, 137)

## Mentos Zindagi

In [28]:
from sklearn.compose import ColumnTransformer

In [33]:
transformer = ColumnTransformer(transformers=[
    ('tnf1', SimpleImputer(strategy='most_frequent'), ['Age','Cabin','Embarked']),
    ('tnf2', OrdinalEncoder(categories=[[1,2,3]]), ['Pclass']),
    ('tnf3', OneHotEncoder(sparse_output=False,drop='first'), ['Sex','Embarked','Cabin'])  # ✅ FIXED HERE
],remainder='passthrough')

In [34]:
transformer

In [37]:
transformer.fit_transform(x_train).shape

(712, 137)