In [21]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [22]:
titanic = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
titanic.shape

(891, 12)

In [24]:
titanic.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [25]:
# Drop columns that are not needed (to avoid string columns)
titanic = titanic.drop(columns=['Name', 'Ticket', 'PassengerId'])

**Apply Feature Creation preprocessing step on the Titanic dataset to create a Family Size feature which calculates Family Size for each Passenger using following Equation. FamilySize = SibSp + Parch + 1**

In [26]:
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1

In [27]:
titanic.shape

(891, 10)

In [28]:
titanic.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,FamilySize
0,0,3,male,22.0,1,0,7.25,,S,2
1,1,1,female,38.0,1,0,71.2833,C85,C,2


**Apply ColumnTransformer, FunctionTransformer and Sklearn Pipeline on the Titanic dataset.**

In [36]:
titanic.nunique()

Unnamed: 0,0
Survived,2
Pclass,3
Sex,2
Age,88
SibSp,7
Parch,7
Fare,248
Cabin,147
Embarked,3
FamilySize,9


In [33]:
preprocessor = ColumnTransformer(transformers=[
    # Impute numerical missing values
    ('age_imputer', SimpleImputer(strategy='mean'), ['Age']),

    # Fill missing Cabin values with "Unknown" and encode
    ('cabin_encoder', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]), ['Cabin']),

    # Encode 'Sex'
    ('sex_encoder', OneHotEncoder(sparse_output=False), ['Sex']),

    # Impute and encode 'Embarked'
    ('embarked_encoder', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]), ['Embarked'])
], remainder='passthrough')

In [35]:
transformed_data = preprocessor.fit_transform(titanic)

transformed_df = pd.DataFrame(transformed_data)
print("\n✅ Transformed Titanic Data Shape:", transformed_df.shape)
transformed_df.head(3)


✅ Transformed Titanic Data Shape: (891, 160)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150,151,152,153,154,155,156,157,158,159
0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,7.25,2.0
1,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,71.2833,2.0
2,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,3.0,0.0,0.0,7.925,1.0


# **Heart Dataset**

In [59]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

data = pd.read_csv("/content/heart_2022_no_nans.csv")
data.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [60]:
data.shape

(246022, 40)

In [61]:
data.isnull().sum()

Unnamed: 0,0
State,0
Sex,0
GeneralHealth,0
PhysicalHealthDays,0
MentalHealthDays,0
LastCheckupTime,0
PhysicalActivities,0
SleepHours,0
RemovedTeeth,0
HadHeartAttack,0


In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  object 
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  object 
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  object 
 10  HadAngina                  246022 non-null  object 
 11  HadStroke                  246022 non-null  object 
 12  HadAsthma                  246022 non-null  object 
 13  HadSkinCancer              24

In [65]:
binary_cols = [
    'AlcoholDrinkers', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'DifficultyWalking',
    'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus', 'ECigaretteUsage',
    'ChestScan', 'CovidPos'
]  # Columns with Yes/No or binary 0/1

numeric_cols = [
    'BMI', 'HeightInMeters', 'WeightInKilograms', 'SleepHours', 'PhysicalHealthDays', 'MentalHealthDays'
]

categorical_cols = [
    'Sex', 'RaceEthnicityCategory', 'AgeCategory', 'GeneralHealth'
]

def yes_no_to_binary(X):
    X = X.copy()
    for col in binary_cols:
        # Some columns may already be 0/1, map only if string
        if X[col].dtype == object:
            X[col] = X[col].map({'Yes':1, 'No':0})
    return X[binary_cols]

# Pipelines for numeric and categorical
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('binary_transform', FunctionTransformer(yes_no_to_binary), binary_cols),
    ('num_pipeline', numeric_pipeline, numeric_cols),
    ('cat_pipeline', categorical_pipeline, categorical_cols)
], remainder='drop')


X_preprocessed = preprocessor.fit_transform(data)
print("Preprocessed shape:", X_preprocessed.shape)

Preprocessed shape: (246022, 42)


In [66]:
#Convert to DataFrame with feature names
import numpy as np

feature_names = (
    binary_cols +
    numeric_cols +
    list(pipeline.named_steps['preprocessor']
         .named_transformers_['cat_pipeline']
         .named_steps['onehot']
         .get_feature_names_out(categorical_cols))
)

df_preprocessed = pd.DataFrame(X_preprocessed, columns=feature_names)
df_preprocessed.head()

Unnamed: 0,AlcoholDrinkers,HadHeartAttack,HadAngina,HadStroke,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,...,AgeCategory_Age 60 to 64,AgeCategory_Age 65 to 69,AgeCategory_Age 70 to 74,AgeCategory_Age 75 to 79,AgeCategory_Age 80 or older,GeneralHealth_Excellent,GeneralHealth_Fair,GeneralHealth_Good,GeneralHealth_Poor,GeneralHealth_Very good
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
