In [34]:
# A Pipeline in Machine Learning is a tool (in Scikit-Learn) that lets you connect multiple data-processing steps and a model into one clean workflow.

# Think of it like an assembly line:

# 👉 Raw data goes in
# 👉 Each step transforms it
# 👉 A final model is trained on the processed data

# Instead of writing long messy code for each step, a pipeline organizes everything in the correct order.
#---------------------------------------------------------------------------------------------------------------------------
# What is a Pipeline in ML?

# A Pipeline is a sequence of steps where each step is:
# A transformer (e.g., imputer, scaler, encoder)
# or a final estimator (model)

# Example steps:
# Handle missing values
# Scale features
# One-hot encode categorical variables
# Train a model


# A pipeline chain is made of all these:
# Pipeline([
#     ("imputer", SimpleImputer()),
#     ("scaler", StandardScaler()),
#     ("model", LinearRegression())
# ])
#---------------------------------------------------------------------------------------------------------------------------
# Summary :

# A pipeline is a structured, safe, and clean way to:
# preprocess data
# avoid leakage
# maintain consistency
# prepare data for ML models
# run cross-validation easily

# It lets you combine multiple preprocessing steps and the model itself into a single object.

In [34]:
# import pandas as pd
# import numpy as np

# data = pd.read_csv("4CHP_Data.csv")  # California housing data


In [58]:
import pandas as pd
import numpy as np

# Load the California housing data from the CSV file in the working directory.
csv_path = "4CHP_Data.csv"
data = pd.read_csv(csv_path)

In [60]:
# Bucket median_income into five labelled bins (1-5); the resulting column is
# what the stratified split is keyed on.
income_bin_edges = [0, 1.5, 3.0, 4.5, 6.0, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
data["income_cat"] = pd.cut(
    data["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [61]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split keyed on income_cat; random_state pins the shuffle
# so the same rows land in each set on every run.
Shuffled = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows are selected with .iloc.
# (.loc would treat the numbers as index labels, which only gives the same rows
# while `data` still has its default 0..n-1 index.)
# n_splits=1 means there is exactly one (train, test) pair, hence next().
train_index, test_index = next(Shuffled.split(data, data["income_cat"]))

# .copy() makes each split an independent frame rather than a selection tied to `data`.
strat_train_set = data.iloc[train_index].copy()
strat_test_set = data.iloc[test_index].copy()

In [62]:
# income_cat was only needed to stratify the split; remove it from both sets.
# Rebinding the names (instead of inplace=True inside a loop) keeps the step
# explicit, and errors="ignore" lets this cell be re-run after the column is
# already gone without raising KeyError.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [63]:
# Work on a separate (deep) copy so the stratified training set itself stays untouched.
TrainDf = strat_train_set.copy(deep=True)

In [None]:
# Keep only the numeric columns by dropping the text column ocean_proximity.
# Deriving from strat_train_set (rather than from TrainDf itself) makes this cell
# idempotent: re-running it no longer fails with KeyError because the column was
# already removed by the previous run.
TrainDf = strat_train_set.drop(columns="ocean_proximity")

In [68]:
# Display the training frame after dropping ocean_proximity (shown truncated in the output).
TrainDf

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,82700.0
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0
...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,268500.0
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,90400.0
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,140400.0
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,258100.0


In [78]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Two-step preprocessing pipeline for the numeric training frame:
#   1. "impute": fill missing values with each column's median
#   2. "MinMax": rescale every column into the range [-1, 1]
# MinMaxScaler must be imported here; previously only StandardScaler was, so this
# cell raised NameError on a fresh kernel.
my_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("MinMax", MinMaxScaler(feature_range=(-1, 1))),
])


#-----------Pipeline Syntax-----------

# from sklearn.pipeline import Pipeline

# pipeline = Pipeline([
#     ("step_name1", transformer1),
#     ("step_name2", transformer2),
#     ("model", model)
# ])

In [79]:
my_pipeline = my_pipeline.fit_transform(TrainDf)

In [87]:
my_pipeline  # it is a Numpy array, with all tranformations done in one go using pipeline.

array([[-0.42430279,  0.27098831,  0.09803922, ..., -0.73711725,
        -0.76914801, -0.76453293],
       [ 0.41832669, -0.88310308, -0.76470588, ..., -0.71396565,
        -0.19485248,  0.09113364],
       [ 0.05776892, -0.39851222,  0.68627451, ..., -0.88872293,
        -0.67240452, -0.72082177],
       ...,
       [-0.6752988 ,  0.25398512,  0.84313725, ..., -0.93651979,
        -0.63037751, -0.48288461],
       [-0.67131474,  0.22635494, -0.49019608, ..., -0.81366692,
        -0.49014496,  0.00247422],
       [-0.55976096,  0.57917109,  0.01960784, ..., -0.92718447,
        -0.63697052, -0.80329566]])

In [None]:
# Column Transformer:

# A ColumnTransformer allows you to:
# Apply one set of transformations to numerical columns
# Apply another set to categorical columns
# Leave some columns unchanged
# Combine all results into one final numpy array
# This makes preprocessing clean, organized, and scalable.

#--------------------------------------------------------------
# Why do we need ColumnTransformer?
# Because datasets often contain mixed data types:
# Example (California Housing dataset):

# Numerical columns →
# median_income, total_rooms, population, etc.

# Categorical columns →
# ocean_proximity

# You cannot sensibly apply:
# StandardScaler to text
# OneHotEncoder to continuous numbers
# So you need a tool that says:

# 👉 “Apply this to these columns, and that to those columns.”
# This is exactly what ColumnTransformer does.

In [89]:
# How ColumnTransformer Works  ----------EXAMPLE CODE------ONLY FOR UNDERSTANDING HERE----------

# You create pipelines for each type of column:
# 1️⃣ Numerical pipeline
# (impute missing values + scale features) 


# num_pipeline = Pipeline([
#     ("imputer", SimpleImputer(strategy="median")),
#     ("scaler", StandardScaler())
# ])


In [90]:
# 2️⃣ Categorical pipeline

# (impute missing values + one-hot encode)

# cat_pipeline = Pipeline([
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("onehot", OneHotEncoder())
# ])

In [91]:
# 3️⃣ Combine them using ColumnTransformer
# from sklearn.compose import ColumnTransformer

# full_pipeline = ColumnTransformer([
#     ("num", num_pipeline, numerical_columns),
#     ("cat", cat_pipeline, categorical_columns)
# ])

In [92]:
# Then simply:

# prepared_data = full_pipeline.fit_transform(df)

In [None]:
🧩 Example Output Flow
df (original)
   ↓
ColumnTransformer
   → numerical → impute → scale
   → categorical → impute → one-hot encode
   ↓
Combined transformed array
   ↓
Model training / prediction