In [None]:
# Data ---> Separate (categorical, numerical) ---->
# Categorical data ---> SimpleImputer fill ---> Encode
# Numerical data ---> SimpleImputer fill ---> Standardize

# Apply ----> Algorithm (Logistic Regression)

# 🔷 Simple Definition
A Pipeline is a tool in machine learning that lets you bundle all steps of your workflow (like data cleaning, preprocessing, modeling) into one single object.

# ✅ Why Do We Use a Pipeline?
Without a pipeline:

You manually do each step:
Clean data
Encode categories
Scale numbers
Train model
With a pipeline:

You do everything in one command

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read covid_toy.csv (relative path) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="covid_toy.csv")

In [3]:
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [5]:
# Split off the target column, then hold out 20% of the rows for testing.
# random_state is fixed so the same split is produced on every run.
X = df.drop('has_covid', axis='columns')
y = df['has_covid']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Column groups; each group is handled by its own preprocessing branch.
numeric_features = ['age', 'fever']
categorical_features = ['gender', 'city']

In [7]:
# Create transformers.
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from raising
# on categories that were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine transformers: route each column group through its own branch.
# Columns not listed in either group are dropped (ColumnTransformer's
# default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Chain preprocessing and the classifier into a single estimator.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Fit every step of the pipeline on the training split.
clf.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test)

In [8]:
 # Display the labels predicted for the test split.
 y_pred

array(['No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes',
       'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No'],
      dtype=object)

In [9]:
from sklearn.metrics import accuracy_score

In [10]:
# Fraction of test rows whose predicted label matches the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

In [11]:
# Display the accuracy computed on the test split.
acc

0.65