# Pipelines in Machine Learning using sklearn

Trainer : - Rajesh Jakhotia

- Pipeline allows you to sequentially apply a list of transformers to preprocess the data and, if desired, conclude the sequence with a final predictor for predictive modeling.
Ref: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

### Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale 
from sklearn.linear_model import LogisticRegression

### Load the Dataset

In [2]:
# Read the development and hold-out samples from their CSV files.
dev = pd.read_csv("DEV_SAMPLE.csv")
holdout = pd.read_csv("HOLDOUT_SAMPLE.csv")

# Show the number of rows in each sample.
print(dev.shape[0], holdout.shape[0])

14000 6000


### Variable Transformation

In [3]:
# Standardize Balance on the development sample and store it as a new column.
dev["Balance_Standardized"] = scale(dev["Balance"])

# Sanity check: the standardized column should have mean ~0 and std ~1.
# The column name uses single quotes inside the double-quoted f-strings;
# reusing the outer quote inside an f-string is a SyntaxError before Python 3.12.
print(f"Mean : {round(dev['Balance_Standardized'].mean(), 2)}")
print(f"Standard Deviation : {round(dev['Balance_Standardized'].std(), 2)}")

Mean : 0.0
Standard Deviation : 1.0


### Build Logistic Regression Model

In [4]:
# Single-column feature frame (standardized balance) and the target series.
X = dev["Balance_Standardized"].to_frame()
y = dev.loc[:, "Target"]

In [5]:
# Fit a logistic regression on the single standardized predictor.
mylogit = LogisticRegression(random_state=0)
mylogit = mylogit.fit(X, y)  # fit() returns the estimator itself

## Apply Model on Hold-Out sample for prediction

#### Variable Transformation : Standardization step on hold-out data

In [6]:
# Standardize Balance on the hold-out sample; scale() is called on the
# hold-out column itself, independently of the development sample.
holdout["Balance_Standardized"] = scale(holdout["Balance"])

# Single-column feature frame for the hold-out sample.
X_ho = holdout["Balance_Standardized"].to_frame()

In [7]:
# Predict a class label for every hold-out row with the fitted model and
# display the resulting array.
y_ho_pred = mylogit.predict(X_ho)
y_ho_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### What is wrong with the above step?

## Application of Model on a new record

In [8]:
# Pick the first hold-out row, limited to its first eight columns, to act as
# a single "new" record; the result is a Series, displayed below.
df = holdout.iloc[0, :8]
df

Cust_ID            C12935
Target                  0
Age                    26
Gender                  M
Balance          67291.63
Occupation           SENP
No_OF_CR_TXNS           6
AGE_BKT             26-30
Name: 0, dtype: object

### How will you apply the model on the above record?

# Let's Apply Pipelines

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [10]:
# Pipeline stages as (name, estimator) pairs, listed in the order they run:
# standardize the features first, then fit the classifier.
steps = [
    ("standard_scaler", StandardScaler()),
    ("classifier", LogisticRegression()),
]

In [11]:
# Assemble the scaler and classifier stages into one estimator.
pipe = Pipeline(steps=steps)

## Visualize the Pipeline

In [12]:
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams, then display the
# (still unfitted) pipeline as the cell output.
set_config(display="diagram")
pipe

In [13]:
# Fit every stage of the pipeline (scaler, then classifier) on the development data.
# NOTE(review): X was built from the already-standardized "Balance_Standardized"
# column, so the pipeline's StandardScaler is fitted on pre-scaled values -
# confirm whether the raw "Balance" column was intended as the input here.
pipe.fit(X,y)

In [14]:
# Run the hold-out features through the pipeline's scaler and classifier in one call.
# NOTE(review): X_ho was standardized separately on the hold-out sample before
# reaching the pipeline - confirm whether raw "Balance" was intended instead.
y_pred = pipe.predict(X_ho)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)


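## What if I want the predicted probabilities and not the class?

Call `pipe.predict_proba(X_ho)` instead of `pipe.predict(X_ho)`; it returns one column of probabilities per class.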

## What if I just want to do the transformation?

In [15]:
# Look up the fitted scaler stage by name directly on the pipeline and apply
# only that transformation (no classification) to the hold-out features.
pipe['standard_scaler'].transform(X_ho)

array([[-0.47747142],
       [ 0.01337676],
       [-0.47441505],
       ...,
       [-0.28250571],
       [-0.69576341],
       [-0.58461036]])

In [16]:
# Same transformation, reaching the scaler stage through the named_steps
# mapping instead of indexing the pipeline itself.
pipe.named_steps['standard_scaler'].transform(X_ho)

array([[-0.47747142],
       [ 0.01337676],
       [-0.47441505],
       ...,
       [-0.28250571],
       [-0.69576341],
       [-0.58461036]])

## A More Complex Pipeline

In [17]:
from sklearn.impute import SimpleImputer

# Numeric preprocessing: replace NaN values with the column mean, then standardize.
numeric_pipe = Pipeline(
    steps=[
        ("mean_imputation", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaling", StandardScaler()),
    ]
)
numeric_pipe

In [18]:
from sklearn.preprocessing import OneHotEncoder

# Categorical preprocessing: fill missing entries with the constant "missing",
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_pipe = Pipeline(
    steps=[
        ("fill_missing", SimpleImputer(strategy="constant", fill_value="missing")),
        ("one_hot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
categorical_pipe

In [19]:
# Preview the first rows of the development sample to see the available columns.
dev.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period,Balance_Standardized
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15,-0.315897
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13,-0.163648
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5,0.678502
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18,-0.699001
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31,-0.599963


In [20]:
# Frequency of each Gender code in the development sample.
dev["Gender"].value_counts()

Gender
M    10017
F     3846
O      137
Name: count, dtype: int64

In [21]:
# Recode the infrequent "O" gender value as missing (NaN).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on dev["Gender"]: that chained in-place form works
# on an intermediate object, raises a FutureWarning on recent pandas, and leaves
# `dev` unchanged once Copy-on-Write is enabled.
dev["Gender"] = dev["Gender"].replace("O", np.nan)

In [22]:
# Re-check the Gender frequencies after the recode; the displayed counts no
# longer include an "O" row.
dev["Gender"].value_counts()

Gender
M    10017
F     3846
Name: count, dtype: int64

In [23]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own preprocessing pipeline:
# Gender and Occupation through the categorical one, Age and SCR through the numeric one.
col_pre_processor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_pipe, ["Gender", "Occupation"]),
        ("numerical", numeric_pipe, ["Age", "SCR"]),
    ]
)
col_pre_processor

In [24]:
from sklearn.compose import ColumnTransformer

# Column transformer with only Gender on the categorical side; assigning to
# col_pre_processor rebinds the name, so this is the definition used from here on.
col_pre_processor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_pipe, ["Gender"]),
        ("numerical", numeric_pipe, ["Age", "SCR"]),
    ]
)
col_pre_processor

In [25]:
# Fit the column transformer on the development sample and transform it in the
# same call; only the columns named in the transformer are used.
col_trf_array = col_pre_processor.fit_transform(dev)

In [26]:
# List the names of the columns produced by the fitted transformer; each name
# is prefixed with the transformer name ("categorical__" / "numerical__").
col_pre_processor.get_feature_names_out()

array(['categorical__Gender_F', 'categorical__Gender_M',
       'categorical__Gender_missing', 'numerical__Age', 'numerical__SCR'],
      dtype=object)

In [27]:
# Wrap the transformed array in a DataFrame labelled with the generated feature
# names. Reusing dev's index keeps every transformed row attached to its source
# row when the two frames are combined column-wise, even if dev's index is not
# the default 0..n-1 range.
dev2 = pd.DataFrame(
    data=col_trf_array,
    columns=col_pre_processor.get_feature_names_out(),
    index=dev.index,
)
dev2.head()

Unnamed: 0,categorical__Gender_F,categorical__Gender_M,categorical__Gender_missing,numerical__Age,numerical__SCR
0,0.0,1.0,0.0,0.265642,1.414092
1,0.0,1.0,0.0,1.419077,0.809782
2,1.0,0.0,0.0,-0.782935,0.99337
3,1.0,0.0,0.0,0.685073,-0.777489
4,1.0,0.0,0.0,0.055927,0.744761


In [28]:
# Place the transformed features next to the original columns (aligned on the
# row index) and preview the combined frame.
dev3 = pd.concat([dev, dev2], axis="columns")
dev3.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period,Balance_Standardized,categorical__Gender_F,categorical__Gender_M,categorical__Gender_missing,numerical__Age,numerical__SCR
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15,-0.315897,0.0,1.0,0.0,0.265642,1.414092
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13,-0.163648,0.0,1.0,0.0,1.419077,0.809782
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5,0.678502,1.0,0.0,0.0,-0.782935,0.99337
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18,-0.699001,1.0,0.0,0.0,0.685073,-0.777489
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31,-0.599963,1.0,0.0,0.0,0.055927,0.744761


In [29]:
from sklearn.pipeline import make_pipeline
# Chain the column preprocessor with a logistic regression classifier without
# naming the steps by hand; the pipeline is only built and displayed here, not fitted.
pipe2 = make_pipeline(col_pre_processor, LogisticRegression())
pipe2

## Thank You