# Building ML Pipeline

- Imputation
- Feature Scaling
- PCA (dimensionality reduction)
- One-Hot Encoding
- Fine-Tuning the estimator

1. LOADING LIBRARIES AND DATASET

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw churn records into a DataFrame.
# NOTE(review): absolute path (looks like a Colab runtime location) — adjust when running elsewhere.
churn_df = pd.read_csv(filepath_or_buffer = '/content/Churn_Modelling.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns)
churn_df.shape

(10000, 14)

In [None]:
# Preview the first rows to check the available columns and their values
churn_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


2. DELETING UNWANTED COLUMNS

In [None]:
# Drop the columns that are not used as model inputs.
# Rebinding the name (instead of inplace = True) together with errors = 'ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# columns have already been removed.
churn_df = churn_df.drop(columns = ['RowNumber', 'CustomerId', 'Surname'], errors = 'ignore')

In [None]:
# Preview the frame again now that RowNumber, CustomerId and Surname have been dropped
churn_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


3. TRAIN - TEST SPLIT

In [None]:
# Target is the 'Exited' column; every remaining column is an input feature
y = churn_df['Exited']
X = churn_df.drop(columns = ['Exited'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

# Report how many rows landed in each partition
print(f'Count of rows in Training set : {X_train.shape[0]}')
print(f'Count of rows in Testing set : {X_test.shape[0]}')

Count of rows in Training set : 8000
Count of rows in Testing set : 2000


4. SEPARATE PIPELINE FOR NUMERICAL AND CATEGORICAL DATA

In [None]:
# Numerical branch: fill gaps with the column mean, rescale with MinMaxScaler,
# then keep enough principal components to explain 98% of the variance
num_steps = [
    ('num_imputation', SimpleImputer(strategy = 'mean')),
    ('feature_scaling', MinMaxScaler()),
    ('pca', PCA(n_components = 0.98)),
]
num_pipeline = Pipeline(steps = num_steps)

num_pipeline

In [None]:
# pipeline for categorical data

# The dense-output flag of OneHotEncoder was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old name was removed in 1.4.
# Try the current keyword first and fall back to the old one so this cell
# runs on releases from either side of the rename.
try:
    one_hot_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

catg_pipeline = Pipeline([
    # replace missing categories with the literal label 'missing'
    ('catg_impuation', SimpleImputer(fill_value = 'missing', strategy = 'constant')),
    # dense one-hot columns; categories unseen during fit are encoded as all zeros
    ('one_hot_encoding', one_hot_encoder)
])

catg_pipeline

5. BUILDING THE MAJOR PIPELINE

In [None]:
# Split the feature names by dtype so each group can be routed to its own pipeline
num_cols = list(X.select_dtypes(include = np.number).columns)
cat_cols = list(X.select_dtypes(include = 'object').columns)

In [None]:
# Column dtypes and non-null counts of the feature frame
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
dtypes: float64(2), int64(6), object(2)
memory usage: 781.4+ KB


In [None]:
# Route each column group through its own preprocessing pipeline
preprocessor = ColumnTransformer([
    ('categorical', catg_pipeline, cat_cols),
    ('numerical', num_pipeline, num_cols)
])

# Baseline model: preprocessing followed by a random forest with default
# hyperparameters. The forest is seeded (same value as the train/test split)
# so the baseline score is reproducible from run to run.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator', RandomForestClassifier(random_state = 42))
])

pipe.fit(X_train, y_train)



In [None]:
# Accuracy of the fitted baseline pipeline on the held-out test rows, shown as a percentage
print(f'Model Accuracy (before fine-tuning) : {pipe.score(X_test, y_test) * 100}%')

Model Accuracy (before fine-tuning) : 86.45%


6. FINE-TUNING THE ESTIMATOR

In [None]:
# Search space for the random forest. The `estimator__` prefix routes each
# parameter to the pipeline step named 'estimator'.
parameters = {
    'estimator__n_estimators' : [100, 150, 200],
    'estimator__max_depth' : [5, 7, 10, 15],
    'estimator__min_samples_split' : [2, 3, 4],
    'estimator__max_features' : [2, 4, 6, 8, 10],
}

# 3 * 4 * 3 * 5 = 180 candidates, each of them cross-validated, so the fits are
# spread over all available CPU cores (n_jobs = -1) instead of running serially.
grid_search = GridSearchCV(
    pipe,
    param_grid = parameters,
    n_jobs = -1
)

grid_search.fit(X_train, y_train)

In [None]:
# ?RandomForestClassifier

In [41]:
# Hyperparameter combination that achieved the best mean cross-validated score
grid_search.best_params_

{'estimator__max_depth': 15,
 'estimator__max_features': 4,
 'estimator__min_samples_split': 3,
 'estimator__n_estimators': 200}

7. PUTTING THE HYPERPARAMETERS IN THE PIPELINE

In [47]:
# Random forest configured with the values reported by grid_search.best_params_
tuned_forest = RandomForestClassifier(
    n_estimators = 200,
    max_depth = 15,
    min_samples_split = 3,
    max_features = 4,
    random_state = 42,
    n_jobs = 1,
)

# Same preprocessing as the baseline pipeline, with the tuned estimator on top
tuned_steps = [('preprocessor', preprocessor), ('estimator', tuned_forest)]
pipe2 = Pipeline(steps = tuned_steps)

pipe2.fit(X_train, y_train)

In [48]:
# Accuracy of the tuned pipeline (pipe2) on the held-out test rows, shown as a percentage.
# The label previously read "before fine-tuning", copied from the baseline cell.
print(f'Model Accuracy (after fine-tuning) : {pipe2.score(X_test, y_test) * 100}%')

Model Accuracy (before fine-tuning) : 86.2%
