# Preprocessing & Pipelines

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib

print('imports ok')

imports ok


In [2]:
# Load data

df = pd.read_csv('../data/Churn_Modelling.csv')
# drop unnamed first column if present
if df.columns[0] == '' or str(df.columns[0]).startswith('Unnamed'):
    df.drop(df.columns[0], axis=1, inplace=True)

print('shape:', df.shape)
display(df.head())

shape: (10000, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Missing values and duplicates
print('Missing values (count & percent):')
missing = df.isna().sum().to_frame('missing')
missing['pct'] = missing['missing'] / len(df) * 100
display(missing.sort_values('pct', ascending=False))

print('\nDuplicates before:', df.duplicated().sum())
df.drop_duplicates(inplace=True)
print('Duplicates after:', df.duplicated().sum())

Missing values (count & percent):


Unnamed: 0,missing,pct
RowNumber,0,0.0
CustomerId,0,0.0
Surname,0,0.0
CreditScore,0,0.0
Geography,0,0.0
Gender,0,0.0
Age,0,0.0
Tenure,0,0.0
Balance,0,0.0
NumOfProducts,0,0.0



Duplicates before: 0
Duplicates after: 0


In [4]:
# Identify churn/target column
possible = ['Exited','Churn','churn','is_churn','IsChurn']
churn_col = next((c for c in possible if c in df.columns), None)
if churn_col is None:
    raise KeyError('Could not find churn column; expected one of: ' + str(possible))
print('Using churn/target column:', churn_col)

Using churn/target column: Exited


In [5]:
# Feature type consistency
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
# remove target from feature lists if present
if churn_col in numeric_cols:
    numeric_cols.remove(churn_col)
if churn_col in cat_cols:
    cat_cols.remove(churn_col)

print('Numeric features:', numeric_cols)
print('Categorical features:', cat_cols)

Numeric features: ['RowNumber', 'CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
Categorical features: ['Surname', 'Geography', 'Gender']


In [6]:
# Build preprocessing pipelines

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols)
])

# A minimal pipeline (no estimator) to produce processed features; add estimator later
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

print('pipeline ready')

pipeline ready


In [7]:
# Train/test split and fit pipeline (avoid leakage: fit only on train)

X = df.drop(columns=[churn_col])
y = df[churn_col]

# use stratify when possible to preserve class balance
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
except Exception:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit pipeline on training features
pipeline.fit(X_train)

X_train_trans = pipeline.transform(X_train)
X_test_trans = pipeline.transform(X_test)

print('Transformed shapes -> train:', X_train_trans.shape, 'test:', X_test_trans.shape)

Transformed shapes -> train: (8000, 2635) test: (2000, 2635)


In [8]:
# Inspect resulting feature names (works with sklearn >=1.0)
try:
    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    print('Feature names length:', len(feature_names))
except Exception:
    print('Could not call get_feature_names_out() â€” inspect ColumnTransformer components instead')

Feature names length: 2635


In [9]:
# Save preprocessing pipeline
joblib.dump(pipeline, 'preprocessing_pipeline.joblib')
print('Saved pipeline to preprocessing_pipeline.joblib')

Saved pipeline to preprocessing_pipeline.joblib
