# 02 — Data Cleaning & Preprocessing

This notebook converts the EDA work into reproducible cleaning steps and saves a processed CSV and a fitted preprocessor.


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import os


In [2]:
# Input and output locations, written relative to the notebook's directory.
RAW = '../data/raw/Telco_Customer_Churn.csv'
PROCESSED = '../data/processed/Telco_Customer_Churn.csv'
PREPROC_PATH = '../models/preprocessor.pkl'

# Create the output folders up front so the later save steps cannot fail
# on a missing directory; existing folders are left untouched.
for _out_path in (PROCESSED, PREPROC_PATH):
    os.makedirs(os.path.dirname(_out_path), exist_ok=True)

# Load the raw file and show (rows, columns) as the cell result.
df = pd.read_csv(RAW)
df.shape


(7043, 21)

In [3]:
def basic_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the frame passed in is not modified.

    Steps (each one is skipped when the column it targets is absent):

    * drop the ``customerID`` column,
    * convert ``TotalCharges`` to a numeric dtype, turning entries that
      cannot be parsed into NaN,
    * strip leading/trailing whitespace from string values in every
      object-dtype column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied.
    """
    df = df.copy()
    if 'customerID' in df.columns:
        df = df.drop(columns=['customerID'])
    if 'TotalCharges' in df.columns:
        # errors='coerce' maps unparseable entries (e.g. blank strings) to NaN
        # instead of raising.
        df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    for c in df.select_dtypes(include='object').columns:
        # Strip only genuine strings. `.str.strip()` yields NaN for every
        # non-string element, which would silently erase data in a
        # mixed-type object column.
        df[c] = df[c].map(lambda v: v.strip() if isinstance(v, str) else v)
    return df

# Apply the cleaning steps; `df` is rebound to the cleaned copy, so every
# later cell works on the cleaned frame rather than the raw one.
df = basic_cleaning(df)
# Print column dtypes and non-null counts to check the result of the cleaning.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [4]:
# Report the number of missing cells in the whole frame before imputing.
print('Missing before:', df.isnull().sum().sum())
if 'TotalCharges' in df.columns:
    mask = df['TotalCharges'].isna()
    print('TotalCharges null rows:', mask.sum())

    # Missing totals on zero-tenure rows are set to 0. When there is no
    # `tenure` column, `df.get` falls back to 0 and the comparison is True
    # for every row.
    zero_tenure = df.get('tenure', 0) == 0
    df.loc[mask & zero_tenure, 'TotalCharges'] = 0

    # Whatever is still missing is filled with the column median, which is
    # computed after the zero fill above.
    median_total = df['TotalCharges'].median()
    df['TotalCharges'] = df['TotalCharges'].fillna(median_total)
print('Missing after:', df.isnull().sum().sum())


Missing before: 11
TotalCharges null rows: 11
Missing after: 0


In [5]:
def add_features(df: pd.DataFrame, monthly_threshold=None) -> pd.DataFrame:
    """Return a copy of ``df`` with the engineered columns added.

    Columns added (each one only when its source column is present):

    * ``high_monthly`` -- 1 where ``MonthlyCharges`` is strictly above the
      threshold, else 0.
    * ``tenure_group`` -- categorical bucket of ``tenure`` with labels
      ``'0-12'``, ``'13-24'``, ``'25-48'`` and ``'48+'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    monthly_threshold : float, optional
        Cut-off for ``high_monthly``. When omitted, the median of
        ``df['MonthlyCharges']`` is used. Pass the value computed on the
        training data to get the same feature on new data.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns.
    """
    df = df.copy()
    if 'MonthlyCharges' in df.columns:
        if monthly_threshold is None:
            monthly_threshold = df['MonthlyCharges'].median()
        df['high_monthly'] = (df['MonthlyCharges'] > monthly_threshold).astype(int)
    if 'tenure' in df.columns:
        # Bins are right-closed: (-1, 12], (12, 24], (24, 48], (48, inf).
        # The last bin is open-ended so that a large tenure still lands in
        # '48+' instead of becoming NaN; the lower edge of -1 puts tenure 0
        # in the first bucket.
        df['tenure_group'] = pd.cut(
            df['tenure'],
            bins=[-1, 12, 24, 48, float('inf')],
            labels=['0-12', '13-24', '25-48', '48+'],
        )
    return df

# Add the engineered columns; `df` is rebound to the returned copy.
df = add_features(df)
# Show the first rows so the new columns can be checked by eye.
df.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,high_monthly,tenure_group
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0,0-12
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,...,No,No,One year,No,Mailed check,56.95,1889.5,No,0,25-48
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0,0-12
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,...,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,0,25-48
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,0-12


In [6]:
# Feature matrix: everything except the target column, when it is present.
X = df.drop(columns=['Churn']) if 'Churn' in df.columns else df.copy()

# Select numeric columns with 'number' rather than the exact dtypes
# int64/float64. A numeric column of any other width (e.g. an int32 produced
# by `astype(int)` on some platforms) would otherwise end up in neither list.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# ColumnTransformer drops every column it is not told about
# (remainder='drop' is the default), so fail loudly rather than lose a
# feature silently.
uncovered = [c for c in X.columns if c not in num_cols and c not in cat_cols]
assert not uncovered, f'Columns not assigned to any transformer: {uncovered}'

# Numeric features are standardised; categorical features are one-hot
# encoded, and categories unseen at fit time are ignored at transform time.
numeric_pipeline = Pipeline([('scaler', StandardScaler())])
categorical_pipeline = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor = ColumnTransformer([('num', numeric_pipeline, num_cols), ('cat', categorical_pipeline, cat_cols)])

# NOTE(review): the preprocessor is fitted on every row of `df`. If a
# train/test split is made downstream, the scaler statistics will include
# the test rows -- confirm whether it should be refitted on the training
# split only.
preprocessor.fit(X)

# Store the column lists next to the fitted transformer so that a consumer
# can rebuild the same input layout.
joblib.dump({'preprocessor': preprocessor, 'num_cols': num_cols, 'cat_cols': cat_cols}, PREPROC_PATH)
print('Saved preprocessor to', PREPROC_PATH)


Saved preprocessor to ../models/preprocessor.pkl


In [7]:
# Write the cleaned, feature-augmented frame to disk; index=False keeps the
# row index out of the file.
df.to_csv(PROCESSED, index=False)
print('Processed CSV saved to', PROCESSED)


Processed CSV saved to ../data/processed/Telco_Customer_Churn.csv


## Next steps
- Move cleaning functions to `src/data_preprocessing.py` and `src/feature_engineering.py`
- Use `03_model_training.ipynb` to train models with this processed data
