In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
df = pd.read_csv('../data/processed/Totalchargesfixed.csv')
df.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        # 1. Tenure categories
        X['TenureCategory'] = pd.cut(
            X['tenure'],
            bins=[0, 12, 36, np.inf],
            labels=['New', 'Established', 'Loyal']
        )
        
        # 2. Service adoption score
        service_cols = [
            'PhoneService', 'MultipleLines', 'InternetService', 
            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
            'TechSupport', 'StreamingTV', 'StreamingMovies'
        ]
        X['ServiceAdoptionScore'] = X[service_cols].apply(
            lambda row: sum(row == 'Yes'), axis=1
        )
        
        # 3. Average monthly charges per service
        X['AvgChargesPerService'] = X['MonthlyCharges'] / (X['ServiceAdoptionScore'] + 1)
        
        # 4. Payment reliability indicator
        X['PaymentReliability'] = np.where(
            (X['Contract'] == 'Month-to-month') & (X['PaperlessBilling'] == 'Yes'),
            'Low',
            'High'
        )
        
        return X

# ====================================
# Preprocessing Pipeline
# ====================================

# Separate target variable
y = df['Churn']
X = df.drop(columns=['Churn'])

# Apply feature engineering
feat_eng = FeatureEngineering()
X_eng = feat_eng.fit_transform(X)

# Identify categorical & numeric columns
categorical_cols = X_eng.select_dtypes(include=['object', 'category']).columns
numeric_cols = X_eng.select_dtypes(include=['int64', 'float64']).columns

# Define preprocessing steps
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_transformer, categorical_cols),
        ('numeric', numeric_transformer, numeric_cols)
    ]
)

# Final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit-transform
X_processed = pipeline.fit_transform(X_eng)

# ====================================
# Convert back to DataFrame
# ====================================

# Get feature names
cat_features = pipeline.named_steps['preprocessor'].named_transformers_['categorical']\
    .named_steps['onehot'].get_feature_names_out(categorical_cols)
num_features = numeric_cols

final_feature_names = list(cat_features) + list(num_features)

# Convert to DataFrame
X_final = pd.DataFrame(
    X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed,
    columns=final_feature_names,
    index=X_eng.index
)

# Add target column back
X_final['Churn'] = y.values

print(X_final.shape)
X_final.head()

(7043, 53)


Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,...,TenureCategory_New,PaymentReliability_High,PaymentReliability_Low,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,ServiceAdoptionScore,AvgChargesPerService,Churn
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,-0.439916,-1.277445,-1.160323,-0.992611,-1.145997,-0.080263,No
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,-0.439916,0.066327,-0.259629,-0.172165,-0.176011,-0.195062,No
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,-0.439916,-1.236724,-0.36266,-0.958066,-0.176011,-0.324472,Yes
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,-0.439916,0.514251,-0.746535,-0.193672,-0.176011,-0.806627,No
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,-0.439916,-1.236724,0.197365,-0.938874,-1.145997,3.330307,Yes
