Simulate a synthetic customer churn dataset (df)

In [3]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run regenerates the same synthetic customers.
np.random.seed(42)

n = 1000  # number of simulated customers (rows)

# The random columns below are drawn in the order they appear in the dict;
# reordering them would consume the seeded stream differently and change
# every simulated value.
df = pd.DataFrame(
    {
        # Zero-padded sequential identifiers: CUST0000, CUST0001, ...
        'customer_id': [f"CUST{i:04d}" for i in range(n)],
        # randint's upper bound is exclusive, so tenure ranges over 1..71.
        'tenure_months': np.random.randint(low=1, high=72, size=n),
        'monthly_charges': np.random.uniform(low=10, high=120, size=n),
        'contract_type': np.random.choice(
            ['Month-to-month', 'One year', 'Two year'],
            size=n,
        ),
        'has_internet': np.random.choice(
            ['Yes', 'No'],
            p=[0.8, 0.2],
            size=n,
        ),
        'payment_method': np.random.choice(
            ['Credit card', 'Bank transfer', 'Electronic check'],
            size=n,
        ),
        # Imbalanced label: roughly 27% positives, sampled independently of
        # the other columns.
        'churned': np.random.choice(
            [0, 1],
            p=[0.73, 0.27],
            size=n,
        ),
    }
).assign(
    # Derived feature: charges accumulated over the customer's tenure.
    total_charges=lambda frame: frame['tenure_months'] * frame['monthly_charges']
)

df.head()

Unnamed: 0,customer_id,tenure_months,monthly_charges,contract_type,has_internet,payment_method,churned,total_charges
0,CUST0000,52,104.126611,One year,Yes,Bank transfer,0,5414.583798
1,CUST0001,15,101.324185,One year,Yes,Credit card,1,1519.862777
2,CUST0002,61,53.690188,Two year,No,Credit card,0,3275.101484
3,CUST0003,21,83.489365,Month-to-month,Yes,Bank transfer,0,1753.276665
4,CUST0004,24,32.548272,One year,Yes,Electronic check,0,781.15854


Preprocessing and Modeling Pipeline

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Label vector and feature matrix. The features are every column except the
# row identifier and the label itself.
y = df['churned']
X = df.drop(columns=['customer_id', 'churned'])

# Column groups, each routed to its own preprocessing step below.
numeric_features = ['tenure_months', 'monthly_charges', 'total_charges']
categorical_features = ['contract_type', 'has_internet', 'payment_method']

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions approximately equal in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize numeric columns and one-hot encode categorical ones; categories
# not seen during fit are ignored at predict time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Bundle preprocessing and the classifier so both are fitted on the training
# split only and travel together as one object.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
# NOTE: the simulation cell samples `churned` independently of every feature,
# so the scores printed here reflect noise rather than learnable signal.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       147
           1       0.38      0.21      0.27        53

    accuracy                           0.70       200
   macro avg       0.57      0.54      0.54       200
weighted avg       0.65      0.70      0.67       200



Save the Trained Pipeline

In [5]:
import joblib
import os

# Persist the fitted pipeline (preprocessing + classifier) as one artifact.
models_dir = "models"
os.makedirs(models_dir, exist_ok=True)  # no error if the directory already exists

model_path = 'models/churn_model.joblib'
joblib.dump(pipeline, model_path)

print('model saved')

model saved
