In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from joblib import dump

In [2]:
def load_data(filepath):
    """Load and preprocess the raw data"""
    df = pd.read_csv(filepath)
    
    # Basic preprocessing (adjust based on actual dataset)
    df = df.dropna()
    df = df.drop(['customerID'], axis=1)  # Assuming customerID exists
    
    # Convert TotalCharges to numeric (common issue in this dataset)
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df = df.dropna()
    
    return df

In [3]:
filepath = r"C:\Users\MEGA\OneDrive\Desktop\py\Dataset\WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = load_data(filepath)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
def build_pipeline():
    """Build the complete ML pipeline with preprocessing and model"""
    
    # Define numeric and categorical features
    numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
    categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 
                           'MultipleLines', 'InternetService', 'OnlineSecurity',
                           'OnlineBackup', 'DeviceProtection', 'TechSupport',
                           'StreamingTV', 'StreamingMovies', 'Contract',
                           'PaperlessBilling', 'PaymentMethod']
    
    # Create preprocessing transformers
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    # Create complete pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())  # Default classifier
    ])
    
    return pipeline

In [5]:
def train_model(X_train, y_train, pipeline):
    """Train model with hyperparameter tuning"""
    
    # Define parameter grids for different models
    param_grids = {
        'logistic': {
            'classifier': [LogisticRegression(max_iter=1000)],
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__solver': ['liblinear']
        },
        'random_forest': {
            'classifier': [RandomForestClassifier()],
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [None, 5, 10],
            'classifier__min_samples_split': [2, 5]
        }
    }
    
    # Perform GridSearchCV for each model type
    best_models = {}
    for model_type, param_grid in param_grids.items():
        print(f"\nTraining {model_type}...")
        grid_search = GridSearchCV(
            pipeline,
            param_grid=param_grid,
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_train, y_train)
        best_models[model_type] = grid_search.best_estimator_
        
        print(f"Best {model_type} params:", grid_search.best_params_)
        print(f"Best {model_type} score: {grid_search.best_score_:.4f}")
    
    return best_models


In [6]:
def evaluate_models(models, X_test, y_test):
    """Evaluate models on test set"""
    results = {}
    for name, model in models.items():
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'report': report
        }
        
        print(f"\n{name} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)
    
    return results


In [7]:
def save_pipeline(pipeline, filepath):
    """Save the trained pipeline to disk"""
    dump(pipeline, filepath)
    print(f"Pipeline saved to {filepath}")

In [8]:
def main():
    # Load and prepare data
    df = load_data(filepath)
    X = df.drop('Churn', axis=1)
    y = df['Churn'].map({'Yes': 1, 'No': 0})  # Convert to binary
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Build and train pipeline
    pipeline = build_pipeline()
    models = train_model(X_train, y_train, pipeline)
    
    # Evaluate models
    results = evaluate_models(models, X_test, y_test)
    
    # Save the best model
    best_model = results['logistic']['model']
    save_pipeline(best_model, 'churn_pipeline.joblib')

if __name__ == "__main__":
    main()


Training logistic...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best logistic params: {'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best logistic score: 0.8025

Training random_forest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best random_forest params: {'classifier': RandomForestClassifier(), 'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best random_forest score: 0.8030

logistic Results:
Accuracy: 0.7996
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1033
           1       0.64      0.56      0.60       374

    accuracy                           0.80      1407
   macro avg       0.74      0.72      0.73      1407
weighted avg       0.79      0.80      0.79      1407


random_forest Results:
Accuracy: 0.7882
Classification Rep