In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
def load_data(file_path):
    """Load customer churn data"""
    df = pd.read_csv(file_path)
    print(f"Dataset shape: {df.shape}")
    return df

# 1. Basic Preprocessing
def preprocess_data(df):
    """Basic preprocessing steps"""
    # Create a copy to avoid modifying the original dataframe
    data = df.copy()
    
    # Drop customer ID as it's not predictive
    if 'CustomerID' in data.columns:
        data.drop('CustomerID', axis=1, inplace=True)
    
    # Convert 'TotalCharges' to numeric if it's not already
    if data['TotalCharges'].dtype == 'object':
        data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
        # Fill missing values with 0 or median
        data['TotalCharges'].fillna(0, inplace=True)
    
    # Convert binary categorical values to numeric (0/1)
    binary_cols = ['SeniorCitizen']
    for col in ['Gender', 'Partner', 'Dependents', 'PhoneService', 'Churn']:
        if col in data.columns:
            data[col] = data[col].map({'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0})
            binary_cols.append(col)
    
    # Create X (features) and y (target)
    y = data['Churn']
    X = data.drop('Churn', axis=1)
    
    return X, y, binary_cols

# 2. Feature Engineering Function
def engineer_features(X, y=None):
    """Create new features to improve model performance"""
    data = X.copy()
    
    # A. Customer Longevity Features
    
    # Tenure bins
    data['TenureBin'] = pd.cut(data['Tenure'], 
                              bins=[0, 12, 24, 36, 48, 60, float('inf')],
                              labels=[1, 2, 3, 4, 5, 6])
    data['IsNewCustomer'] = (data['Tenure'] <= 6).astype(int)
    data['IsLongTermCustomer'] = (data['Tenure'] >= 36).astype(int)
    
    # Average Monthly Spend
    data['AvgMonthlySpend'] = data['TotalCharges'] / (data['Tenure'] + 1)  # Add 1 to avoid division by zero
    
    # Spending pattern (is their monthly charge higher than their average?)
    data['SpendingPattern'] = (data['MonthlyCharges'] > data['AvgMonthlySpend']).astype(int)
    
    # B. Financial Features
    
    # Log transformation of charges (for Logistic Regression)
    data['LogMonthlyCharges'] = np.log1p(data['MonthlyCharges'])
    data['LogTotalCharges'] = np.log1p(data['TotalCharges'])
    
    # Price sensitivity flags
    month_charge_quantiles = data['MonthlyCharges'].quantile([0.25, 0.75]).values
    data['IsLowSpender'] = (data['MonthlyCharges'] <= month_charge_quantiles[0]).astype(int)
    data['IsHighSpender'] = (data['MonthlyCharges'] >= month_charge_quantiles[1]).astype(int)
    
    # C. Service Features
    
    # Create dummies for InternetService and Contract
    # These will be handled by the column transformer, but we'll create some combinations
    
    # Create service complexity score (number of services)
    # Assuming 'Yes' = 1 and 'No' = 0 for service columns after preprocessing
    service_cols = ['PhoneService']
    if 'InternetService' in data.columns:
        # For InternetService, create binary indicators
        data['HasFiberOptic'] = (data['InternetService'] == 'Fiber optic').astype(int)
        data['HasDSL'] = (data['InternetService'] == 'DSL').astype(int)
        data['HasNoInternet'] = (data['InternetService'] == 'No').astype(int)
        service_cols.extend(['HasFiberOptic', 'HasDSL'])
    
    # Contract Type
    if 'Contract' in data.columns:
        data['IsMonthToMonth'] = (data['Contract'] == 'Month-to-month').astype(int)
        data['IsOneYear'] = (data['Contract'] == 'One year').astype(int)
        data['IsTwoYear'] = (data['Contract'] == 'Two year').astype(int)
        
        # After creating the binary features, we can drop the original categorical columns
        # to avoid duplication (this helps with some machine learning algorithms)
        data.drop(['InternetService', 'Contract'], axis=1, inplace=True)
    
    # D. Interaction Features
    
    # Customer demographics + financial
    if 'SeniorCitizen' in data.columns:
        data['Senior_HighSpender'] = data['SeniorCitizen'] * data['IsHighSpender']
    
    if 'Partner' in data.columns:
        data['Partner_LongTerm'] = data['Partner'] * data['IsLongTermCustomer']
    
    # Service + Contract interaction
    if 'HasFiberOptic' in data.columns and 'IsMonthToMonth' in data.columns:
        data['FiberOptic_MonthToMonth'] = data['HasFiberOptic'] * data['IsMonthToMonth']
        # This is a high churn-risk group typically
    
    # E. Non-linear transformations (especially for Logistic Regression)
    
    # Polynomial features for Tenure
    data['Tenure_Squared'] = data['Tenure'] ** 2
    
    # Ratio features
    data['Charges_Tenure_Ratio'] = data['TotalCharges'] / (data['Tenure'] + 1)
    
    # F. Additional domain-specific features
    
    # Loyalty Index (higher for customers with long tenure and longer contracts)
    if 'IsOneYear' in data.columns and 'IsTwoYear' in data.columns:
        contract_score = data['IsMonthToMonth'] * 1 + data['IsOneYear'] * 2 + data['IsTwoYear'] * 3
        data['LoyaltyIndex'] = (data['Tenure'] / 72) * 0.5 + (contract_score / 3) * 0.5
    
    return data

# 3. Build preprocessing pipelines specific to each model
def build_preprocessors(X, binary_cols):
    """Build preprocessing pipelines for Random Forest and Logistic Regression"""
    
    # Identify column types
    categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
    numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64'] and col not in binary_cols]
    
    print(f"Categorical columns: {categorical_cols}")
    print(f"Numerical columns: {numerical_cols}")
    print(f"Binary columns: {binary_cols}")
    
    # Create preprocessing pipelines using list of column indices rather than names
    # to avoid the KeyError in cross-validation
    rf_transformers = []
    lr_transformers = []
    
    # Add numerical features
    if numerical_cols:
        rf_transformers.append(('num', 'passthrough', numerical_cols))
        lr_transformers.append(('num', StandardScaler(), numerical_cols))
        
    # Add categorical features
    if categorical_cols:
        rf_transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols))
        lr_transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols))
    
    # Add binary features
    if binary_cols:
        rf_transformers.append(('bin', 'passthrough', binary_cols))
        lr_transformers.append(('bin', 'passthrough', binary_cols))
    
    # For Random Forest
    rf_preprocessor = ColumnTransformer(
        transformers=rf_transformers,
        remainder='drop'
    )
    
    # For Logistic Regression
    lr_preprocessor = ColumnTransformer(
        transformers=lr_transformers,
        remainder='drop'
    )
    
    return rf_preprocessor, lr_preprocessor

# 4. Model Training and Evaluation
def train_and_evaluate_models(X, y, rf_preprocessor, lr_preprocessor):
    """Train and evaluate Random Forest and Logistic Regression models"""
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    
    # Define models
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    lr_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
    
    # Create pipelines
    rf_pipeline = Pipeline([
        ('preprocessor', rf_preprocessor),
        ('classifier', rf_model)
    ])
    
    lr_pipeline = Pipeline([
        ('preprocessor', lr_preprocessor),
        ('classifier', lr_model)
    ])
    
    try:
        # Train models directly (skipping cross-validation for now to debug)
        print("\nTraining Random Forest model...")
        rf_pipeline.fit(X_train, y_train)
        
        print("Training Logistic Regression model...")
        lr_pipeline.fit(X_train, y_train)
        
        # Evaluate on test set
        rf_pred_proba = rf_pipeline.predict_proba(X_test)[:, 1]
        lr_pred_proba = lr_pipeline.predict_proba(X_test)[:, 1]
        
        rf_auc = roc_auc_score(y_test, rf_pred_proba)
        lr_auc = roc_auc_score(y_test, lr_pred_proba)
        
        print("\nTest Set Results:")
        print(f"Random Forest ROC AUC: {rf_auc:.4f}")
        print(f"Logistic Regression ROC AUC: {lr_auc:.4f}")
        
        # Generate confusion matrix and classification report for RF
        rf_pred = rf_pipeline.predict(X_test)
        print("\nRandom Forest Confusion Matrix:")
        print(confusion_matrix(y_test, rf_pred))
        print("\nRandom Forest Classification Report:")
        print(classification_report(y_test, rf_pred))
        
        # Once the basic models work, we can try cross-validation
        # Cross-validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        print("\nPerforming Random Forest Cross-Validation:")
        try:
            rf_scores = cross_val_score(rf_pipeline, X_train, y_train, cv=cv, scoring='roc_auc', error_score='raise')
            print(f"RF ROC AUC: {rf_scores.mean():.4f} (±{rf_scores.std():.4f})")
        except Exception as e:
            print(f"Random Forest cross-validation error: {e}")
        
        print("\nPerforming Logistic Regression Cross-Validation:")
        try:
            lr_scores = cross_val_score(lr_pipeline, X_train, y_train, cv=cv, scoring='roc_auc', error_score='raise')
            print(f"LR ROC AUC: {lr_scores.mean():.4f} (±{lr_scores.std():.4f})")
        except Exception as e:
            print(f"Logistic Regression cross-validation error: {e}")
        
        # Feature importance for Random Forest (with additional error handling)
        try:
            if hasattr(rf_pipeline.named_steps['classifier'], 'feature_importances_'):
                # Get feature importances
                importances = rf_pipeline.named_steps['classifier'].feature_importances_
                
                # Get the names of features after preprocessing
                # This is a simplified approach that may not perfectly map feature names
                # but will at least provide some insight
                print("\nTop feature importances (Random Forest):")
                feature_importance = pd.DataFrame({
                    'importance': importances
                })
                feature_importance = feature_importance.sort_values('importance', ascending=False)
                print(feature_importance.head(15))
        except Exception as e:
            print(f"Error getting feature importances: {e}")
            
    except Exception as e:
        print(f"Error during model training: {e}")
        import traceback
        traceback.print_exc()
        
    return rf_pipeline, lr_pipeline

# 5. Main function
def main():
    """Main execution function"""
    
    try:
        # 1. Load data - Allow passing file path as parameter
        import sys
        file_path = 'customer_churn.csv'
        if len(sys.argv) > 1:
            file_path = sys.argv[1]
        
        print(f"Loading data from: {file_path}")
        df = load_data(file_path)
        
        # 2. Basic preprocessing
        X, y, binary_cols = preprocess_data(df)
        print(f"Data shape after preprocessing: {X.shape}")
        
        # 3. Engineer features
        X_engineered = engineer_features(X, y)
        print(f"Shape after feature engineering: {X_engineered.shape}")
        
        # 4. Build preprocessors
        rf_preprocessor, lr_preprocessor = build_preprocessors(X_engineered, binary_cols)
        
        # 5. Train and evaluate models
        rf_model, lr_model = train_and_evaluate_models(X_engineered, y, rf_preprocessor, lr_preprocessor)
        
        # 6. Save models
        from joblib import dump
        dump(rf_model, 'random_forest_model.joblib')
        dump(lr_model, 'logistic_regression_model.joblib')
        
        print("\nFeature engineering pipeline completed successfully!")
        print("Models saved as 'random_forest_model.joblib' and 'logistic_regression_model.joblib'")
        
    except Exception as e:
        print(f"Error in main execution: {e}")
        import traceback
        traceback.print_exc()
    
main()

Loading data from: --f=c:\Users\riche\AppData\Roaming\jupyter\runtime\kernel-v399914cc4074d9c61efb3c5f4e893f0d21250b287.json
Error in main execution: [Errno 22] Invalid argument: '--f=c:\\Users\\riche\\AppData\\Roaming\\jupyter\\runtime\\kernel-v399914cc4074d9c61efb3c5f4e893f0d21250b287.json'


Traceback (most recent call last):
  File "D:\tmp\ipykernel_6104\1118306964.py", line 285, in main
    df = load_data(file_path)
         ^^^^^^^^^^^^^^^^^^^^
  File "D:\tmp\ipykernel_6104\1118306964.py", line 16, in load_data
    df = pd.read_csv(file_path)
         ^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\pandas\io\parsers\readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\pandas\io\parsers\readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\pandas\io\parsers\readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Anaconda\Lib\site-packages\pandas\io\parsers\readers.py", line 1880, in _make_engine
    self.handles = get_handle(
      