In [16]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, PowerTransformer, QuantileTransformer, Normalizer, LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [32]:
df = pd.read_csv(r'churn.csv')

# Fix gaps and type errors in data
df['TotalCharges'].replace(" ", 0, inplace=True)
df['TotalCharges'] = df['TotalCharges'].astype(float)

# One-hot encode categorical data
df_encoded = pd.get_dummies(df.drop(['customerID'],axis=1))

# Compile feature matrix and response variable
X = df_encoded.drop(['Churn_Yes', 'Churn_No'],axis=1)
y = df_encoded.Churn_Yes

categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(exclude='object').columns
transformed_cols = pd.DataFrame()

# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [34]:
def label_binarize_categorical(df):
    lb = LabelBinarizer()
    categorical_cols = df.select_dtypes(include='object').columns
    transformed_cols = pd.DataFrame()
    
    for col in categorical_cols:
        transformed = lb.fit_transform(df[col])
        transformed_df = pd.DataFrame(transformed, columns=[f"{col}_{c}" for c in lb.classes_])
        transformed_cols = pd.concat([transformed_cols, transformed_df], axis=1)
    
    numerical_df = df[numerical_cols].reset_index(drop=True)
    
    transformed_df = pd.concat([numerical_df, transformed_cols], axis=1)
    return transformed_df

In [38]:
X_train_ = label_binarize_categorical(X_train)
X_test_ = label_binarize_categorical(X_test)

# Define the transformers for numerical and categorical features
# Define a list of feature scaling methods to test
scaling_methods = [
    ('No Scaler', None),
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('RobustScaler', RobustScaler()),
    ('MaxAbsScaler', MaxAbsScaler()),
    ('PowerTransformer', PowerTransformer()),
    ('QuantileTransformer', QuantileTransformer()),
    ('Normalizer', Normalizer())
]

# Create an empty DataFrame to store the scores
scores_df = pd.DataFrame(columns=['Method', 'Accuracy', 'Specificity', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'])

# Test each scaling method
for method, scaler in scaling_methods:
    
    if scaler is None:
        X_train_scaled, X_test_scaled = X_train, X_test
    else:
        # Fit and transform the training data
        X_train_scaled = scaler.fit_transform(X_train)
        
        # Transform the testing data using the fitted scaler
        X_test_scaled = scaler.transform(X_test)
    
    # Train a classifier (Gradient Boosting Classifier in this example) on the scaled training data
    clf = GradientBoostingClassifier()
    clf.fit(X_train_scaled, y_train)
    
    # Make predictions on the scaled testing data
    y_pred = clf.predict(X_test_scaled)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    # Calculate specificity (true negative rate)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    
    scores = pd.DataFrame({
        'Method': [method],
        'Accuracy': [accuracy],
        'Specificity': [specificity],
        'Precision': [precision],
        'Recall': [recall],
        'F1 Score': [f1],
        'ROC AUC': [roc_auc]
    })
    
    # Concatenate the current scores with the overall scores DataFrame
    scores_df = pd.concat([scores_df, scores], ignore_index=True)

# Print the scores DataFrame
display(scores_df)


Unnamed: 0,Method,Accuracy,Specificity,Precision,Recall,F1 Score,ROC AUC
0,No Scaler,0.809084,0.90444,0.672185,0.544236,0.601481,0.724338
1,StandardScaler,0.809794,0.90444,0.673267,0.546917,0.60355,0.725679
2,MinMaxScaler,0.809084,0.90444,0.672185,0.544236,0.601481,0.724338
3,RobustScaler,0.809084,0.90444,0.672185,0.544236,0.601481,0.724338
4,MaxAbsScaler,0.809794,0.90444,0.673267,0.546917,0.60355,0.725679
5,PowerTransformer,0.809084,0.90444,0.672185,0.544236,0.601481,0.724338
6,QuantileTransformer,0.809794,0.90444,0.673267,0.546917,0.60355,0.725679
7,Normalizer,0.801987,0.901544,0.657718,0.525469,0.584203,0.713507
