In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px 
import plotly.graph_objects as go
import nbformat
from plotly.subplots import make_subplots
import six  
import numpy as np
from numpy import random as rand 
import plotly.io as pio
pio.renderers.default = "notebook_connected"
import plotly.figure_factory as ff


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import RobustScaler, PowerTransformer, FunctionTransformer, KBinsDiscretizer, OrdinalEncoder, OneHotEncoder, LabelEncoder,StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Predictive Modeling

### Baseline with Satisfaction Score

Load Data

In [5]:
# Read the prepared modelling table from disk (comma-separated, which is pandas' default)
df_model = pd.read_csv("Telco_data/df_model.csv")
# Preview the first rows as plain text
print(df_model.head())

  Customer ID Referred a Friend  Number of Referrals  Tenure in Months  \
0  8779-QRDMV                No                    0                 1   
1  7495-OOKFY               Yes                    1                 8   
2  1658-BYGOY                No                    0                18   
3  4598-XLKNJ               Yes                    1                25   
4  4846-WHAFZ               Yes                    1                37   

     Offer Phone Service  Avg Monthly Long Distance Charges Multiple Lines  \
0      NaN            No                               0.00             No   
1  Offer E           Yes                              48.85            Yes   
2  Offer D           Yes                              11.33            Yes   
3  Offer C           Yes                              19.76             No   
4  Offer C           Yes                               6.33            Yes   

  Internet Service Internet Type  ...  Total Revenue Churn Value  \
0              Yes

In [7]:
# Show the dtype of every column in the modelling table
df_model.dtypes

Customer ID                           object
Referred a Friend                     object
Number of Referrals                    int64
Tenure in Months                       int64
Offer                                 object
Phone Service                         object
Avg Monthly Long Distance Charges    float64
Multiple Lines                        object
Internet Service                      object
Internet Type                         object
Avg Monthly GB Download                int64
Online Security                       object
Online Backup                         object
Device Protection Plan                object
Premium Tech Support                  object
Streaming TV                          object
Streaming Movies                      object
Streaming Music                       object
Unlimited Data                        object
Contract                              object
Paperless Billing                     object
Payment Method                        object
Monthly Ch

Preprocess

In [None]:
# Collect every object-typed column and cast them all to pandas' categorical dtype
object_features = df_model.select_dtypes(include=['object']).columns.tolist()
df_model = df_model.astype({feature: 'category' for feature in object_features})

# Confirm the new dtypes
print(df_model.dtypes)

In [None]:
# Treat 'Satisfaction Score' as a categorical variable as well
df_model = df_model.astype({'Satisfaction Score': 'category'})

In [None]:
# Numeric features (standardized by the preprocessor)
numeric_var = [
    'Age',
    'Number of Dependents',
    'Number of Referrals',
    'Tenure in Months',
    'Avg Monthly Long Distance Charges',
    'Avg Monthly GB Download',
    'Monthly Charge',
    'Total Charges',
    'Total Refunds',
    'Total Extra Data Charges',
    'Total Long Distance Charges',
    'Total Revenue',
]

# Categorical features (one-hot encoded by the preprocessor)
categoric_var = [
    'Referred a Friend',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Internet Type',
    'Online Security',
    'Online Backup',
    'Device Protection Plan',
    'Premium Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Streaming Music',
    'Unlimited Data',
    'Contract',
    'Paperless Billing',
    'Payment Method',
    'Under 30',
    'Senior Citizen',
    'Married',
    'Dependents',
    'Gender',
    'Offer',
]

# Ordinal features (ordinal-encoded by the preprocessor)
ordinal_var = ['Satisfaction Score']


Models

In [None]:
# Candidate classifiers for the baseline; every model is seeded with random_state=42
models = {
    # C=0.1 -> stronger L2 regularization
    'Logistic Regression': LogisticRegression(
        penalty='l2',
        C=0.1,
        max_iter=3000,
        random_state=42,
    ),
    # Shallow trees that need at least 10 samples before a node may split
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        random_state=42,
    ),
    # Depth-limited boosting; each tree is grown on 80% of the rows
    'XGBoost': XGBClassifier(
        n_estimators=100,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        eval_metric='logloss',
        random_state=42,
    ),
}

Cross-validation

In [None]:
# Hold out 15% of the rows as a stratified test set
X = df_model[numeric_var + categoric_var + ordinal_var]
y = df_model['Churn Value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# One-hot encode the categorical columns, standardize the numeric ones and
# ordinal-encode the satisfaction score (no explicit missing-value handling).
# The transformed matrix is laid out in this transformer order: cat, num, ord.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categoric_var),
        ('num', StandardScaler(), numeric_var),
        ('ord', OrdinalEncoder(), ordinal_var)
    ])

# 10-fold cross-validation of every model on the training split
cv_df = []
for name, model in models.items():
    model_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    score = cross_validate(model_pipe, X_train, y_train, cv=10, return_train_score=False, scoring=['roc_auc', 'f1', 'recall', 'precision'])
    # One two-column frame per model: metric name and its mean over the folds
    cv_df.append(pd.DataFrame({
        'Attribute': list(score.keys()),
        f'{name}': [round(np.mean(fold_values), 3) for fold_values in score.values()],
    }))

# Join the per-model frames side by side on the metric name
score_before = pd.DataFrame(columns=['Attribute'])
for model_scores in cv_df:
    score_before = score_before.merge(right=model_scores, on='Attribute', how='outer')

# Keep only the metric rows. cross_validate also reports 'fit_time' and
# 'score_time'; select by the 'test_' prefix rather than by row position so the
# result does not depend on how the merge happens to order the keys.
score_before = score_before[score_before['Attribute'].str.startswith('test_')].reset_index(drop=True)

print(score_before)

Fitting and Comparing test and train accuracies and errors

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc

In [None]:
# One summary row per model is collected here
results = []

# Fit each pipeline on the training split, then score it on both splits
for name, model in models.items():
    model_pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    model_pipe.fit(X_train, y_train)
    # ROC AUC is only computed when the pipeline exposes class probabilities
    supports_proba = hasattr(model_pipe, "predict_proba")

    # Training-set accuracy, error and ROC AUC
    y_train_pred = model_pipe.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_error = 1 - train_accuracy
    train_roc_auc = None
    if supports_proba:
        y_train_pred_proba = model_pipe.predict_proba(X_train)[:, 1]
        train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

    # Test-set accuracy, error and ROC AUC
    y_test_pred = model_pipe.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_error = 1 - test_accuracy
    test_roc_auc = None
    if supports_proba:
        y_test_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
        test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

    results.append({
        'Model': name,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train Error': train_error,
        'Test Error': test_error,
        'Train ROC AUC': train_roc_auc,
        'Test ROC AUC': test_roc_auc
    })

    print(f"Classification report for {name} on test set:")
    print(classification_report(y_test, y_test_pred))
    print("="*60)

# Tabulate the collected rows
results_df = pd.DataFrame(results)
print(results_df)

Learning curves

In [None]:
# Define a function to plot the learning curve with a specific scoring metric
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), scoring=None):
    """Draw training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : estimator or pipeline passed straight to ``learning_curve``.
    title : str
        Figure title.
    X, y : features and target passed to ``learning_curve``.
    ylim : tuple, optional
        ``(ymin, ymax)`` for the y-axis; left automatic when None.
    cv, n_jobs, train_sizes, scoring :
        Forwarded unchanged to ``learning_curve``. ``scoring`` is also used as
        the y-axis label ("Score" when it is None).

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure as the current
    figure, so the caller can call ``.show()`` on it.
    """
    # Imported locally: none of the notebook's import cells bring
    # `learning_curve` into scope, so the call below would raise NameError.
    from sklearn.model_selection import learning_curve

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel(scoring if scoring else "Score")  # Display the scoring metric used

    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring)

    # Aggregate over the CV folds (axis 1) for every training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    # Shaded bands show +/- one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt


In [None]:
# Plot one learning curve per (metric, model) pair
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for metric in metrics:
    for name, model in models.items():
        model_pipe = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        # Forward the loop's metric as `scoring`; without it every figure would
        # show the estimator's default score while its title names another metric.
        plot_learning_curve(model_pipe, f'Learning Curve for {name} ({metric})', X_train, y_train, cv=5, scoring=metric)
        plt.show()

Metrics


In [None]:
# Reshape the cross-validation table to long form: one row per (metric, model)
cv_df_long = pd.melt(score_before, id_vars=["Attribute"], var_name="Model", value_name="Score")

# Grouped bar chart comparing the models on every metric
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Attribute", y="Score", hue="Model", data=cv_df_long, ax=ax)
ax.set_title('Model Performance Metrics Comparison')
ax.set_ylabel('Score')
ax.set_xlabel('Metric')
ax.legend(loc='upper right')
plt.show()

ROC AUC

In [None]:
# Overlay the test-set ROC curve of every model on a single axes
fig, ax = plt.subplots(figsize=(10, 8))
for name, model in models.items():
    model_pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    model_pipe.fit(X_train, y_train)

    # Probability of the positive class on the held-out rows
    y_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

Feature Importance

In [None]:
# Fit every model and plot its feature importances (or absolute coefficients)
for name, model in models.items():
    model_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor), ('model', model)])
    model_pipe.fit(X_train, y_train)
    fitted_model = model_pipe.named_steps['model']

    if hasattr(fitted_model, 'feature_importances_'):
        # Tree-based models
        feature_importances = fitted_model.feature_importances_
    elif isinstance(fitted_model, LogisticRegression):
        # Logistic Regression: use coefficient magnitude as the importance
        feature_importances = np.abs(fitted_model.coef_[0])
    else:
        # Skip models that don't have feature importances or coefficients
        continue

    # Get feature names
    cat_feature_names = model_pipe.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categoric_var)
    ord_feature_names = ordinal_var  # Ordinal features retain their original names
    # The ColumnTransformer emits its columns in transformer order
    # ('cat', 'num', 'ord'), so the names must be concatenated in that same
    # order; listing the numeric names first would mislabel every importance.
    feature_names = list(cat_feature_names) + numeric_var + ord_feature_names

    # Create DataFrame for feature importance
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Plot feature importance
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title(f'Feature Importance for {name}')
    plt.show()

### Baseline without Satisfaction Score

In [None]:
# Numeric features for the baseline that leaves out 'Satisfaction Score'
numeric_var = [
    'Age',
    'Number of Dependents',
    'Number of Referrals',
    'Tenure in Months',
    'Avg Monthly Long Distance Charges',
    'Avg Monthly GB Download',
    'Monthly Charge',
    'Total Charges',
    'Total Refunds',
    'Total Extra Data Charges',
    'Total Long Distance Charges',
    'Total Revenue',
]

# Categorical features (one-hot encoded by the preprocessor)
categoric_var = [
    'Referred a Friend',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Internet Type',
    'Online Security',
    'Online Backup',
    'Device Protection Plan',
    'Premium Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Streaming Music',
    'Unlimited Data',
    'Contract',
    'Paperless Billing',
    'Payment Method',
    'Under 30',
    'Senior Citizen',
    'Married',
    'Dependents',
    'Gender',
    'Offer',
]



Cross-validation

In [None]:
# Same three candidate classifiers as the first baseline, re-instantiated unfitted
models = {
    # C=0.1 -> stronger L2 regularization
    'Logistic Regression': LogisticRegression(
        penalty='l2',
        C=0.1,
        max_iter=3000,
        random_state=42,
    ),
    # Shallow trees that need at least 10 samples before a node may split
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        random_state=42,
    ),
    # Depth-limited boosting; each tree is grown on 80% of the rows
    'XGBoost': XGBClassifier(
        n_estimators=100,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        eval_metric='logloss',
        random_state=42,
    ),
}

# Hold out 15% of the rows as a stratified test set (no 'Satisfaction Score' column)
X = df_model[numeric_var + categoric_var]
y = df_model['Churn Value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# One-hot encode the categorical columns and standardize the numeric ones
# (no explicit missing-value handling). The transformed matrix is laid out in
# this transformer order: cat first, then num.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categoric_var),
        ('num', StandardScaler(), numeric_var)
    ])

# 10-fold cross-validation of every model on the training split
cv_df = []
for name, model in models.items():
    model_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    score = cross_validate(model_pipe, X_train, y_train, cv=10, return_train_score=False, scoring=['roc_auc', 'f1', 'recall', 'precision'])
    # One two-column frame per model: metric name and its mean over the folds
    cv_df.append(pd.DataFrame({
        'Attribute': list(score.keys()),
        f'{name}': [round(np.mean(fold_values), 3) for fold_values in score.values()],
    }))

# Join the per-model frames side by side on the metric name
score_before = pd.DataFrame(columns=['Attribute'])
for model_scores in cv_df:
    score_before = score_before.merge(right=model_scores, on='Attribute', how='outer')

# Keep only the metric rows. cross_validate also reports 'fit_time' and
# 'score_time'; select by the 'test_' prefix rather than by row position so the
# result does not depend on how the merge happens to order the keys.
score_before = score_before[score_before['Attribute'].str.startswith('test_')].reset_index(drop=True)

print(score_before)

Model Fit on Test set & Comparing test and train accuracies

In [None]:
# One summary row per model is collected here
results = []

# Fit each pipeline on the training split, then score it on both splits
for name, model in models.items():
    model_pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    model_pipe.fit(X_train, y_train)
    # ROC AUC is only computed when the pipeline exposes class probabilities
    supports_proba = hasattr(model_pipe, "predict_proba")

    # Training-set accuracy and ROC AUC
    y_train_pred = model_pipe.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_roc_auc = None
    if supports_proba:
        y_train_pred_proba = model_pipe.predict_proba(X_train)[:, 1]
        train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)

    # Test-set accuracy and ROC AUC
    y_test_pred = model_pipe.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_roc_auc = None
    if supports_proba:
        y_test_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
        test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

    results.append({
        'Model': name,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train ROC AUC': train_roc_auc,
        'Test ROC AUC': test_roc_auc
    })

    print(f"Classification report for {name} on test set:")
    print(classification_report(y_test, y_test_pred))
    print("="*60)

# Tabulate the collected rows
results_df = pd.DataFrame(results)
print(results_df)


Learning Curves


In [None]:
 
# Define a function to plot the learning curve with a specific scoring metric
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), scoring=None):
    """Draw training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : estimator or pipeline passed straight to ``learning_curve``.
    title : str
        Figure title.
    X, y : features and target passed to ``learning_curve``.
    ylim : tuple, optional
        ``(ymin, ymax)`` for the y-axis; left automatic when None.
    cv, n_jobs, train_sizes, scoring :
        Forwarded unchanged to ``learning_curve``. ``scoring`` is also used as
        the y-axis label ("Score" when it is None).

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure as the current
    figure, so the caller can call ``.show()`` on it.
    """
    # Imported locally: none of the notebook's import cells bring
    # `learning_curve` into scope, so the call below would raise NameError.
    from sklearn.model_selection import learning_curve

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel(scoring if scoring else "Score")  # Display the scoring metric used

    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring)

    # Aggregate over the CV folds (axis 1) for every training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    # Shaded bands show +/- one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt


# Plot one learning curve per (metric, model) pair
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for metric in metrics:
    for name, model in models.items():
        model_pipe = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        # Score with the loop's metric; a hard-coded 'recall' would draw the
        # same recall curve under five different metric titles.
        plot_learning_curve(model_pipe, f'Learning Curve for {name} ({metric})', X_train, y_train, cv=5, scoring=metric)
        plt.show()

Metrics

In [None]:
# Reshape the cross-validation table to long form: one row per (metric, model)
cv_df_long = pd.melt(score_before, id_vars=["Attribute"], var_name="Model", value_name="Score")

# Grouped bar chart comparing the models on every metric
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Attribute", y="Score", hue="Model", data=cv_df_long, ax=ax)
ax.set_title('Model Performance Metrics Comparison')
ax.set_ylabel('Score')
ax.set_xlabel('Metric')
ax.legend(loc='upper right')
plt.show()

ROC AUC

In [None]:
# Overlay the test-set ROC curve of every model on a single axes
fig, ax = plt.subplots(figsize=(10, 8))
for name, model in models.items():
    model_pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    model_pipe.fit(X_train, y_train)

    # Probability of the positive class on the held-out rows
    y_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

Feature importance for tree-based models and logistic regression



In [None]:
# Fit every model and plot its feature importances (or absolute coefficients)
for name, model in models.items():
    model_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor), ('model', model)])
    model_pipe.fit(X_train, y_train)
    fitted_model = model_pipe.named_steps['model']

    if hasattr(fitted_model, 'feature_importances_'):
        # Tree-based models
        feature_importances = fitted_model.feature_importances_
    elif isinstance(fitted_model, LogisticRegression):
        # Logistic Regression: use coefficient magnitude as the importance
        feature_importances = np.abs(fitted_model.coef_[0])
    else:
        # Skip models that don't have feature importances or coefficients
        continue

    # Get feature names
    cat_feature_names = model_pipe.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categoric_var)
    # The ColumnTransformer emits its columns in transformer order ('cat' then
    # 'num'), so the names must be concatenated in that same order; listing the
    # numeric names first would mislabel every importance.
    feature_names = list(cat_feature_names) + numeric_var

    # Create DataFrame for feature importance
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Plot feature importance
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title(f'Feature Importance for {name}')
    plt.show()

## Feature Selection

#### Extra Data Preprocessing 


In [None]:
from sklearn.feature_selection import SelectFromModel
from scipy.stats import skew, shapiro, normaltest

In [None]:
# Re-declare these Yes/No columns of df_model as ordered categoricals whose
# category order is ['No', 'Yes'] (instead of ['Yes', 'No'])
columns_to_switch = ['Internet Service', 'Device Protection Plan', 'Streaming Movies', 'Paperless Billing', 'Senior Citizen']

for col in columns_to_switch:
    df_model[col] = df_model[col].astype(pd.CategoricalDtype(categories=['No', 'Yes'], ordered=True))

# Columns whose levels are inspected below
binary_var = [
    'Referred a Friend', 'Phone Service', 'Multiple Lines', 'Online Security', 'Online Backup',
    'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
    'Streaming Music', 'Unlimited Data', 'Paperless Billing', 'Under 30', 'Senior Citizen',
    'Married', 'Dependents', 'Offer', 'Payment Method', 'Gender', 'Contract',
]

# Print the levels (and their order) of each inspected column
for col in binary_var:
    column_values = df_model[col]
    print('Variable :', col)
    print('Unique Values :', column_values.unique())
    print('Number of Unique Values :', column_values.nunique())
    print('\n')

##### Dealing with outliers

In [None]:
# Flag outliers with the 1.5 * IQR rule
def find_outliers_IQR(Series):
    """Return (outliers, lower_bound, upper_bound) for a numeric Series.

    The bounds are Q1 - 1.5*IQR and Q3 + 1.5*IQR; ``outliers`` holds the
    values of ``Series`` lying strictly outside those bounds.
    """
    first_quartile = Series.quantile(0.25)
    third_quartile = Series.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker
    outside = Series.lt(lower_bound) | Series.gt(upper_bound)
    return Series[outside], lower_bound, upper_bound

# Histogram (with KDE overlay) of every numeric feature on a 5x4 subplot grid
plt.figure(figsize=(20,10))
for position, feature in enumerate(numeric_var, start=1):
    plt.subplot(5, 4, position)
    sns.histplot(df_model[feature], kde=True, color='green')
plt.tight_layout()
plt.show()

In [None]:
# Per-feature skewness, Shapiro-Wilk normality check and IQR outlier counts.
# Each statistic is computed once per feature. The Shapiro sample is seeded so
# the reported p-value and the 'Conclusion' derived from it always come from
# the same draw and are reproducible across runs.
num_records = []
for feature in numeric_var:
    feature_values = df_model[feature]
    # Up to 3000 rows are tested (fewer only if the table itself is smaller)
    shapiro_sample = feature_values.sample(min(3000, len(feature_values)), random_state=42)
    shapiro_p = shapiro(shapiro_sample)[1]
    outlier_count = find_outliers_IQR(feature_values)[0].shape[0]
    num_records.append({
        'Numeric Var': feature,
        'Skewness': round(skew(feature_values, bias=False), 3),
        'Shapiro P-val': round(shapiro_p, 5),
        'Conclusion': 'Normal' if shapiro_p >= 0.05 else 'Non-Normal',
        'Outlier Count': outlier_count,
        'Outlier %': round((outlier_count / df_model.shape[0]) * 100, 2),
    })

num_conc = pd.DataFrame(
    num_records,
    columns=['Numeric Var', 'Skewness', 'Shapiro P-val', 'Conclusion', 'Outlier Count', 'Outlier %'],
).sort_values(by='Skewness', ignore_index=True)
display(num_conc,df_model[numeric_var].describe())

Outliers: 'Number of Dependents', 'Total Extra Data Charges', 'Total Refunds' , 'Number of Referrals', 'Total Long Distance Charges','Avg Monthly GB Download','Total Revenue'

The following outlier operations will be carried out:
- For the feature 'Number of Dependents', all values above 3 will be capped at 3.
- The features 'Total Extra Data Charges' and 'Total Refunds' will be reduced to binary categorical data: every 0 becomes 'No' and every other value becomes 'Yes'.
- For the other features, outliers will be treated as missing values, which will later be imputed using the KNN imputer.




In [None]:
# Split the outlier-affected variables into three groups
out_var1 = [
    'Total Refunds',
    'Total Extra Data Charges',
]
out_var2 = [
    'Number of Dependents',
]
out_var3 = [
    'Number of Referrals',
    'Total Long Distance Charges',
    'Avg Monthly GB Download',
    'Total Revenue',
]

##### Encoding to categorical

In [None]:
# Count and share of zero vs. non-zero values in the two charge columns.
# Zeros are counted directly; reading the top entry of value_counts() would
# only be right when 0 happens to be the most frequent value, and would raise
# KeyError otherwise.
for i in ['Total Extra Data Charges','Total Refunds']:
    n_rows = df_model.shape[0]
    n_zero = int((df_model[i] == 0).sum())
    n_nonzero = df_model[df_model[i] != 0].shape[0]
    print("{} == 0 :".format(i), n_zero, '| {persen} %'.format(persen=round((n_zero / n_rows) * 100, 3)))
    print("{} != 0 :".format(i), n_nonzero, '| {persen} %'.format(persen=round((n_nonzero / n_rows) * 100, 3)))
    print('\n')

In [None]:
# Work on an independent copy: a plain assignment would only alias df_model,
# so recoding columns on df_transform would silently overwrite df_model too.
df_transform = df_model.copy()

In [None]:
# Recode the features with outliers to categorical: 'No' when the value is 0, 'Yes' otherwise
for charge_col in ['Total Extra Data Charges', 'Total Refunds']:
    is_zero = df_transform[charge_col] == 0
    df_transform[charge_col] = is_zero.map({True: 'No', False: 'Yes'})

In [None]:
# Check that the recoding worked by printing the recoded column
print(df_transform['Total Extra Data Charges'])

In [None]:
# Print the levels of the two recoded columns
binary_variables = ['Total Extra Data Charges','Total Refunds']

for col in binary_variables:
    recoded = df_transform[col]
    print('Variable :', col)
    print('Unique Values :', recoded.unique())
    print('Number of Unique Values :', recoded.nunique())
    print('\n')

In [None]:
# Re-declare the listed df_transform columns as ordered categoricals whose
# category order is ['No', 'Yes'] (instead of ['Yes', 'No'])
columns_to_switch_2 = ['Total Extra Data Charges']

for col in columns_to_switch_2:
    df_transform[col] = df_transform[col].astype(pd.CategoricalDtype(categories=['No', 'Yes'], ordered=True))

In [None]:
# Print the levels (and their order) of the two recoded columns again
binary_variables = ['Total Extra Data Charges','Total Refunds']

for col in binary_variables:
    recoded = df_transform[col]
    print('Variable :', col)
    print('Unique Values :', recoded.unique())
    print('Number of Unique Values :', recoded.nunique())
    print('\n')

##### IQR method

Caps the values of 'Number of Dependents' at 3 

Identifies outliers in specified columns using the IQR method and replaces values above the upper bound with NaN. 

Imputes the NaN values using the KNN imputer. 

In [None]:
from sklearn.impute import KNNImputer, SimpleImputer

In [None]:
# Flag outliers with the 1.5 * IQR rule
def find_outliers_IQR(Series):
    """Return (outliers, lower_bound, upper_bound) for a numeric Series.

    The bounds are Q1 - 1.5*IQR and Q3 + 1.5*IQR; ``outliers`` holds the
    values of ``Series`` lying strictly outside those bounds.
    """
    first_quartile = Series.quantile(0.25)
    third_quartile = Series.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker
    outside = Series.lt(lower_bound) | Series.gt(upper_bound)
    return Series[outside], lower_bound, upper_bound

# function handling outliers
def func_handling_outlier(X_train):
    """Cap, blank out and re-impute outliers, returning the model columns.

    Steps:
      1. 'Number of Dependents' is capped at 3.
      2. For each column in ``out_var3``, values above the truncated upper IQR
         bound (computed on ``df_transform``) are replaced with NaN.
      3. The NaNs are filled with a 7-neighbour KNNImputer fitted on those
         same four columns of the given frame.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame containing the columns handled here plus every column in
        ``numeric_var + ordinal_var + nominal_var``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed copy restricted to
        ``numeric_var + ordinal_var + nominal_var``.
    """
    # Work on a copy so the caller's frame is left untouched
    X_train = X_train.copy()
    # handling variable 2 outliers
    X_train['Number of Dependents'] = X_train['Number of Dependents'].clip(upper=3)
    # handling variable 3 outliers
    out_var3 = ['Number of Referrals','Total Long Distance Charges','Avg Monthly GB Download','Total Revenue']
    for column in out_var3:
        # upper IQR bound, truncated to an int
        upper_bound = int(find_outliers_IQR(df_transform[column])[2])
        # Values above the bound become NaN. `mask` avoids the `np.NaN` alias,
        # which NumPy 2.0 removed.
        X_train[column] = X_train[column].mask(X_train[column] > upper_bound)
    # impute the NaN values using the K-Nearest Neighbors (KNN) imputer
    imputer = KNNImputer(n_neighbors=7)
    X_train[out_var3] = imputer.fit_transform(X_train[out_var3])
    return X_train[numeric_var + ordinal_var + nominal_var]


# pipeline for handling outlier: applies func_handling_outlier to the listed columns
# NOTE(review): `nominal_var` is first assigned in a later cell of this notebook, so on a
# fresh kernel this statement raises NameError; `hand_outliers` is also reassigned further
# down. Confirm whether this cell is still needed.
hand_outliers = ColumnTransformer([
    ('handling_outlier', FunctionTransformer(func_handling_outlier),numeric_var + ordinal_var + nominal_var)
    ]) 

### Feature selection without Satisfaction Score

In [None]:
# Ordinal variables and the level order used to encode them.
# Excluded variables: 'Churn Value', 'Satisfaction Score', 'Customer ID'.
# 'Total Extra Data Charges' and 'Total Refunds' are included as categorical.
# Each entry maps a group key to [columns, ordered levels].
ord_ = {
    'ord1': [
        [
            'Married', 'Dependents', 'Referred a Friend', 'Phone Service',
            'Premium Tech Support', 'Streaming Music', 'Unlimited Data', 'Under 30',
            'Online Security', 'Online Backup', 'Streaming TV', 'Multiple Lines',
            'Internet Service', 'Device Protection Plan', 'Streaming Movies',
            'Paperless Billing', 'Senior Citizen',
            'Total Extra Data Charges', 'Total Refunds',
        ],
        ['No', 'Yes'],
    ],
    # Levels must be listed in order of increasing internet speed
    'ord2': [
        ['Internet Type'],
        ['None', 'DSL', 'Cable', 'Fiber Optic'],
    ],
    'ord3': [
        ['Contract'],
        ['Month-to-Month', 'One Year', 'Two Year'],
    ],
}

# Unordered categorical variables
nominal_var = ['Offer', 'Gender', 'Payment Method']

# Numeric variables
numeric_var = [
    'Number of Referrals',
    'Tenure in Months',
    'Avg Monthly Long Distance Charges',
    'Avg Monthly GB Download',
    'Monthly Charge',
    'Total Charges',
    'Total Long Distance Charges',
    'Total Revenue',
    'Age',
    'Number of Dependents',
]

In [None]:
# Tree-based estimators used for feature selection, both seeded with random_state=42
feature_selection_models = {
    # Shallow trees that need at least 10 samples before a node may split
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        random_state=42,
    ),
    # Depth-limited boosting; each tree is grown on 80% of the rows
    'XGBoost': XGBClassifier(
        n_estimators=100,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        eval_metric='logloss',
        random_state=42,
    ),
}


Model Fit, Feature Importance and Cross Validation

In [None]:
# Flatten `ord_` into two parallel lists: the ordinal column names and, for
# each of those columns, the ordered list of its levels
ordinal_var = []
ord_categories = []
for group_columns, group_levels in ord_.values():
    ordinal_var.extend(group_columns)
    ord_categories.extend([group_levels] * len(group_columns))

# Flag outliers with the 1.5 * IQR rule
def find_outliers_IQR(series):
    """Return (outliers, lower_bound, upper_bound) for a numeric Series.

    The bounds are Q1 - 1.5*IQR and Q3 + 1.5*IQR; ``outliers`` holds the
    values of ``series`` lying strictly outside those bounds.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker
    outside = series.lt(lower_bound) | series.gt(upper_bound)
    return series[outside], lower_bound, upper_bound

# Function for handling outliers
def func_handling_outlier(X):
    """Return a copy of ``X`` with outliers capped or re-imputed.

    Steps:
      1. 'Number of Dependents' is capped at 3.
      2. For each column in ``out_var3``, values above the truncated upper IQR
         bound of that column (computed on ``X`` itself) are replaced with NaN.
      3. The NaNs are filled with a 7-neighbour KNNImputer fitted on those
         same four columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'Number of Dependents' and the four ``out_var3`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed copy, with all of the input columns.
    """
    X = X.copy()
    X['Number of Dependents'] = X['Number of Dependents'].clip(upper=3)
    out_var3 = ['Number of Referrals', 'Total Long Distance Charges', 'Avg Monthly GB Download', 'Total Revenue']
    for column in out_var3:
        # Upper IQR bound of this column, truncated to an int
        upper_bound = int(find_outliers_IQR(X[column])[2])
        # Values above the bound become NaN. `mask` avoids the `np.NaN` alias,
        # which NumPy 2.0 removed.
        X[column] = X[column].mask(X[column] > upper_bound)
    imputer = KNNImputer(n_neighbors=7)
    X[out_var3] = imputer.fit_transform(X[out_var3])
    return X

# Split dataset: features in numeric / nominal / ordinal order, target is the churn flag
y = df_transform['Churn Value']
X = df_transform[numeric_var + nominal_var + ordinal_var]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Ensure data is in DataFrame format
X_train, X_test = (
    pd.DataFrame(part, columns=numeric_var + nominal_var + ordinal_var)
    for part in (X_train, X_test)
)

# Preprocessing pipelines
# Numeric columns: mean-impute, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Nominal columns: mode-impute, then one-hot encode with the first level dropped.
nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Ordinal columns: encode each level by its position in `ord_categories`.
ordinal_pipeline = Pipeline([
    ('encoder', OrdinalEncoder(categories=ord_categories)),
])

# Column transformer for preprocessing
prep_var = ColumnTransformer(
    [
        ('num', num_pipeline, numeric_var),
        ('nominal', nominal_pipeline, nominal_var),
        ('ordinal', ordinal_pipeline, ordinal_var),
    ],
    remainder='passthrough',
)

# Pipeline for handling outliers
hand_outliers = Pipeline([
    ('handling_outlier', FunctionTransformer(func_handling_outlier)),
])

# Combined preprocessor: outlier handling first, then per-type encoding/scaling
combined_preprocessor = Pipeline([
    ('hand_outliers', hand_outliers),
    ('prep_var', prep_var),
])

# Fit once on the training split so the fitted one-hot encoder can report its columns
combined_preprocessor.fit(X_train)

# Output column names in ColumnTransformer order: numeric, one-hot nominal, ordinal
nominal_feature_names = (
    combined_preprocessor
    .named_steps['prep_var']
    .named_transformers_['nominal']
    .named_steps['encoder']
    .get_feature_names_out(nominal_var)
)
feature_names = [*numeric_var, *nominal_feature_names, *ordinal_var]

# Feature selection models
feature_selection_models = {
    'Random Forest': RandomForestClassifier(
        random_state=42,
        n_estimators=200,      # number of trees in the forest
        max_depth=10,          # trees are capped at depth 10
        min_samples_split=10,  # a node needs at least 10 samples to be split
    ),
    'XGBoost': XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        n_estimators=200,    # number of boosting rounds
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,       # row fraction sampled for each tree
    ),
}




# 10-fold cross-validation of every selector + model pipeline on the training split.
# `cv_df_fs` collects one two-column frame per model: metric name and its mean over folds.
cv_df_fs = []
for name, model in feature_selection_models.items():
    feature_selector = SelectFromModel(model, threshold='mean')

    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', feature_selector),
        ('model', model)
    ])

    # Perform cross-validation
    score = cross_validate(model_pipe, X_train, y_train, cv=10, return_train_score=False, scoring=['roc_auc', 'f1', 'recall', 'precision'])
    cv_df_fs.append(pd.DataFrame({
        'Attribute': list(score),
        name: [round(np.mean(fold_values), 3) for fold_values in score.values()],
    }))

# Join the per-model frames side by side on the metric name.
score_fs = pd.DataFrame(columns=['Attribute'])
for model_scores in cv_df_fs:
    score_fs = score_fs.merge(right=model_scores, on='Attribute', how='outer')

# Keep only the scoring rows. Selecting them by name (instead of slicing off the
# first two rows) does not depend on where the fit_time / score_time rows land.
score_fs = score_fs[score_fs['Attribute'].str.startswith('test_')].reset_index(drop=True)

print("Feature Selection Model Cross-Validation Results:")
print(score_fs)



# Fitted selector + model pipelines, keyed by model name.
model_pipes = {}
# Fitting the feature selection models and evaluating on the test set
for name, model in feature_selection_models.items():
    # The selector is built from the same estimator that is used as the final model.
    feature_selector = SelectFromModel(model, threshold='mean')
    
    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', feature_selector),
        ('model', model)
    ])
    
    model_pipe.fit(X_train, y_train)
    model_pipes[name] = model_pipe
    
    # Extract selected feature names
    # NOTE(review): assumes `feature_names` is in the same order as the preprocessor's
    # output columns (numeric, one-hot nominal, ordinal) — confirm if columns change.
    selected_mask = model_pipe.named_steps['feature_selection'].get_support()
    selected_features = [feature for feature, selected in zip(feature_names, selected_mask) if selected]

    print(f"Selected Features for {name}: {selected_features}")

    # Transform the training and testing sets using only the fitted preprocessor step
    X_train_transformed = model_pipe.named_steps['preprocessor'].transform(X_train)
    X_test_transformed = model_pipe.named_steps['preprocessor'].transform(X_test)

    # Transform using the feature selector
    X_train_selected = model_pipe.named_steps['feature_selection'].transform(X_train_transformed)
    X_test_selected = model_pipe.named_steps['feature_selection'].transform(X_test_transformed)

    # Create a new model pipeline with only the selected features
    # `model` is the same object as the final step of `model_pipe`, so the fit below
    # refits that step in place on the already-selected columns.
    final_model_pipe = Pipeline(steps=[
        ('model', model)
    ])

    # Fit the final model on the selected features
    final_model_pipe.fit(X_train_selected, y_train)

    # Evaluate the final model on the test set
    y_pred = final_model_pipe.predict(X_test_selected)
    print(f"Classification report for {name}:")
    print(classification_report(y_test, y_pred))

    # Plot feature importances of the selected features
    # Importances come from the selector's own fitted estimator (`estimator_`).
    importances = model_pipe.named_steps['feature_selection'].estimator_.feature_importances_
    selected_importances = [importance for importance, selected in zip(importances, selected_mask) if selected]
    importance_df = pd.DataFrame({'Feature': selected_features, 'Importance': selected_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title(f'Feature Importance for {name}')
    plt.show()

Learning curves

In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
 
# Define a function to plot the learning curve with a specific scoring metric
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), scoring=None):
    """Draw training and cross-validation scores against training-set size.

    Opens a new pyplot figure, runs ``learning_curve`` on ``estimator`` with the
    given ``cv``, ``n_jobs``, ``train_sizes`` and ``scoring``, and plots the mean
    score per size with a +/- one standard deviation band (red for training,
    green for cross-validation). ``ylim``, when given, is unpacked into
    ``plt.ylim``. Returns the ``matplotlib.pyplot`` module; the caller is
    expected to call ``plt.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel(scoring or "Score")  # fall back to a generic label when no metric is named

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring
    )

    # (mean per size, std per size, colour, legend label) for each curve
    summaries = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()
    # Bands first, then lines, so the artists are created in the same order as before.
    for mean, std, colour, _ in summaries:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in summaries:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)
    plt.legend(loc="best")
    return plt

# Plot learning curves for each model with scoring='recall'
for name, model in feature_selection_models.items():
    # Build the selector from the model being plotted. The module-level
    # `feature_selector` left over from the previous cell belongs to whichever
    # model was processed last, so reusing it would pair every model with that
    # one selector.
    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', SelectFromModel(model, threshold='mean')),
        ('model', model)
    ])
    plot_learning_curve(model_pipe, f'Learning Curve for {name}', X_train, y_train, cv=5, scoring='recall')
    plt.show()


Model metrics


In [None]:
# Reshape the CV table to long form: one row per (metric, model) pair
cv_df_long = score_fs.melt(id_vars=["Attribute"], var_name="Model", value_name="Score")

# Grouped bars: metrics along the x-axis, one bar per model
plt.figure(figsize=(12, 6))
sns.barplot(data=cv_df_long, x="Attribute", y="Score", hue="Model")
plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Model Performance Metrics Comparison')
plt.legend(loc='upper right')
plt.show()

ROC AUC curves

In [None]:
# Plot ROC curves (test split) for every model in `feature_selection_models`
plt.figure(figsize=(10, 8))
for name, model in feature_selection_models.items():
    # Each model gets a selector built from itself, matching the cross-validation
    # cell, instead of the stale module-level `feature_selector`.
    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', SelectFromModel(model, threshold='mean')),
        ('model', model)])
    model_pipe.fit(X_train, y_train)

    # Probability of the positive class for the held-out rows
    y_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line (random guessing)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

#### Logistic regression

In [None]:
from sklearn.feature_selection import RFE

In [None]:
# Flatten the `ord_` spec into two parallel lists: every ordinal column name,
# paired position-by-position with the ordered levels for that column.
ordinal_var, ord_categories = [], []
for key in ord_:
    ordinal_var.extend(ord_[key][0])
    ord_categories.extend(ord_[key][1] for _ in ord_[key][0])

# Define function to find outliers
def find_outliers_IQR(series):
    """Locate values outside the 1.5 * IQR fences of ``series``.

    Returns a 3-tuple ``(outliers, lower_bound, upper_bound)``; ``outliers`` is
    the subset of ``series`` strictly below the lower fence or strictly above
    the upper fence.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    spread = third_quartile - first_quartile
    high_fence = third_quartile + 1.5 * spread
    low_fence = first_quartile - 1.5 * spread
    outside = (series < low_fence) | (series > high_fence)
    return series[outside], low_fence, high_fence

# Function for handling outliers
def func_handling_outlier(X):
    """Cap 'Number of Dependents' at 3 and KNN-impute upper-tail outliers.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'Number of Dependents' and the four columns in ``out_var3``.
        The input is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` in which values above the (integer-truncated) upper IQR
        fence of each ``out_var3`` column were blanked and refilled by a
        7-neighbour ``KNNImputer`` fitted on those four columns.

    Notes
    -----
    The fences and the imputer are recomputed from whichever frame is passed in.
    When this runs through ``FunctionTransformer``, a validation or test frame is
    therefore thresholded and imputed from its own values, not from the
    training data.
    """
    X = X.copy()
    X['Number of Dependents'] = X['Number of Dependents'].clip(upper=3)
    out_var3 = ['Number of Referrals', 'Total Long Distance Charges', 'Avg Monthly GB Download', 'Total Revenue']
    # Cast up front: integer columns cannot hold NaN, and the imputer output
    # assigned below is float anyway.
    X[out_var3] = X[out_var3].astype(float)
    for column in out_var3:
        upper_bound = int(find_outliers_IQR(X[column])[2])
        # `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
        X.loc[X[column] > upper_bound, column] = np.nan
    imputer = KNNImputer(n_neighbors=7)
    X[out_var3] = imputer.fit_transform(X[out_var3])
    return X

# Split dataset: features in numeric / nominal / ordinal order, target is the churn flag
y = df_transform['Churn Value']
X = df_transform[numeric_var + nominal_var + ordinal_var]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Ensure data is in DataFrame format
X_train, X_test = (
    pd.DataFrame(part, columns=numeric_var + nominal_var + ordinal_var)
    for part in (X_train, X_test)
)

# Preprocessing pipelines
# Numeric columns: mean-impute, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Nominal columns: mode-impute, then one-hot encode with the first level dropped.
nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Ordinal columns: mode-impute, then encode each level by its position in `ord_categories`.
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ord_categories)),
])

# Column transformer for preprocessing
prep_var = ColumnTransformer(
    [
        ('num', num_pipeline, numeric_var),
        ('nominal', nominal_pipeline, nominal_var),
        ('ordinal', ordinal_pipeline, ordinal_var),
    ],
    remainder='passthrough',
)

# Pipeline for handling outliers
hand_outliers = Pipeline([
    ('handling_outlier', FunctionTransformer(func_handling_outlier)),
])

# Combined preprocessor: outlier handling first, then per-type encoding/scaling
combined_preprocessor = Pipeline([
    ('hand_outliers', hand_outliers),
    ('prep_var', prep_var),
])

# Fit once on the training split so the fitted one-hot encoder can report its columns
combined_preprocessor.fit(X_train)

# Output column names in ColumnTransformer order: numeric, one-hot nominal, ordinal
nominal_feature_names = (
    combined_preprocessor
    .named_steps['prep_var']
    .named_transformers_['nominal']
    .named_steps['encoder']
    .get_feature_names_out(nominal_var)
)
feature_names = [*numeric_var, *nominal_feature_names, *ordinal_var]

# Feature selection with Logistic Regression using RFE
model = LogisticRegression(max_iter=1000, random_state=42)
feature_selector = RFE(model, n_features_to_select=10)  # Select top 10 features

model_pipe = Pipeline(steps=[
    ('preprocessor', combined_preprocessor),
    ('feature_selection', feature_selector),
    ('model', model)
])

# Perform cross-validation (10 folds on the training split)
score = cross_validate(model_pipe, X_train, y_train, cv=10, return_train_score=False, scoring=['roc_auc', 'f1', 'recall', 'precision'])

# Convert cross-validation results to DataFrame: one row per entry of `score`,
# holding its mean over the folds (timing entries included).
cv_df_fs_LR = pd.DataFrame({
    'Attribute': list(score),
    'Logistic Regression': [round(np.mean(fold_values), 3) for fold_values in score.values()],
})

print("Feature Selection Model Cross-Validation Results:")
print(cv_df_fs_LR)

# Scoring rows only. Selecting them by name (instead of slicing off the first
# two rows) does not depend on where the fit_time / score_time rows land.
score_fs_LR = cv_df_fs_LR[cv_df_fs_LR['Attribute'].str.startswith('test_')].reset_index(drop=True)

# Fitting the feature selection model and evaluating on the test set
model_pipe.fit(X_train, y_train)

# Extract selected feature names
# NOTE(review): assumes `feature_names` is in the same order as the preprocessor's
# output columns (numeric, one-hot nominal, ordinal) — confirm if columns change.
selected_mask = model_pipe.named_steps['feature_selection'].get_support()
selected_features = [feature for feature, selected in zip(feature_names, selected_mask) if selected]

print(f"Selected Features for Logistic Regression: {selected_features}")

# Transform the training and testing sets using only the fitted preprocessor step
X_train_transformed = model_pipe.named_steps['preprocessor'].transform(X_train)
X_test_transformed = model_pipe.named_steps['preprocessor'].transform(X_test)

# Transform using the feature selector
X_train_selected = model_pipe.named_steps['feature_selection'].transform(X_train_transformed)
X_test_selected = model_pipe.named_steps['feature_selection'].transform(X_test_transformed)

# Create a new model pipeline with only the selected features
# `model` is the same object as the final step of `model_pipe`, so the fit below
# refits that step in place on the already-selected columns.
final_model_pipe = Pipeline(steps=[
    ('model', model)
])

# Fit the final model on the selected features
final_model_pipe.fit(X_train_selected, y_train)

# Evaluate the final model on the test set
y_pred = final_model_pipe.predict(X_test_selected)
print(f"Classification report for Logistic Regression:")
print(classification_report(y_test, y_pred))

Learning curves

In [None]:
# Step 4: Plot learning curves for different metrics
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# One figure per metric, all for the logistic-regression pipeline
for metric in metrics:
    plot_learning_curve(
        model_pipe,
        f'Learning Curve for Logistic Regression ({metric})',
        X_train,
        y_train,
        cv=5,
        scoring=metric,
    )
    plt.show()

Feature importance

In [None]:
# Visualize feature importance: coefficients of the refitted model, one per selected feature
coefficients = final_model_pipe.named_steps['model'].coef_[0]
importance_df = pd.DataFrame(
    {'Feature': selected_features, 'Coefficient': coefficients}
).sort_values(by='Coefficient', ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(data=importance_df, x='Coefficient', y='Feature')
plt.title('Feature Importance for Logistic Regression')
plt.show()


Model metrics

In [None]:
# Long format: one row per (metric, model) pair
cv_df_long_LR = cv_df_fs_LR.melt(id_vars=["Attribute"], var_name="Model", value_name="Score")

# Metrics to show, in display order
desired_order = ['test_f1', 'test_precision', 'test_recall', 'test_roc_auc']

# Ordered categorical so the bars follow `desired_order`; any attribute not
# listed there becomes missing.
cv_df_long_LR['Attribute'] = pd.Categorical(
    cv_df_long_LR['Attribute'],
    categories=desired_order,
    ordered=True,
)

# Plot performance metrics comparison
plt.figure(figsize=(12, 6))
sns.barplot(data=cv_df_long_LR, x="Attribute", y="Score", hue="Model")
plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Logistic Regression Performance Metrics Comparison')
plt.legend(loc='upper right')
plt.show()

Combined Metrics


In [None]:
# Long-format CV results, de-duplicated on (metric, model):
# logistic regression first, then the other models.
cv_df_long_LR = (
    cv_df_fs_LR
    .melt(id_vars=["Attribute"], var_name="Model", value_name="Score")
    .drop_duplicates(subset=['Attribute', 'Model'])
)
cv_df_long_other = (
    score_fs
    .melt(id_vars=["Attribute"], var_name="Model", value_name="Score")
    .drop_duplicates(subset=['Attribute', 'Model'])
)

# Stack the two tables under a fresh 0..n-1 index
combined_df_long = pd.concat([cv_df_long_LR, cv_df_long_other], axis=0, ignore_index=True)

# Metrics to show, in display order
desired_order = ['test_f1', 'test_precision', 'test_recall', 'test_roc_auc']

# Ordered categorical so the bars follow `desired_order`; any attribute not
# listed there becomes missing.
combined_df_long['Attribute'] = pd.Categorical(
    combined_df_long['Attribute'],
    categories=desired_order,
    ordered=True,
)

# Plot performance metrics comparison for all models
plt.figure(figsize=(12, 6))
sns.barplot(data=combined_df_long, x="Attribute", y="Score", hue="Model")
plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Model Performance Metrics Comparison')
plt.legend(loc='upper right')
plt.show()

ROC AUC curve

In [None]:
# ROC curve for Logistic Regression on the test split

# Pipeline: preprocessing -> current `feature_selector` -> current `model`
model_pipe = Pipeline(steps=[
    ('preprocessor', combined_preprocessor),
    ('feature_selection', feature_selector),
    ('model', model),
])
model_pipe.fit(X_train, y_train)

# Positive-class probabilities for the test rows, then ROC points and area
y_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, lw=2, label=f'Logistic Regression (AUC = {roc_auc:.2f})')
# Diagonal reference line (random guessing)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Axes, labels and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")

# Display the plot
plt.show()

Combined ROC AUC curves

In [None]:
# Initialize the figure
plt.figure(figsize=(10, 8))

# Plot ROC curves for other models in feature_selection_models
for name, model in feature_selection_models.items():
    # Each tree model is paired with a selector built from itself. The
    # module-level `feature_selector` at this point is the RFE selector from the
    # logistic-regression cell, so reusing it would select the tree models'
    # features with a logistic regression.
    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', SelectFromModel(model, threshold='mean')),
        ('model', model)])
    model_pipe.fit(X_train, y_train)

    # Positive-class probabilities for the test rows
    y_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Plot ROC curve for Logistic Regression
# The RFE selector is built explicitly here so the pipeline does not depend on
# whatever `feature_selector` currently refers to.
model_pipe_lr = Pipeline(steps=[
    ('preprocessor', combined_preprocessor),
    ('feature_selection', RFE(LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=10)),
    ('model', LogisticRegression(max_iter=1000, random_state=42))
])

# Fit the pipeline on the training data
model_pipe_lr.fit(X_train, y_train)

# Predict probabilities for the test set
y_pred_proba_lr = model_pipe_lr.predict_proba(X_test)[:, 1]

# Compute ROC curve and AUC for Logistic Regression
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)

# Plot the ROC curve for Logistic Regression
plt.plot(fpr_lr, tpr_lr, lw=2, label=f'Logistic Regression (AUC = {roc_auc_lr:.2f})', color='red')  # Use a distinct color

# Plot the diagonal line representing random guessing
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Set plot limits and labels
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")

# Display the plot
plt.show()

### Feature Selection with Satisfaction Score

Model Fit, Feature Importance and Cross Validation

In [None]:
# Ordinal variable groups: each entry is [columns, ordered levels shared by those columns].
# Excluded variables: 'Churn Value', 'Customer ID'
ord_ = {
    # Binary No/Yes columns
    'ord1': [
        [
            'Married', 'Dependents', 'Referred a Friend', 'Phone Service',
            'Premium Tech Support', 'Streaming Music', 'Unlimited Data', 'Under 30',
            'Online Security', 'Online Backup', 'Streaming TV', 'Multiple Lines',
            'Internet Service', 'Device Protection Plan', 'Streaming Movies',
            'Paperless Billing', 'Senior Citizen', 'Total Extra Data Charges',
            'Total Refunds',
        ],
        ['No', 'Yes'],
    ],
    # Levels must be listed in order of increasing connection speed
    'ord2': [['Internet Type'], ['None', 'DSL', 'Cable', 'Fiber Optic']],
    'ord3': [['Contract'], ['Month-to-Month', 'One Year', 'Two Year']],
    'ord4': [['Satisfaction Score'], ['1', '2', '3', '4', '5']],
}

# Unordered categorical columns
nominal_var = [
    'Offer',
    'Gender',
    'Payment Method',
]

# Numeric columns
numeric_var = [
    'Number of Referrals',
    'Tenure in Months',
    'Avg Monthly Long Distance Charges',
    'Avg Monthly GB Download',
    'Monthly Charge',
    'Total Charges',
    'Total Long Distance Charges',
    'Total Revenue',
    'Age',
    'Number of Dependents',
]

In [None]:
# Ensure 'Satisfaction Score' is also treated as a categorical variable
# NOTE(review): this rewrites the column on the shared `df_transform` frame. The 'ord4'
# entry of `ord_` lists its levels as the strings '1'..'5' — confirm the stored values
# are compatible with those levels before they reach OrdinalEncoder.
df_transform['Satisfaction Score'] = df_transform['Satisfaction Score'].astype('category')

In [None]:
# Flatten the `ord_` spec into two parallel lists: every ordinal column name,
# paired position-by-position with the ordered levels for that column.
ordinal_var, ord_categories = [], []
for key in ord_:
    ordinal_var.extend(ord_[key][0])
    ord_categories.extend(ord_[key][1] for _ in ord_[key][0])


# Define function to find outliers
def find_outliers_IQR(series):
    """Locate values outside the 1.5 * IQR fences of ``series``.

    Returns a 3-tuple ``(outliers, lower_bound, upper_bound)``; ``outliers`` is
    the subset of ``series`` strictly below the lower fence or strictly above
    the upper fence.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    spread = third_quartile - first_quartile
    high_fence = third_quartile + 1.5 * spread
    low_fence = first_quartile - 1.5 * spread
    outside = (series < low_fence) | (series > high_fence)
    return series[outside], low_fence, high_fence

# Function for handling outliers
def func_handling_outlier(X):
    """Cap 'Number of Dependents' at 3 and KNN-impute upper-tail outliers.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'Number of Dependents' and the four columns in ``out_var3``.
        The input is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` in which values above the (integer-truncated) upper IQR
        fence of each ``out_var3`` column were blanked and refilled by a
        7-neighbour ``KNNImputer`` fitted on those four columns.

    Notes
    -----
    The fences and the imputer are recomputed from whichever frame is passed in.
    When this runs through ``FunctionTransformer``, a validation or test frame is
    therefore thresholded and imputed from its own values, not from the
    training data.
    """
    X = X.copy()
    X['Number of Dependents'] = X['Number of Dependents'].clip(upper=3)
    out_var3 = ['Number of Referrals', 'Total Long Distance Charges', 'Avg Monthly GB Download', 'Total Revenue']
    # Cast up front: integer columns cannot hold NaN, and the imputer output
    # assigned below is float anyway.
    X[out_var3] = X[out_var3].astype(float)
    for column in out_var3:
        upper_bound = int(find_outliers_IQR(X[column])[2])
        # `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
        X.loc[X[column] > upper_bound, column] = np.nan
    imputer = KNNImputer(n_neighbors=7)
    X[out_var3] = imputer.fit_transform(X[out_var3])
    return X

# Split dataset: features in numeric / nominal / ordinal order, target is the churn flag
y = df_transform['Churn Value']
X = df_transform[numeric_var + nominal_var + ordinal_var]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Ensure data is in DataFrame format
X_train, X_test = (
    pd.DataFrame(part, columns=numeric_var + nominal_var + ordinal_var)
    for part in (X_train, X_test)
)

# Preprocessing pipelines
# Numeric columns: mean-impute, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Nominal columns: mode-impute, then one-hot encode with the first level dropped.
nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Ordinal columns: encode each level by its position in `ord_categories`.
ordinal_pipeline = Pipeline([
    ('encoder', OrdinalEncoder(categories=ord_categories)),
])

# Column transformer for preprocessing
prep_var = ColumnTransformer(
    [
        ('num', num_pipeline, numeric_var),
        ('nominal', nominal_pipeline, nominal_var),
        ('ordinal', ordinal_pipeline, ordinal_var),
    ],
    remainder='passthrough',
)

# Pipeline for handling outliers
hand_outliers = Pipeline([
    ('handling_outlier', FunctionTransformer(func_handling_outlier)),
])

# Combined preprocessor: outlier handling first, then per-type encoding/scaling
combined_preprocessor = Pipeline([
    ('hand_outliers', hand_outliers),
    ('prep_var', prep_var),
])

# Fit once on the training split so the fitted one-hot encoder can report its columns
combined_preprocessor.fit(X_train)

# Output column names in ColumnTransformer order: numeric, one-hot nominal, ordinal
nominal_feature_names = (
    combined_preprocessor
    .named_steps['prep_var']
    .named_transformers_['nominal']
    .named_steps['encoder']
    .get_feature_names_out(nominal_var)
)
feature_names = [*numeric_var, *nominal_feature_names, *ordinal_var]

# Feature selection models
feature_selection_models = {
    'Random Forest': RandomForestClassifier(
        random_state=42,
        n_estimators=200,      # number of trees in the forest
        max_depth=10,          # trees are capped at depth 10
        min_samples_split=10,  # a node needs at least 10 samples to be split
    ),
    'XGBoost': XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        n_estimators=200,    # number of boosting rounds
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,       # row fraction sampled for each tree
    ),
}




# 10-fold cross-validation of every selector + model pipeline on the training split.
# `cv_df_fs` collects one two-column frame per model: metric name and its mean over folds.
cv_df_fs = []
for name, model in feature_selection_models.items():
    feature_selector = SelectFromModel(model, threshold='mean')

    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', feature_selector),
        ('model', model)
    ])

    # Perform cross-validation
    score = cross_validate(model_pipe, X_train, y_train, cv=10, return_train_score=False, scoring=['roc_auc', 'f1', 'recall', 'precision'])
    cv_df_fs.append(pd.DataFrame({
        'Attribute': list(score),
        name: [round(np.mean(fold_values), 3) for fold_values in score.values()],
    }))

# Join the per-model frames side by side on the metric name.
score_fs = pd.DataFrame(columns=['Attribute'])
for model_scores in cv_df_fs:
    score_fs = score_fs.merge(right=model_scores, on='Attribute', how='outer')

# Keep only the scoring rows. Selecting them by name (instead of slicing off the
# first two rows) does not depend on where the fit_time / score_time rows land.
score_fs = score_fs[score_fs['Attribute'].str.startswith('test_')].reset_index(drop=True)

print("Feature Selection Model Cross-Validation Results:")
print(score_fs)





# Fitted selector + model pipelines, keyed by model name.
model_pipes = {}
# Fitting the feature selection models and evaluating on the test set
for name, model in feature_selection_models.items():
    # The selector is built from the same estimator that is used as the final model.
    feature_selector = SelectFromModel(model, threshold='mean')
    
    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', feature_selector),
        ('model', model)
    ])
    
    model_pipe.fit(X_train, y_train)
    model_pipes[name] = model_pipe
    
    # Extract selected feature names
    # NOTE(review): assumes `feature_names` is in the same order as the preprocessor's
    # output columns (numeric, one-hot nominal, ordinal) — confirm if columns change.
    selected_mask = model_pipe.named_steps['feature_selection'].get_support()
    selected_features = [feature for feature, selected in zip(feature_names, selected_mask) if selected]

    print(f"Selected Features for {name}: {selected_features}")

    # Transform the training and testing sets using only the fitted preprocessor step
    X_train_transformed = model_pipe.named_steps['preprocessor'].transform(X_train)
    X_test_transformed = model_pipe.named_steps['preprocessor'].transform(X_test)

    # Transform using the feature selector
    X_train_selected = model_pipe.named_steps['feature_selection'].transform(X_train_transformed)
    X_test_selected = model_pipe.named_steps['feature_selection'].transform(X_test_transformed)

    # Create a new model pipeline with only the selected features
    # `model` is the same object as the final step of `model_pipe`, so the fit below
    # refits that step in place on the already-selected columns.
    final_model_pipe = Pipeline(steps=[
        ('model', model)
    ])

    # Fit the final model on the selected features
    final_model_pipe.fit(X_train_selected, y_train)

    # Evaluate the final model on the test set
    y_pred = final_model_pipe.predict(X_test_selected)
    print(f"Classification report for {name}:")
    print(classification_report(y_test, y_pred))

    # Plot feature importances of the selected features
    # Importances come from the selector's own fitted estimator (`estimator_`).
    importances = model_pipe.named_steps['feature_selection'].estimator_.feature_importances_
    selected_importances = [importance for importance, selected in zip(importances, selected_mask) if selected]
    importance_df = pd.DataFrame({'Feature': selected_features, 'Importance': selected_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title(f'Feature Importance for {name}')
    plt.show()

Model metrics

In [None]:
# Reshape the CV table to long form: one row per (metric, model) pair
cv_df_long = score_fs.melt(id_vars=["Attribute"], var_name="Model", value_name="Score")

# Grouped bars: metrics along the x-axis, one bar per model
plt.figure(figsize=(12, 6))
sns.barplot(data=cv_df_long, x="Attribute", y="Score", hue="Model")
plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Model Performance Metrics Comparison')
plt.legend(loc='upper right')
plt.show()

Learning Curves

In [None]:
# Define a function to plot the learning curve with a specific scoring metric
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), scoring=None):
    """Draw training and cross-validation scores against training-set size.

    Opens a new pyplot figure, runs ``learning_curve`` on ``estimator`` with the
    given ``cv``, ``n_jobs``, ``train_sizes`` and ``scoring``, and plots the mean
    score per size with a +/- one standard deviation band (red for training,
    green for cross-validation). ``ylim``, when given, is unpacked into
    ``plt.ylim``. Returns the ``matplotlib.pyplot`` module; the caller is
    expected to call ``plt.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel(scoring or "Score")  # fall back to a generic label when no metric is named

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring
    )

    # (mean per size, std per size, colour, legend label) for each curve
    summaries = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()
    # Bands first, then lines, so the artists are created in the same order as before.
    for mean, std, colour, _ in summaries:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in summaries:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)
    plt.legend(loc="best")
    return plt

# Plot learning curves for each model with scoring='recall'
for name, model in feature_selection_models.items():
    # Build the selector from the model being plotted. The module-level
    # `feature_selector` left over from an earlier cell belongs to whichever
    # model was processed last, so reusing it would pair every model with that
    # one selector.
    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', SelectFromModel(model, threshold='mean')),
        ('model', model)
    ])
    plot_learning_curve(model_pipe, f'Learning Curve for {name}', X_train, y_train, cv=5, scoring='recall')
    plt.show()


Logistic Regression

In [None]:
# Flatten the `ord_` spec into two parallel lists: every ordinal column name,
# paired position-by-position with the ordered levels for that column.
ordinal_var, ord_categories = [], []
for key in ord_:
    ordinal_var.extend(ord_[key][0])
    ord_categories.extend(ord_[key][1] for _ in ord_[key][0])

# Define function to find outliers
def find_outliers_IQR(series):
    """Locate values outside the 1.5 * IQR fences of ``series``.

    Returns a 3-tuple ``(outliers, lower_bound, upper_bound)``; ``outliers`` is
    the subset of ``series`` strictly below the lower fence or strictly above
    the upper fence.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    spread = third_quartile - first_quartile
    high_fence = third_quartile + 1.5 * spread
    low_fence = first_quartile - 1.5 * spread
    outside = (series < low_fence) | (series > high_fence)
    return series[outside], low_fence, high_fence

# Function for handling outliers
def func_handling_outlier(X):
    """Cap dependents at 3 and replace high-end outliers with KNN-imputed values.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'Number of Dependents', 'Number of Referrals',
        'Total Long Distance Charges', 'Avg Monthly GB Download' and
        'Total Revenue'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` in which 'Number of Dependents' is capped at 3 and,
        for the four outlier columns, every value above the column's
        (integer-truncated) upper IQR fence has been blanked out and then
        filled by a 7-neighbour ``KNNImputer`` fitted on those four columns.

    Notes
    -----
    The fences and the imputer are both computed from the frame passed in,
    so the result depends on which rows are supplied in each call.
    """
    X = X.copy()

    # Cap the dependents count; NaN entries are left untouched.
    X['Number of Dependents'] = X['Number of Dependents'].clip(upper=3)

    out_var3 = ['Number of Referrals', 'Total Long Distance Charges', 'Avg Monthly GB Download', 'Total Revenue']
    for col in out_var3:
        # The fence is truncated to an int (as before), so values between
        # int(fence) and the fence itself are also treated as outliers.
        upper_bound = int(find_outliers_IQR(X[col])[2])
        # Series.mask works row-by-row, so a non-unique index cannot blank
        # extra rows, and an integer column is upcast to float cleanly
        # instead of receiving NaN through an incompatible-dtype .loc write.
        X[col] = X[col].mask(X[col] > upper_bound)

    imputer = KNNImputer(n_neighbors=7)
    X[out_var3] = imputer.fit_transform(X[out_var3])
    return X

# Build the feature matrix and target, then hold out a stratified 15% test set.
X = df_transform[numeric_var + nominal_var + ordinal_var]
y = df_transform['Churn Value']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Re-wrap both partitions as DataFrames with the expected column order, so the
# name-based column selection in the preprocessing steps keeps working.
X_train, X_test = (
    pd.DataFrame(partition, columns=numeric_var + nominal_var + ordinal_var)
    for partition in (X_train, X_test)
)

# --- Per-type preprocessing -------------------------------------------------
# Numeric columns: mean-impute, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Nominal columns: fill with the most frequent level, then one-hot encode
# (first level dropped; unseen levels ignored at transform time).
nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Ordinal columns: encode with the explicit category order collected in
# `ord_categories` (one entry per ordinal column). No imputation step here.
ordinal_pipeline = Pipeline([
    ('encoder', OrdinalEncoder(categories=ord_categories)),
])

# Route each column group to its pipeline; any column not listed is passed
# through unchanged.
prep_var = ColumnTransformer(
    [
        ('num', num_pipeline, numeric_var),
        ('nominal', nominal_pipeline, nominal_var),
        ('ordinal', ordinal_pipeline, ordinal_var),
    ],
    remainder='passthrough',
)

# Outlier handling wrapped as a pipeline step so it runs before encoding/scaling.
hand_outliers = Pipeline([
    ('handling_outlier', FunctionTransformer(func_handling_outlier)),
])

# Full preprocessing chain: outlier handling first, then column-wise transforms.
# Step names are looked up by later code, so they must stay as they are.
combined_preprocessor = Pipeline([
    ('hand_outliers', hand_outliers),
    ('prep_var', prep_var),
])

# The encoder can only report its output column names once it has been fitted,
# so fit the whole preprocessing chain on the training data first.
combined_preprocessor.fit(X_train)

# One-hot output names come from the fitted encoder inside the 'nominal' branch.
nominal_feature_names = (
    combined_preprocessor
    .named_steps['prep_var']
    .named_transformers_['nominal']
    .named_steps['encoder']
    .get_feature_names_out(nominal_var)
)

# Output column order follows the ColumnTransformer: numeric, nominal, ordinal.
feature_names = [*numeric_var, *nominal_feature_names, *ordinal_var]

# Logistic Regression acts both as the RFE ranking estimator (keep 10 features)
# and as the final classifier of the pipeline.
model = LogisticRegression(max_iter=1000, random_state=42)
feature_selector = RFE(model, n_features_to_select=10)

model_pipe = Pipeline([
    ('preprocessor', combined_preprocessor),
    ('feature_selection', feature_selector),
    ('model', model),
])

cv_df_fs_LR = []

# 10-fold cross-validation on the training split, test-fold scores only.
score = cross_validate(
    model_pipe,
    X_train,
    y_train,
    cv=10,
    scoring=['roc_auc', 'f1', 'recall', 'precision'],
    return_train_score=False,
)

# One row per entry of the cross-validation result, holding its fold mean
# rounded to three decimals.
cv_df_fs_LR = pd.DataFrame({
    'Attribute': list(score),
    'Logistic Regression': [round(np.mean(fold_values), 3) for fold_values in score.values()],
})

print("Feature Selection Model Cross-Validation Results:")
print(cv_df_fs_LR)

# Drop the first two rows and renumber; the remainder is kept as score_fs_LR.
score_fs_LR = cv_df_fs_LR.iloc[2:].reset_index(drop=True)

# Fit the full pipeline (preprocessing -> RFE -> classifier) on the training split.
model_pipe.fit(X_train, y_train)

# Map the RFE support mask back onto the preprocessed column names.
selected_mask = model_pipe.named_steps['feature_selection'].get_support()
selected_features = [
    column
    for column, keep in zip(feature_names, selected_mask)
    if keep
]

print(f"Selected Features for Logistic Regression: {selected_features}")

# Push both splits through the fitted preprocessing step ...
X_train_transformed = model_pipe.named_steps['preprocessor'].transform(X_train)
X_test_transformed = model_pipe.named_steps['preprocessor'].transform(X_test)

# ... and then keep only the columns RFE selected.
X_train_selected = model_pipe.named_steps['feature_selection'].transform(X_train_transformed)
X_test_selected = model_pipe.named_steps['feature_selection'].transform(X_test_transformed)

# Stand-alone classifier pipeline that consumes the already-selected features.
final_model_pipe = Pipeline([('model', model)])
final_model_pipe.fit(X_train_selected, y_train)

# Hold-out evaluation of the refitted classifier.
y_pred = final_model_pipe.predict(X_test_selected)
print("Classification report for Logistic Regression:")
print(classification_report(y_test, y_pred))

Learning Curves

In [None]:

# One learning curve of the Logistic Regression pipeline per scoring metric.
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for metric in metrics:
    plot_learning_curve(
        model_pipe,
        f'Learning Curve for Logistic Regression ({metric})',
        X_train,
        y_train,
        cv=5,
        scoring=metric,
    )
    plt.show()


Model Metrics

In [None]:
# Reshape the cross-validation summary to long format (one row per metric/model).
cv_df_long_LR = pd.melt(cv_df_fs_LR, id_vars=["Attribute"], var_name="Model", value_name="Score")

# Metrics to show, in the order they should appear on the x-axis.
desired_order = ['test_f1', 'test_precision', 'test_recall', 'test_roc_auc']

# Keep only the rows being plotted. cv_df_fs_LR has a row for every entry of
# the cross-validation result, and anything not in `desired_order` would
# otherwise become a NaN category that the plot has to drop implicitly.
cv_df_long_LR = cv_df_long_LR[cv_df_long_LR['Attribute'].isin(desired_order)].copy()

# Ordered categorical so the bars follow `desired_order`.
cv_df_long_LR['Attribute'] = pd.Categorical(cv_df_long_LR['Attribute'], categories=desired_order, ordered=True)

# Plot performance metrics comparison
plt.figure(figsize=(12, 6))
sns.barplot(x="Attribute", y="Score", hue="Model", data=cv_df_long_LR)
plt.title('Logistic Regression Performance Metrics Comparison')
plt.ylabel('Score')
plt.xlabel('Metric')
plt.legend(loc='upper right')
plt.show()

Combined Model Metrics

In [None]:
# Long-format cross-validation results: Logistic Regression ...
cv_df_long_LR = pd.melt(cv_df_fs_LR, id_vars=["Attribute"], var_name="Model", value_name="Score")

# ... and the other models.
cv_df_long_other = pd.melt(score_fs, id_vars=["Attribute"], var_name="Model", value_name="Score")

# Guard against repeated (metric, model) pairs in either table.
cv_df_long_LR = cv_df_long_LR.drop_duplicates(subset=['Attribute', 'Model'])
cv_df_long_other = cv_df_long_other.drop_duplicates(subset=['Attribute', 'Model'])

# Stack both tables with a fresh 0..n-1 index.
combined_df_long = pd.concat([cv_df_long_LR, cv_df_long_other], axis=0, ignore_index=True)

# Metrics to show, in the order they should appear on the x-axis.
desired_order = ['test_f1', 'test_precision', 'test_recall', 'test_roc_auc']

# Keep only the rows being plotted. cv_df_fs_LR has a row for every entry of
# the cross-validation result, and anything not in `desired_order` would
# otherwise become a NaN category that the plot has to drop implicitly.
combined_df_long = combined_df_long[combined_df_long['Attribute'].isin(desired_order)].reset_index(drop=True)

# Ordered categorical so the bars follow `desired_order`.
combined_df_long['Attribute'] = pd.Categorical(combined_df_long['Attribute'], categories=desired_order, ordered=True)

# Plot performance metrics comparison for all models
plt.figure(figsize=(12, 6))
sns.barplot(x="Attribute", y="Score", hue="Model", data=combined_df_long)
plt.title('Model Performance Metrics Comparison')
plt.ylabel('Score')
plt.xlabel('Metric')
plt.legend(loc='upper right')
plt.show()

ROC AUC Curves

In [None]:
# Plot one test-set ROC curve per candidate model on a shared figure.
plt.figure(figsize=(10, 8))
# The loop variable is deliberately not called `model`: that name holds the
# notebook-level LogisticRegression estimator, and rebinding it here would
# silently swap the estimator for any cell that builds a pipeline from
# `model` afterwards.
for name, estimator in feature_selection_models.items():
    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', feature_selector),
        ('model', estimator)])
    model_pipe.fit(X_train, y_train)

    # Probability of the positive class drives the ROC curve.
    y_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line (random guessing).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Plot ROC curve for Logistic Regression
plt.figure(figsize=(10, 8))

# Define the Logistic Regression pipeline.
# The estimator is constructed explicitly instead of reusing the notebook-level
# `model` name: by the time this cell runs, `model` may have been rebound by
# another cell, which would plot a different classifier under the
# "Logistic Regression" label.
model_pipe = Pipeline(steps=[
    ('preprocessor', combined_preprocessor),
    ('feature_selection', feature_selector),
    ('model', LogisticRegression(max_iter=1000, random_state=42))
])

# Fit the pipeline on the training data
model_pipe.fit(X_train, y_train)

# Predict positive-class probabilities for the test set
y_pred_proba = model_pipe.predict_proba(X_test)[:, 1]

# Compute ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.plot(fpr, tpr, lw=2, label=f'Logistic Regression (AUC = {roc_auc:.2f})')

# Plot the diagonal line representing random guessing
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Set plot limits and labels
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")

# Display the plot
plt.show()

Combined ROC AUC curves

In [None]:
# Initialize the figure
plt.figure(figsize=(10, 8))

# Plot ROC curves for the models in feature_selection_models.
# The loop variable is deliberately not called `model`: that name holds the
# notebook-level LogisticRegression estimator, and rebinding it here would
# silently swap the estimator for any cell that builds a pipeline from
# `model` afterwards.
for name, estimator in feature_selection_models.items():
    model_pipe = Pipeline(steps=[
        ('preprocessor', combined_preprocessor),
        ('feature_selection', feature_selector),
        ('model', estimator)])
    model_pipe.fit(X_train, y_train)

    # Probability of the positive class drives the ROC curve.
    y_pred_proba = model_pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Plot ROC curve for Logistic Regression
# Define the Logistic Regression pipeline with its own estimator instance
model_pipe_lr = Pipeline(steps=[
    ('preprocessor', combined_preprocessor),
    ('feature_selection', feature_selector),
    ('model', LogisticRegression(max_iter=1000, random_state=42))
])

# Fit the pipeline on the training data
model_pipe_lr.fit(X_train, y_train)

# Predict positive-class probabilities for the test set
y_pred_proba_lr = model_pipe_lr.predict_proba(X_test)[:, 1]

# Compute ROC curve and AUC for Logistic Regression
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)

# Plot the ROC curve for Logistic Regression
plt.plot(fpr_lr, tpr_lr, lw=2, label=f'Logistic Regression (AUC = {roc_auc_lr:.2f})', color='red')  # Use a distinct color

# Plot the diagonal line representing random guessing
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Set plot limits and labels
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")

# Display the plot
plt.show()

Feature importance

In [None]:
# Bar chart of the fitted Logistic Regression coefficients, one bar per
# selected feature, sorted from largest to smallest coefficient.
coefficients = final_model_pipe.named_steps['model'].coef_[0]
importance_df = (
    pd.DataFrame({'Feature': selected_features, 'Coefficient': coefficients})
    .sort_values(by='Coefficient', ascending=False)
)

plt.figure(figsize=(10, 8))
sns.barplot(data=importance_df, x='Coefficient', y='Feature')
plt.title('Feature Importance for Logistic Regression')
plt.show()