In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the prepared Telco churn dataset; the path is relative to the
# directory this notebook is run from.
DATA_PATH = os.path.join("..","datasets","prepared_data","telco_data.csv")
telco = pd.read_csv(DATA_PATH)

# Display (n_rows, n_columns) as a quick sanity check that the whole file
# was read. The recorded cell output is a shape tuple, so the last
# expression is `.shape` rather than `.columns`.
telco.shape

(7043, 11)

In [3]:
# Separate the target column from the predictor columns
y = telco['churn']
X = telco.drop(columns='churn')

# Train / test split (no separate validation set is created here):
# 30% of the rows are held out for testing, and stratifying on y keeps the
# class proportions of `churn` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=12,
)

In [4]:

# Numeric and categorical feature lists
numeric_features = ['tenure', 'monthlycharges', 'totalcharges']
categorical_features = ['contract', 'internetservice', 'paymentmethod',
                        'onlinesecurity', 'techsupport', 'phoneservice', 'paperlessbilling']

# Numeric columns: replace NaNs with 0 first, then standardize
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Column-wise preprocessing: numeric pipeline + one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        # handle_unknown='ignore' encodes a category that was not seen during
        # fit as all zeros instead of raising when transforming new data.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    # Always return a dense array: the result is wrapped in pd.DataFrame
    # later in this notebook, which needs a dense 2-D array.
    sparse_threshold=0,
)

# Fit the transformations on the training data only, then apply the fitted
# transformer to the test data so no test statistics leak into training.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Ensure y_train is a 1D NumPy array. np.asarray(...) avoids Series.ravel()
# (deprecated in recent pandas) and also works if this cell is re-run after
# y_train has already been converted to an ndarray.
y_train = np.asarray(y_train).ravel()

In [14]:
# Class counts in the training labels before resampling
print(f"Before SMOTE: 0:{y_train[y_train==0].shape}, 1:{y_train[y_train==1].shape}")

# Resample the training data only with SMOTE; the test set is left untouched
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_transformed, y_train)

# Class counts after resampling
print(f"After SMOTE: 0:{y_train_resampled[y_train_resampled==0].shape}, 1:{y_train_resampled[y_train_resampled==1].shape}")

# Wrap every array in a DataFrame so it can be written out as CSV
X_train_resampled = pd.DataFrame(X_train_resampled)
X_test_transformed = pd.DataFrame(X_test_transformed)
y_train_resampled = pd.DataFrame(y_train_resampled)
y_test = pd.DataFrame(y_test)

# Persist the prepared train/test sets (row index is not written)
prepared_dir = os.path.join('..', 'datasets', 'prepared_data')
for filename, frame in [
    ('X_train_resampled.csv', X_train_resampled),
    ('X_test_transformed.csv', X_test_transformed),
    ('y_train_resampled.csv', y_train_resampled),
    ('y_test.csv', y_test),
]:
    frame.to_csv(os.path.join(prepared_dir, filename), index=False)

Before SMOTE:, 0:(3622,), 1:(1308,)
After SMOTE:, 0:(3622,), 1:(3622,)


In [6]:
# Models to check.
# Estimators whose fitting involves randomness get a fixed random_state so
# that re-running the notebook from a fresh kernel reproduces the same scores.
models = {
    'svc': SVC(),
    'logistic': LogisticRegression(),
    'random_forest': RandomForestClassifier(random_state=42),
    'knn': KNeighborsClassifier(),
    'xgb': XGBClassifier(random_state=42),
    'gnb': GaussianNB()
}


def evaluate_models(models:dict, X:pd.DataFrame, y:pd.Series, cv:int) -> pd.DataFrame:
    """Score several classifiers with cross-validated predictions.

    For every estimator in ``models``, out-of-fold predictions are obtained
    with ``cross_val_predict`` and compared against ``y`` to compute
    accuracy, precision, recall and F1 (each rounded to two decimals).

    Parameters
    ----------
    models : dict
        Mapping of model name -> unfitted estimator.
    X : pd.DataFrame
        Feature matrix.
    y : pd.Series
        Target labels. A single-column DataFrame or an array is accepted as
        well; it is flattened to one dimension before use.
    cv : int
        Passed through as the ``cv`` argument of ``cross_val_predict``.

    Returns
    -------
    pd.DataFrame
        One row per model, indexed by ``'Model'``, with the columns
        ``Accuracy``, ``Precision``, ``Recall`` and ``F1``, sorted by
        ``Recall`` in descending order.

    Notes
    -----
    NOTE(review): if ``X``/``y`` were oversampled (e.g. with SMOTE) before
    being passed in, the validation folds contain synthetic samples and the
    scores are likely optimistic -- consider resampling inside each fold.
    """
    # Estimators and metrics expect 1-D labels; flattening here avoids
    # column-vector conversions when y arrives as a single-column DataFrame.
    y_true = np.asarray(y).ravel()

    scores = []
    for name, model in models.items():
        y_pred = cross_val_predict(model, X, y_true, cv=cv)
        # The builtin round() works whether the metric returns a NumPy scalar
        # or a plain Python float (which has no .round() method).
        scores.append({
            'Model': name,
            'Accuracy': round(float(accuracy_score(y_true, y_pred)), 2),
            'Precision': round(float(precision_score(y_true, y_pred)), 2),
            'Recall': round(float(recall_score(y_true, y_pred)), 2),
            'F1': round(float(f1_score(y_true, y_pred)), 2),
        })

    # Passing `columns` keeps the expected layout even when `models` is empty
    scores_df = pd.DataFrame(scores, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1'])

    return scores_df.set_index('Model').sort_values(by='Recall', ascending=False)

In [7]:
evaluate_models(models, X_train_resampled, y_train_resampled, 10)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
knn,0.79,0.76,0.86,0.81
random_forest,0.83,0.82,0.85,0.83
xgb,0.82,0.8,0.85,0.83
gnb,0.76,0.72,0.84,0.77
logistic,0.77,0.75,0.82,0.78
svc,0.78,0.76,0.81,0.78
