In [1]:
# Import all necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report, silhouette_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [2]:
# Load the dataset
df = pd.read_csv("TelcoCustomerChurn.csv")

In [3]:
# Preprocessing

# Drop irrelevant columns
df.drop(columns=['customerID'], errors='ignore', inplace=True)

# Convert 'Churn' to binary values
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# One-hot encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# Handle missing values
df.fillna(df.median(), inplace=True)

In [4]:
# Splitting features and target variable
X = df.drop(columns=['Churn'])
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)  # Debugging dataset shape

X_train shape: (5634, 6559) y_train shape: (5634,)


In [6]:
# Supervised Learning Models
models = {
    "Baseline - Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "Support Vector Machine": SVC(),
    "Artificial Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)
}

In [7]:
# Training and Evaluation
print("\nTraining models...\n")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))


Training models...

Baseline - Logistic Regression Accuracy: 0.7842
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1036
           1       0.61      0.51      0.56       373

    accuracy                           0.78      1409
   macro avg       0.72      0.70      0.71      1409
weighted avg       0.77      0.78      0.78      1409

Decision Tree Accuracy: 0.7807
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1036
           1       0.60      0.50      0.54       373

    accuracy                           0.78      1409
   macro avg       0.72      0.69      0.70      1409
weighted avg       0.77      0.78      0.77      1409

Random Forest Accuracy: 0.7913
              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1036
           1       0.66      0.45      0.53       373

    accuracy                           0.79      14

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.7871
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1036
           1       0.62      0.52      0.56       373

    accuracy                           0.79      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409

Support Vector Machine Accuracy: 0.7353
              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1036
           1       0.50      0.76      0.60       373

    accuracy                           0.74      1409
   macro avg       0.70      0.74      0.70      1409
weighted avg       0.79      0.74      0.75      1409

Artificial Neural Network Accuracy: 0.7480
              precision    recall  f1-score   support

           0       0.75      0.98      0.85      1036
           1       0.63      0.12      0.20       373

    accuracy                           0.75      1409
   macro avg       0

In [8]:
# Hyperparameter Tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100],  # Reduced grid for quick testing
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

try:
    print("\nStarting Grid Search for Random Forest...\n")
    rf_grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
    rf_grid_search.fit(X_train, y_train)
    print("\nGrid Search Completed!")
    print("Best Random Forest Parameters:", rf_grid_search.best_params_)
except Exception as e:
    print("Error occurred during Grid Search:", e)

# Unsupervised Learning - Clustering with K-Means
try:
    print("\nApplying K-Means Clustering...\n")
    kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
    kmeans.fit(X_train)
    kmeans_labels = kmeans.predict(X_test)
    silhouette = silhouette_score(X_test, kmeans_labels)
    print(f"K-Means Silhouette Score: {silhouette:.4f}")
except Exception as e:
    print("Error occurred during K-Means:", e)

print("\nScript completed successfully!")


Starting Grid Search for Random Forest...

Fitting 3 folds for each of 8 candidates, totalling 24 fits

Grid Search Completed!
Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}

Applying K-Means Clustering...

K-Means Silhouette Score: 0.2379

Script completed successfully!
