In [None]:
1. Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE  # SMOTE for resampling
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV

In [None]:
2. Load the Dataset
The dataset is loaded from a CSV file. It contains one row per customer, with features such as demographics and service usage, and a label indicating
whether the customer churned.

In [None]:
3. Data Exploration
# Data Exploration: report the frame's dimensions, then its per-column summary.
# NOTE(review): `df` is expected to be created by the loading cell; that code is
# not visible in this part of the notebook — confirm it runs before this cell.
frame_shape = df.shape
print("Dataset Shape:", frame_shape)

print("Dataset Info:")
df.info()
Dataset Shape: (7043, 21)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7032 non-null   float64
 20  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB

In [None]:
4. Check for Missing Values
# Count the null entries in every column and print the per-column totals.
missing_per_column = df.isnull().sum()
print("Missing values:")
print(missing_per_column)

Missing values:
customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [None]:
5. Handle Missing Values
# Handle missing values in TotalCharges.
# Coerce the column to numeric (unparseable entries become NaN), then replace
# every NaN with the median of the parsed values.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(total_charges.median())

In [None]:
6. Convert Categorical Columns to Numeric
# Convert Yes/No categorical columns to 1/0 integers.
# Any value other than the string 'Yes' is encoded as 0.
binary_columns = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']
for col in binary_columns:
    # Skip columns that are already numeric. SeniorCitizen is stored as int64
    # in the raw data, so comparing it with 'Yes' would overwrite the whole
    # column with 0. The same guard makes this cell safe to re-run: columns
    # encoded on a previous run are left untouched instead of being zeroed.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = (df[col] == 'Yes').astype(int)

In [None]:
7. One-Hot Encoding for Non-Binary Categorical Variables
# One-hot encoding for non-binary categorical variables
# drop_first=True drops the first level of each variable, so k categories
# become k-1 indicator columns.
multi_level_columns = ['gender', 'InternetService', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=multi_level_columns, drop_first=True)
                   

In [None]:
8. Split Data into Features and Target
# Separate the target from the predictors.
# y is the Churn column; X is every remaining column except the customer
# identifier and the target itself.
non_feature_columns = ['customerID', 'Churn']
y = df['Churn']
X = df.drop(columns=non_feature_columns)

In [None]:
9. Train-Test Split
# Hold out a test set before any resampling, so that SMOTE is fitted on the
# training portion only and the test set keeps the original class balance.
# stratify=y keeps the churn rate the same in both partitions.
# NOTE(review): the split call was missing from this cell, leaving X_train,
# X_test, y_train and y_test undefined on a fresh kernel. test_size=0.2 and
# random_state=42 are assumed values — confirm against the intended setup.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Visualize the class distribution before SMOTE
plt.figure(figsize=(6, 4))
sns.countplot(x=y_train)
plt.title('Class Distribution Before SMOTE')
plt.show()

In [None]:
10. Handle Class Imbalance: Apply SMOTE
# Handle class imbalance by oversampling the training data with SMOTE.
# Only the training partition is resampled; the seed is fixed so the result
# is reproducible.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

In [None]:
11. Define Models
# Define the models
# XGBoost's scale_pos_weight is meant to be (negative count / positive count).
# The previous expression divided the total sample count by the negative
# count, which gives 2.0 on evenly balanced data and over-weights the
# positive (churn = 1) class.
# NOTE(review): the training labels here come from SMOTE, so the ratio is
# expected to be close to 1 — confirm that extra weighting is still wanted.
negative_count = int((y_train_resampled == 0).sum())
positive_count = int((y_train_resampled == 1).sum())

models = {
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(scale_pos_weight=negative_count / positive_count, random_state=42),
    'LightGBM': lgb.LGBMClassifier(class_weight='balanced', random_state=42),
}

In [None]:
16. Train the Best Models
# Pull the winning estimator out of each grid search.
# NOTE(review): the grid_search_* objects are created in cells that are not
# visible in this part of the notebook.
best_rf = grid_search_rf.best_estimator_
best_logreg = grid_search_logreg.best_estimator_
best_xgb = grid_search_xgb.best_estimator_
best_lgb = grid_search_lgb.best_estimator_

# Fit each selected estimator on the SMOTE-resampled training data, in the
# same order as before: RF, logistic regression, XGBoost, LightGBM.
for tuned_estimator in (best_rf, best_logreg, best_xgb, best_lgb):
    tuned_estimator.fit(X_train_resampled, y_train_resampled)

In [None]:
Visualization of confusion matrix
 print("Accuracy Score:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", cm)
for model_name, model in models_calibrated.items():
    print(f"\nEvaluating {model_name}...")
    y_pred = model.predict(X_test)
     # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['No Churn', 'Churn'], yticklabels=['No Churn', 'Churn'])
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()