In [118]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import pickle
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [90]:
# Load the dataset
data_path = "/content/telecom_churn.csv"
df = pd.read_csv(data_path)

In [91]:
# Convert 'date_of_registration' to datetime and create 'tenure_days'
df['date_of_registration'] = pd.to_datetime(df['date_of_registration'])
df['tenure_days'] = (pd.Timestamp.today() - df['date_of_registration']).dt.days

# Fix negative values in numerical columns
df[['calls_made', 'sms_sent', 'data_used']] = df[['calls_made', 'sms_sent', 'data_used']].clip(lower=0)

In [92]:
# Display basic dataset info
print("Dataset Shape:", df.shape)
print("First 5 rows:\n", df.head())

Dataset Shape: (243553, 15)
First 5 rows:
    customer_id telecom_partner gender  age              state     city  \
0            1    Reliance Jio      F   25          Karnataka  Kolkata   
1            2    Reliance Jio      F   55            Mizoram   Mumbai   
2            3        Vodafone      F   57  Arunachal Pradesh    Delhi   
3            4            BSNL      M   46         Tamil Nadu  Kolkata   
4            5            BSNL      F   26            Tripura    Delhi   

   pincode date_of_registration  num_dependents  estimated_salary  calls_made  \
0   755597           2020-01-01               4            124962          44   
1   125926           2020-01-01               2            130556          62   
2   423976           2020-01-01               0            148828          49   
3   522841           2020-01-01               1             38722          80   
4   740247           2020-01-01               2             55098          78   

   sms_sent  data_used  c

In [93]:
# Check for missing values
print("Missing values:\n", df.isnull().sum())

Missing values:
 customer_id             0
telecom_partner         0
gender                  0
age                     0
state                   0
city                    0
pincode                 0
date_of_registration    0
num_dependents          0
estimated_salary        0
calls_made              0
sms_sent                0
data_used               0
churn                   0
tenure_days             0
dtype: int64


In [94]:
# Encode categorical variables
categorical_cols = ['telecom_partner', 'gender', 'state', 'city']
label_encoders = {col: LabelEncoder().fit(df[col].astype(str)) for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.transform(df[col].astype(str))

In [96]:
# Handle missing values
df.fillna(df.median(), inplace=True)

In [97]:
# Log-transform skewed features
df[['calls_made', 'sms_sent', 'data_used', 'tenure_days']] = np.log1p(df[['calls_made', 'sms_sent', 'data_used', 'tenure_days']])

In [98]:
# Drop unnecessary columns
df.drop(columns=['customer_id', 'pincode', 'date_of_registration'], inplace=True, errors='ignore')

In [99]:
# Define features and target
X, y = df.drop(columns=['churn']), df['churn']

In [100]:
# Handle class imbalance using ADASYN
adasyn = ADASYN(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

In [101]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [102]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

In [103]:
# Define Neural Network Model
nn_model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [104]:
# Compile the model
nn_model.compile(optimizer=keras.optimizers.AdamW(learning_rate=0.0005, weight_decay=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

In [105]:
# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

In [106]:
# Train the model
nn_model.fit(X_train_scaled, y_train, epochs=150, batch_size=128, validation_split=0.2, verbose=1, callbacks=[early_stopping, lr_scheduler])

Epoch 1/150
[1m1953/1953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 13ms/step - accuracy: 0.6208 - loss: 0.6783 - val_accuracy: 0.6745 - val_loss: 0.6109 - learning_rate: 5.0000e-04
Epoch 2/150
[1m1953/1953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 14ms/step - accuracy: 0.6654 - loss: 0.6188 - val_accuracy: 0.6781 - val_loss: 0.6076 - learning_rate: 5.0000e-04
Epoch 3/150
[1m1953/1953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 14ms/step - accuracy: 0.6713 - loss: 0.6124 - val_accuracy: 0.6810 - val_loss: 0.6032 - learning_rate: 5.0000e-04
Epoch 4/150
[1m1953/1953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 14ms/step - accuracy: 0.6771 - loss: 0.6077 - val_accuracy: 0.6861 - val_loss: 0.5997 - learning_rate: 5.0000e-04
Epoch 5/150
[1m1953/1953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 14ms/step - accuracy: 0.6800 - loss: 0.6038 - val_accuracy: 0.6878 - val_loss: 0.5971 - learning_rate: 5.0000e-04
Epoch 6/150
[1m1953/1953

<keras.src.callbacks.history.History at 0x79d27e5ced10>

In [107]:
# Make predictions
y_pred_probs_nn = nn_model.predict(X_test_scaled).flatten()
y_pred_nn = (y_pred_probs_nn >= 0.5).astype(int)

[1m2441/2441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step


In [108]:
# Evaluate the model
accuracy_nn = accuracy_score(y_test, y_pred_nn)
balanced_acc_nn = balanced_accuracy_score(y_test, y_pred_nn)
roc_auc_nn = roc_auc_score(y_test, y_pred_probs_nn)

print(f"Neural Network - Accuracy: {accuracy_nn:.4f}\nBalanced Accuracy: {balanced_acc_nn:.4f}\nROC AUC Score: {roc_auc_nn:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_nn))

Neural Network - Accuracy: 0.7019
Balanced Accuracy: 0.7020
ROC AUC Score: 0.7617
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.72      0.71     38945
           1       0.71      0.68      0.70     39145

    accuracy                           0.70     78090
   macro avg       0.70      0.70      0.70     78090
weighted avg       0.70      0.70      0.70     78090



In [109]:
# Train XGBoost model
xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

In [110]:
# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_scaled, y_train)

In [114]:
# Train Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
gb_model.fit(X_train_scaled, y_train)

In [115]:
# Define Stacking Classifier
estimators = [('rf', rf_model), ('xgb', xgb_model), ('gb', gb_model)]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5)
stacking_model.fit(X_train_scaled, y_train)

In [116]:
# Evaluate Stacking Model
y_pred_stack = stacking_model.predict(X_test_scaled)
accuracy_stack = accuracy_score(y_test, y_pred_stack)
balanced_acc_stack = balanced_accuracy_score(y_test, y_pred_stack)
roc_auc_stack = roc_auc_score(y_test, stacking_model.predict_proba(X_test_scaled)[:, 1])

print(f"Stacking Model - Accuracy: {accuracy_stack:.4f}\nBalanced Accuracy: {balanced_acc_stack:.4f}\nROC AUC Score: {roc_auc_stack:.4f}")

Stacking Model - Accuracy: 0.8030
Balanced Accuracy: 0.8032
ROC AUC Score: 0.8570


In [120]:
# Save models and scaler
nn_model.save("telecom_churn_nn_model_optimized.keras")
pickle.dump(scaler, open("scaler_optimized.pkl", "wb"))
pickle.dump(stacking_model, open("stacking_model.pkl", "wb"))

print("Optimized Models and Scaler saved successfully!")

Optimized Models and Scaler saved successfully!
