## MODEL BUILDING – CUSTOMER CHURN ANALYSIS

## Import required libraries for data handling, modeling, and evaluation

In [1]:
import pandas as pd                                      # For data loading and manipulation
from sklearn import metrics                              # For model evaluation metrics
from sklearn.model_selection import train_test_split     # To split the dataset into train and test sets
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier          # Decision Tree model
from imblearn.combine import SMOTEENN                    # For handling imbalanced datasets (SMOTE + ENN)

## Load and analysing the preprocessed Telco Customer Churn dataset

In [2]:
# Load the dataset
df = pd.read_csv("tel_churn.csv")

In [3]:
# Display the first few rows of the dataset
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,True,False,False,True,True,False,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.5,0,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,1,False,True,True,False,True,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.3,1840.75,0,False,True,True,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,0,70.7,151.65,1,True,False,True,False,True,False,...,False,False,True,False,True,False,False,False,False,False


In [4]:
# Define feature set (X) and target variable (y)
# X → All independent variables
# y → Dependent variable ('Churn')

In [5]:
x = df.drop('Churn', axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.50,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.30,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,False,True,False,True,False,True,False,...,False,False,False,True,False,True,False,False,False,False
7028,0,103.20,7362.90,True,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7029,0,29.60,346.45,True,False,False,True,False,True,True,...,False,False,True,False,True,False,False,False,False,False
7030,1,74.40,306.60,False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [6]:
y = df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [7]:
# Split the dataset into training and testing sets
# test_size=0.2 → 20% of data for testing, 80% for training
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

## Decision Tree Classifier

In [8]:
# ------------------------------------------------------------
# Initialize the Decision Tree Classifier
# Parameters:
# - criterion = "gini" → Gini impurity used for splitting
# - random_state = 100 → For reproducibility
# - max_depth = 6 → Limits tree depth to avoid overfitting
# - min_samples_leaf = 8 → Minimum samples required in a leaf node
# ------------------------------------------------------------
model_dt = DecisionTreeClassifier(
    criterion="gini",
    random_state=100,
    max_depth=6,
    min_samples_leaf=8
)

In [9]:
# Train (fit) the Decision Tree model on training data
model_dt.fit(x_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,100
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [10]:
# Predict churn values on the test dataset
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(1407,))

In [11]:
# Evaluate model performance on the test dataset

In [12]:
print("Decision Tree Model Accuracy (Before Resampling):", model_dt.score(x_test, y_test))

Decision Tree Model Accuracy (Before Resampling): 0.7995735607675906


In [13]:
print("\nClassification Report (Before Resampling):")
print(classification_report(y_test, y_pred, labels=[0, 1]))


Classification Report (Before Resampling):
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1039
           1       0.64      0.53      0.58       368

    accuracy                           0.80      1407
   macro avg       0.74      0.71      0.72      1407
weighted avg       0.79      0.80      0.79      1407



In [14]:
# ------------------------------------------------------------
# Handle Class Imbalance using SMOTEENN (Hybrid of SMOTE + ENN)
# - SMOTE (Synthetic Minority Oversampling Technique) → Adds synthetic samples for minority class
# - ENN (Edited Nearest Neighbors) → Removes noisy samples to clean the dataset
# ------------------------------------------------------------
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x, y)

In [15]:
# Split the resampled dataset into training and testing sets
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [16]:
# Initialize and train a new Decision Tree model on the resampled dataset
model_dt_smote = DecisionTreeClassifier(
    criterion="gini",
    random_state=100,
    max_depth=6,
    min_samples_leaf=8
)

In [17]:
model_dt_smote.fit(xr_train, yr_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,100
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [18]:
# Make predictions using the SMOTEENN-trained model
yr_predict = model_dt_smote.predict(xr_test)

In [19]:
# Evaluate the improved model after balancing the dataset
model_score_r = model_dt_smote.score(xr_test, yr_test)
print("\nDecision Tree Model Accuracy (After SMOTEENN):", model_score_r)


Decision Tree Model Accuracy (After SMOTEENN): 0.9276595744680851


In [20]:
print("\nClassification Report (After SMOTEENN):")
print(metrics.classification_report(yr_test, yr_predict))


Classification Report (After SMOTEENN):
              precision    recall  f1-score   support

           0       0.93      0.90      0.92       511
           1       0.93      0.95      0.94       664

    accuracy                           0.93      1175
   macro avg       0.93      0.92      0.93      1175
weighted avg       0.93      0.93      0.93      1175



In [21]:
print("\nConfusion Matrix (After SMOTEENN):")
print(metrics.confusion_matrix(yr_test, yr_predict))


Confusion Matrix (After SMOTEENN):
[[461  50]
 [ 35 629]]


## Random Forest Classifier

In [22]:
# Import the Random Forest Classifier from scikit-learn
from sklearn.ensemble import RandomForestClassifier

In [23]:
# ------------------------------------------------------------
# Initialize the Random Forest Classifier
# Parameters:
# - n_estimators = 100 → Number of trees in the forest
# - criterion = 'gini' → Splitting criterion
# - random_state = 100 → Ensures reproducibility
# - max_depth = 6 → Limits tree depth to prevent overfitting
# - min_samples_leaf = 8 → Minimum samples required in a leaf node
# ------------------------------------------------------------
model_rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=100,
    max_depth=6,
    min_samples_leaf=8
)

In [24]:
# Train the Random Forest model on the original training data
model_rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
# Predict churn values on the test set
y_pred = model_rf.predict(x_test)

In [26]:
# Evaluate model performance on the test set (before resampling)
print("Random Forest Accuracy (Before Resampling):", model_rf.score(x_test, y_test))
print("\nClassification Report (Before Resampling):")
print(classification_report(y_test, y_pred, labels=[0, 1]))

Random Forest Accuracy (Before Resampling): 0.8173418621179815

Classification Report (Before Resampling):
              precision    recall  f1-score   support

           0       0.83      0.95      0.88      1039
           1       0.75      0.46      0.57       368

    accuracy                           0.82      1407
   macro avg       0.79      0.70      0.73      1407
weighted avg       0.81      0.82      0.80      1407



In [27]:
# ------------------------------------------------------------
# Handle Class Imbalance using SMOTEENN (resample the dataset)
# - SMOTE → Oversample minority class
# - ENN → Remove noisy samples
# ------------------------------------------------------------
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x, y)

In [28]:
# Split the resampled dataset into training and testing sets
xr_train1, xr_test1, yr_train1, yr_test1 = train_test_split(
    X_resampled1, y_resampled1, test_size=0.2
)

In [29]:
# Initialize a new Random Forest model for the resampled dataset
model_rf_smote = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=100,
    max_depth=6,
    min_samples_leaf=8
)

In [30]:
# Train the model on the resampled training data
model_rf_smote.fit(xr_train1, yr_train1)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [31]:
# Predict churn on the resampled test set
yr_predict1 = model_rf_smote.predict(xr_test1)

In [32]:
# Evaluate the Random Forest model performance after balancing

In [33]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)
print("\nRandom Forest Accuracy (After SMOTEENN):", model_score_r1)


Random Forest Accuracy (After SMOTEENN): 0.931063829787234


In [34]:
print("\nClassification Report (After SMOTEENN):")
print(metrics.classification_report(yr_test1, yr_predict1))


Classification Report (After SMOTEENN):
              precision    recall  f1-score   support

           0       0.95      0.90      0.92       551
           1       0.91      0.96      0.94       624

    accuracy                           0.93      1175
   macro avg       0.93      0.93      0.93      1175
weighted avg       0.93      0.93      0.93      1175



In [35]:
print("\nConfusion Matrix (After SMOTEENN):")
print(metrics.confusion_matrix(yr_test1, yr_predict1))


Confusion Matrix (After SMOTEENN):
[[495  56]
 [ 25 599]]


## DIMENSIONALITY REDUCTION USING PCA AND MODELING

In [36]:
# Import Principal Component Analysis (PCA) from scikit-learn
from sklearn.decomposition import PCA

In [37]:
# ------------------------------------------------------------
# Initialize PCA
# - n_components=0.9 → Retain 90% of the variance
# This reduces the number of features while keeping most information
# ------------------------------------------------------------
pca = PCA(0.9)

# Fit PCA on the resampled training data and transform it
xr_train_pca = pca.fit_transform(xr_train1)

# Apply the same transformation to the test data
xr_test_pca = pca.transform(xr_test1)

# Store the proportion of variance explained by each principal component
explained_variance = pca.explained_variance_ratio_
print("Explained variance ratio of components:", explained_variance)

Explained variance ratio of components: [0.99985192]


In [38]:
# ------------------------------------------------------------
# Initialize Random Forest Classifier
# Same hyperparameters as before
# ------------------------------------------------------------
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=100,
    max_depth=6,
    min_samples_leaf=8
)

In [39]:
# Train the Random Forest model on PCA-transformed training data
model.fit(xr_train_pca, yr_train1)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [40]:
# Predict churn on the PCA-transformed test data
yr_predict_pca = model.predict(xr_test_pca)

In [41]:
# Evaluate model performance after PCA

model_score_r_pca = model.score(xr_test_pca, yr_test1)
print("\nRandom Forest Accuracy (After PCA + SMOTEENN):", model_score_r_pca)
print("\nClassification Report (After PCA + SMOTEENN):")
print(metrics.classification_report(yr_test1, yr_predict_pca))


Random Forest Accuracy (After PCA + SMOTEENN): 0.7157446808510638

Classification Report (After PCA + SMOTEENN):
              precision    recall  f1-score   support

           0       0.70      0.70      0.70       551
           1       0.73      0.73      0.73       624

    accuracy                           0.72      1175
   macro avg       0.71      0.71      0.71      1175
weighted avg       0.72      0.72      0.72      1175



## SAVING AND LOADING THE TRAINED MODEL USING PICKLE

In [42]:
# Importing python library for serializing and deserializing objects
import pickle 

In [43]:
# Define the filename to save the trained model
filename = 'model.sav'

In [44]:
# Save the trained Random Forest model (after SMOTEENN) to disk
# 'wb' → write binary mode
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [45]:
# Load the saved model from disk
# 'rb' → read binary mode
load_model = pickle.load(open(filename, 'rb'))

In [46]:
# Evaluate the loaded model on the test data
# Ensures that the saved model retains its performance
model_score_r1 = load_model.score(xr_test1, yr_test1)
print("Accuracy of Loaded Model:", model_score_r1)

Accuracy of Loaded Model: 0.931063829787234
