In [1]:
# Import necessary libraries
import pandas as pd
import pickle as pkl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

In [5]:
data = pd.read_csv('customer_churn_dataset-testing-master.csv')
data

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1,22,Female,25,14,4,27,Basic,Monthly,598,9,1
1,2,41,Female,28,28,7,13,Standard,Monthly,584,20,0
2,3,47,Male,27,10,2,29,Premium,Annual,757,21,0
3,4,35,Male,9,12,5,17,Premium,Quarterly,232,18,0
4,5,53,Female,58,24,9,2,Standard,Annual,533,18,0
...,...,...,...,...,...,...,...,...,...,...,...,...
64369,64370,45,Female,33,12,6,21,Basic,Quarterly,947,14,1
64370,64371,37,Male,6,1,5,22,Standard,Annual,923,9,1
64371,64372,25,Male,39,14,8,30,Premium,Monthly,327,20,1
64372,64373,50,Female,18,19,7,22,Standard,Monthly,540,13,1


In [6]:
# Display the first few rows of the dataset
print("Dataset Overview:")
print(data.head())

Dataset Overview:
   CustomerID  Age  Gender  Tenure  Usage Frequency  Support Calls  \
0           1   22  Female      25               14              4   
1           2   41  Female      28               28              7   
2           3   47    Male      27               10              2   
3           4   35    Male       9               12              5   
4           5   53  Female      58               24              9   

   Payment Delay Subscription Type Contract Length  Total Spend  \
0             27             Basic         Monthly          598   
1             13          Standard         Monthly          584   
2             29           Premium          Annual          757   
3             17           Premium       Quarterly          232   
4              2          Standard          Annual          533   

   Last Interaction  Churn  
0                 9      1  
1                20      0  
2                21      0  
3                18      0  
4            

In [7]:
# Check for missing values
print("\nMissing Values Summary:")
print(data.isnull().sum())


Missing Values Summary:
CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64


In [8]:
# Data Preprocessing
# Assuming 'Churn' is the target column
# Replace categorical features with dummy/one-hot encoding
data = pd.get_dummies(data, drop_first=True, dtype = float)

# Split the data into features (X) and target (y)
X = data.drop(columns=['Churn'])  # Update if 'Churn' is named differently
y = data['Churn']                # Update if the target column has a different name


In [7]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Model training
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [8]:
# Model evaluation
y_pred = rf_model.predict(X_test)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[10162     3]
 [   57  9091]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     10165
           1       1.00      0.99      1.00      9148

    accuracy                           1.00     19313
   macro avg       1.00      1.00      1.00     19313
weighted avg       1.00      1.00      1.00     19313



In [9]:
# ROC-AUC score
y_proba = rf_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)
print("\nROC-AUC Score:", roc_auc)


ROC-AUC Score: 0.9999750455481925


In [10]:
# Feature Importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop 5 Important Features:")
print(feature_importance.head(6))


Top 5 Important Features:
           Feature  Importance
5    Payment Delay    0.329177
0       CustomerID    0.277913
4    Support Calls    0.121178
2           Tenure    0.084614
8      Gender_Male    0.052494
3  Usage Frequency    0.045638


In [11]:
# Excel file of feature imp
feature_importance.to_excel('feature_importances.xlsx')

In [12]:
# Save preprocessed data for Power BI visualization
preprocessed_file = 'preprocessed_churn_data.csv'
data.to_csv(preprocessed_file, index=False)
print(f"\nPreprocessed data saved at: {preprocessed_file}")


Preprocessed data saved at: preprocessed_churn_data.csv


In [13]:
# Saving the model
model_filename = "rf_churn_model.pkl"
with open(model_filename, "wb") as file:
    pkl.dump(rf_model, file)

In [9]:

# Train the data in small inputs so that it can be easy to predict

x_small = data[['Payment Delay','Tenure','Usage Frequency','Gender_Male','Support Calls']]
y = data['Churn']

In [10]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_small, y, test_size = 0.2, random_state=42 )

In [11]:
rf_mod = RandomForestClassifier(random_state=42)
rf_mod.fit(x_train1, y_train1)

In [12]:
y_pred = rf_mod.predict(x_test1)

In [13]:
print(classification_report(y_test1, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93      6793
           1       0.91      0.95      0.93      6082

    accuracy                           0.93     12875
   macro avg       0.93      0.93      0.93     12875
weighted avg       0.93      0.93      0.93     12875



In [14]:
joblib.dump(rf_mod, 'small_churn_model.pkl', compress=3)
print("Model saved successfully!")

Model saved successfully!
