In [1]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Step 2: Load dataset
# Raw string literal: the Windows path contains "\c" and "\C", which are not
# valid escape sequences and make a normal string literal emit a SyntaxWarning.
# The resulting path value is unchanged.
file_path = r"D:\cts datasets\CDR-Call-Details.csv" # replace with your file path
df = pd.read_csv(file_path)

# Display basic info
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101174 entries, 0 to 101173
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Phone Number    101174 non-null  object 
 1   Account Length  101174 non-null  int64  
 2   VMail Message   101174 non-null  int64  
 3   Day Mins        101174 non-null  float64
 4   Day Calls       101174 non-null  int64  
 5   Day Charge      101174 non-null  float64
 6   Eve Mins        101174 non-null  float64
 7   Eve Calls       101174 non-null  int64  
 8   Eve Charge      101174 non-null  float64
 9   Night Mins      101174 non-null  float64
 10  Night Calls     101174 non-null  int64  
 11  Night Charge    101174 non-null  float64
 12  Intl Mins       101174 non-null  float64
 13  Intl Calls      101174 non-null  int64  
 14  Intl Charge     101174 non-null  float64
 15  CustServ Calls  101174 non-null  int64  
 16  Churn           101174 non-null  bool   
dtypes: bool(1)

  file_path = "D:\cts datasets\CDR-Call-Details.csv" # replace with your file path


Unnamed: 0,Phone Number,Account Length,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn
0,382-4657,128,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,371-7191,107,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,358-1921,137,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,375-9999,84,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,330-6626,75,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [3]:
# Step 3: Drop unnecessary columns
# `df` is rebound here, so without errors="ignore" a second run of this cell
# would raise KeyError because "Phone Number" has already been removed.
df = df.drop(columns=["Phone Number"], errors="ignore")   # Phone Number not useful

# Features and Target
# X: every remaining column except the label; y: the "Churn" label as 0/1.
X = df.drop(columns=["Churn"])
y = df["Churn"].astype(int)  # convert bool → int


In [4]:
# Step 4: Split data
# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed random_state so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report the (rows, columns) shape of each feature partition.
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (80939, 15)
Test set size: (20235, 15)


In [5]:
# Step 5: Train Random Forest
rf = RandomForestClassifier(
    # Ensemble size and per-tree growth limits
    n_estimators=200,
    max_depth=15,
    min_samples_leaf=3,
    min_samples_split=5,
    # Handles class imbalance
    class_weight="balanced",
    # Fixed seed for repeatable fits; n_jobs=-1 requests parallel execution
    random_state=42,
    n_jobs=-1,
)

# Fit on the training partition only; the test partition stays unseen.
rf.fit(X_train, y_train)


In [6]:
# Step 6: Predictions
# Score the held-out set twice: once as churn probabilities (column 1 of
# predict_proba) and once as hard class labels.
y_prob = rf.predict_proba(X_test)[:, 1]  # churn probability
y_pred = rf.predict(X_test)

# Metrics
# ROC-AUC is computed from the probabilities, accuracy from the hard labels.
roc_auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)

print("✅ Accuracy:", accuracy)
print("✅ ROC-AUC:", roc_auc)

# Per-class precision / recall / F1 table.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


✅ Accuracy: 0.9363479120336051
✅ ROC-AUC: 0.9565964863922096

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.96     18129
           1       0.64      0.88      0.74      2106

    accuracy                           0.94     20235
   macro avg       0.81      0.91      0.85     20235
weighted avg       0.95      0.94      0.94     20235



In [8]:
# Step 7: Feature importance as percentages (no graph)
features = X.columns
importances = rf.feature_importances_

# Build the table in one chain: pair each feature name with its importance,
# add a rounded percentage column, then order from most to least important.
feat_imp = (
    pd.DataFrame({"Feature": features, "Importance": importances})
    .assign(**{"Importance (%)": lambda frame: frame["Importance"].mul(100).round(2)})
    .sort_values(by="Importance", ascending=False)
)

# Show top 10 (name and percentage only)
print("Top 10 Important Features Driving Churn (in %):\n")
print(feat_imp.loc[:, ["Feature", "Importance (%)"]].head(10))


Top 10 Important Features Driving Churn (in %):

           Feature  Importance (%)
2         Day Mins           11.92
4       Day Charge           10.35
10    Night Charge            7.45
7       Eve Charge            7.38
5         Eve Mins            7.28
8       Night Mins            6.77
11       Intl Mins            6.57
13     Intl Charge            6.32
3        Day Calls            6.05
0   Account Length            5.92


In [16]:
# Step 8: Final Clean Output with Top 10 Features

# ---- Get top 10 features ----
# Recomputed here from the fitted model, so this cell does not depend on the
# importance table built by any other cell.
importances = rf.feature_importances_
features = X.columns

feat_imp = (
    pd.DataFrame({"Feature": features, "Importance": importances})
    .assign(**{"Importance (%)": lambda frame: (frame["Importance"] * 100).round(2)})
    .sort_values(by="Importance", ascending=False)
    .head(10)
)

# Convert top features into a single "name (pct%)" comma-separated string
top_features_str = ", ".join(
    f"{name} ({pct}%)"
    for name, pct in zip(feat_imp["Feature"], feat_imp["Importance (%)"])
)

# ---- Create fresh final output (no original dataset, no predicted churn) ----
# NOTE(review): Customer_Index is a positional counter (0..n-1) over the test
# rows, not the row labels of X_test, so it cannot be joined back to the
# source data — confirm that this is intended.
n_rows = len(y_test)
final_output = pd.DataFrame({
    "Customer_Index": range(n_rows),               # unique index
    "Churn_Probability": y_prob.flatten(),         # probability
    "Top_10_Features": [top_features_str] * n_rows # same features for all
})

# ✅ Show only clean output
print(final_output.head(10))

# ✅ Save only clean version
final_output.to_csv("final_churn_output.csv", index=False)


   Customer_Index  Churn_Probability  \
0               0           0.287939   
1               1           0.300219   
2               2           0.380437   
3               3           0.665324   
4               4           0.137679   
5               5           0.153241   
6               6           0.599749   
7               7           0.023886   
8               8           0.299835   
9               9           0.189959   

                                     Top_10_Features  
0  Day Mins (11.92%), Day Charge (10.35%), Night ...  
1  Day Mins (11.92%), Day Charge (10.35%), Night ...  
2  Day Mins (11.92%), Day Charge (10.35%), Night ...  
3  Day Mins (11.92%), Day Charge (10.35%), Night ...  
4  Day Mins (11.92%), Day Charge (10.35%), Night ...  
5  Day Mins (11.92%), Day Charge (10.35%), Night ...  
6  Day Mins (11.92%), Day Charge (10.35%), Night ...  
7  Day Mins (11.92%), Day Charge (10.35%), Night ...  
8  Day Mins (11.92%), Day Charge (10.35%), Night ...  
9  Day Mi

In [20]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss

# Step 9: Model Evaluation
# Predict the hard labels once and reuse them for both accuracy and F1, rather
# than running the forest over X_test twice. y_prob (churn probabilities) is
# the vector produced by the prediction cell.
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
f1 = f1_score(y_test, y_pred)
loss = log_loss(y_test, y_prob)

print("🔎 Model Evaluation Metrics")
print(f"Accuracy      : {accuracy*100:.2f}%")
print(f"ROC-AUC Score : {roc_auc*100:.2f}%")
print(f"F1 Score      : {f1*100:.2f}%")
# Log loss is a non-negative, unbounded cross-entropy value (lower is better),
# not a ratio, so it is reported as-is instead of being labelled a percentage.
print(f"Log Loss      : {loss:.4f}")


🔎 Model Evaluation Metrics
Accuracy      : 93.63%
ROC-AUC Score : 95.66%
F1 Score      : 74.30%
Log Loss      : 0.2902 (29.02%)


In [21]:
import joblib

# Persist the fitted Random Forest to disk so it can be reloaded later
# without retraining.
joblib.dump(value=rf, filename="churn_model.pkl")

print("✅ Model saved as churn_model.pkl")


✅ Model saved as churn_model.pkl
