**1. Importing the dependencies**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler

: 

**2. Data Loading and Understanding**

In [None]:
# Load the CSV data into a pandas DataFrame.
# The URL is a plain string literal: it has no placeholders, so no f-string is needed.
download_url = "https://drive.google.com/uc?id=1mjx6tl9MD08uYt_CPWygESL-AOt6avuh"
df = pd.read_csv(download_url)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
pd.set_option("display.max_columns", None)

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# dropping customerID column as this is not required for modelling
df = df.drop(columns=["CustomerID"])

In [None]:
df.head()

In [None]:
df.columns

In [None]:
print(df["Gender"].unique())

In [None]:
# Print the unique values of every column that is NOT a numerical feature.
# "Age" and "Total Spend" are included in the skip list because the scaling
# step later in this notebook treats them as numeric columns; dumping all of
# their distinct values would only flood the output.
numerical_features_list = [
    "Age", "Last Interaction", "Payment Delay", "Tenure",
    "Usage Frequency", "Support Calls", "Total Spend",
]

for col in df.columns:
  if col not in numerical_features_list:
    print(col, df[col].unique())
    print("-"*50)

In [None]:
print(df.isnull().sum())

In [None]:
# Forward-fill missing values (each gap takes the previous row's value).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'), and
# reassigning instead of inplace=True keeps the step explicit.
df = df.ffill()

In [None]:
print(df.isna().sum())

In [None]:
df[df["Support Calls"]==10.0]

In [None]:
df.info()

In [None]:
# checking the class distribution of target column
print(df["Churn"].value_counts())

**Insights:**
1. Customer ID removed as it is not required for modelling
2. No missing values remain in the dataset after imputation
3. Missing values were imputed by forward fill (`ffill`) across all columns
4. Class imbalance identified in the target

**3. Exploratory Data Analysis (EDA)**

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.head(2)

In [None]:
df.describe()

**Numerical Features - Analysis**

Understand the distribution of the numerical features

In [None]:
def plot_histogram(df, column_name):
  """Plot a histogram (with KDE curve) of one column and mark its mean and median.

  Args:
    df: DataFrame holding the data.
    column_name: Name of the column to plot.
  """
  series = df[column_name]

  plt.figure(figsize=(5, 3))
  sns.histplot(series, kde=True)
  plt.title(f"Distribution of {column_name}")

  # reference lines as (x position, colour, line style, legend label)
  reference_lines = (
      (series.mean(), "red", "--", "Mean"),
      (series.median(), "green", "-", "Median"),
  )
  for position, colour, style, label in reference_lines:
    plt.axvline(position, color=colour, linestyle=style, label=label)

  plt.legend()
  plt.show()

In [None]:
plot_histogram(df, "Tenure")

In [None]:
plot_histogram(df, "Total Spend")

In [None]:
plot_histogram(df, "Payment Delay")

**Box plot for numerical features**

In [None]:
def plot_boxplot(df, column_name):
  """Draw a vertical box plot of one column.

  Args:
    df: DataFrame holding the data.
    column_name: Name of the column to plot.
  """
  plt.figure(figsize=(5, 3))
  sns.boxplot(y=df[column_name])
  plt.title(f"Box Plot of {column_name}")
  plt.ylabel(column_name)
  # plt.show must be CALLED; the bare attribute reference was a no-op
  plt.show()

In [None]:
plot_boxplot(df, "Payment Delay")

In [None]:
plot_boxplot(df, "Total Spend")

**Correlation Heatmap for numerical columns**

In [None]:
# Label-encode the categorical columns on a copy, so the heatmap can include
# them while df itself stays untouched.
df_encoded = df.copy()
label_encoders = {
    col: LabelEncoder()
    for col in ['Gender', 'Subscription Type', 'Contract Length']
}
for col, encoder in label_encoders.items():
    df_encoded[col] = encoder.fit_transform(df_encoded[col])

In [None]:
# correlation matrix - heatmap
# Plots the pairwise correlations of every column in df_encoded, with the
# coefficient written inside each cell (annot=True).
# NOTE(review): corr() is called without numeric_only, so this assumes every
# remaining column (including Churn) is already numeric - TODO confirm.
plt.figure(figsize=(10, 6))
sns.heatmap(df_encoded.corr(), annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
df.head()

Categorical features - Analysis

In [None]:
# Remove the features that are not used for modelling, in a single drop call
df = df.drop(
    columns=["Subscription Type", "Usage Frequency", "Tenure", "Contract Length"]
)

In [None]:
df.columns

In [None]:
df.info()

**4. Data Preprocessing**

In [None]:
df.head(3)

Label encoding of target column

In [None]:
df["Churn"] = df["Churn"].replace({"Yes": 1, "No": 0})

In [None]:
df.head(3)

In [None]:
print(df["Churn"].value_counts())

Label encoding of categorical features

In [None]:
# identifying columns with object data type
object_columns = df.select_dtypes(include="object").columns

In [None]:
print(object_columns)

In [None]:
# Fit one LabelEncoder per object-typed column, overwrite the column with its
# integer codes, and keep each fitted encoder under the column's name.
encoders = {}
for column in object_columns:
  encoders[column] = LabelEncoder()
  df[column] = encoders[column].fit_transform(df[column])

# Persist the fitted encoders so the same mapping can be reused at inference time
with open("encoders.pkl", "wb") as f:
  pickle.dump(encoders, f)


In [None]:
encoders

In [None]:
df.head()

**Training and test data split**

In [None]:
# Selecting only numeric columns for scaling
# ('Gender' is label-encoded and 'Churn' is the target, so neither is scaled.)
numeric_columns = ['Age', 'Support Calls',
                   'Payment Delay', 'Total Spend', 'Last Interaction']  # 'Usage Frequency' and 'Tenure' were dropped earlier
# Standardise the numeric columns in place and keep the fitted scaler.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows contribute to the mean/std - consider fitting on the
# training split only.
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
print(df.head())
# Persist the fitted scaler so inference can apply the identical transform
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)


In [None]:
X, y = df.drop(columns=["Churn"]), df["Churn"]

In [None]:
# Oversample the minority class with SMOTE (fixed seed for repeatability).
# NOTE(review): X_resampled / y_resampled are not consumed by any later cell;
# this cell only reports what the balanced class counts would look like.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check class distribution
print(pd.Series(y_resampled).value_counts())

In [None]:
# Hold out the test set BEFORE any oversampling so it keeps the real class
# balance; stratify=y keeps the churn ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance ONLY the training fold. Previously the models were trained on the
# raw imbalanced X/y and the SMOTE output was never used; resampling before
# the split would instead leak synthetic neighbours of test rows into training.
X_train, y_train = SMOTE(sampling_strategy='auto', random_state=42).fit_resample(
    X_train, y_train
)

In [None]:
print(X_train.shape,X_test.shape)

In [None]:
# Fit a baseline logistic regression and predict the held-out split
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Report accuracy, then the confusion matrix and per-class metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.2f}")
for heading, report in (
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
):
    print(heading, report)


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Predict probabilities for the test set
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guess')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Logistic Regression')
plt.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming y_test and y_pred are your true and predicted labels
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Churn (0)', 'Churn (1)'],
            yticklabels=['Non-Churn (0)', 'Churn (1)'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix for Logistic Regression')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; the fixed seed keeps the run repeatable
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Same three reports as for the other models, emitted in the same order
for heading, report in (
    ("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf)),
    ("Classification Report:\n", classification_report(y_test, y_pred_rf)),
):
    print(heading, report)

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees evaluated with log-loss; fixed seed for repeatability.
# The use_label_encoder argument was dropped: it is deprecated in current
# XGBoost releases and only produces an "unused parameter" warning.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree (fixed seed); this is the model that gets persisted
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

for heading, report in (
    ("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt)),
    ("Classification Report:\n", classification_report(y_test, y_pred_dt)),
):
    print(heading, report)

# Save the fitted tree for the inference cells
with open("churn_model.pkl", "wb") as f:
    pickle.dump(dt, f)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20,10))
plot_tree(dt,
          feature_names=X_train.columns,
          class_names=['No Churn', 'Churn'],
          filled=True,
          rounded=True,
          max_depth=5)  # Limit depth for readability
plt.show()

In [None]:
def get_business_rules(tree, feature_names, scaler=None):
    """Generate simplified business rules from the decision tree.

    Walks every root-to-leaf path of a fitted tree and returns the paths whose
    leaf is majority-churn (class index 1), most confident first.

    Args:
        tree: Fitted estimator exposing ``tree_`` with ``feature``,
            ``threshold``, ``children_left``, ``children_right`` and ``value``.
        feature_names: Column names, ordered as the model's input features.
        scaler: Optional fitted scaler exposing ``mean_`` and ``scale_``. When
            given, thresholds of the columns it scaled are converted back to
            original units; other columns keep the raw threshold.

    Returns:
        list[dict]: One ``{'rule': str, 'churn_probability': float}`` per
        churn leaf, sorted by descending churn probability.
    """
    tree_ = tree.tree_
    names = list(feature_names)

    # Map each scaled column NAME to its position inside the scaler. The scaler
    # may have been fitted on a subset of the model's columns, so positions in
    # `names` must not be used to index scaler.mean_ / scaler.scale_.
    scaler_position = {}
    if scaler is not None:
        scaled_names = getattr(scaler, "feature_names_in_", None)
        if scaled_names is None and getattr(scaler, "n_features_in_", None) == len(names):
            # Scaler fitted without column names: positions only line up when
            # it saw exactly the model's features.
            scaled_names = names
        if scaled_names is not None:
            scaler_position = {str(n): i for i, n in enumerate(scaled_names)}

    def to_original_units(name, threshold):
        """Undo standardisation for scaled columns; pass others through."""
        position = scaler_position.get(name)
        if position is None:
            return threshold
        return threshold * scaler.scale_[position] + scaler.mean_[position]

    rules = []

    def recurse(node, rule):
        left = tree_.children_left[node]
        right = tree_.children_right[node]

        # A leaf has identical child ids; this avoids relying on sklearn's
        # private `_tree` module, which this notebook never imports.
        if left != right:
            name = names[tree_.feature[node]]
            original_threshold = to_original_units(name, tree_.threshold[node])

            recurse(left, rule + [f"{name} ≤ {original_threshold:.2f}"])
            recurse(right, rule + [f"{name} > {original_threshold:.2f}"])
        else:
            class_prob = tree_.value[node][0]
            churn_prob = float(class_prob[1] / class_prob.sum())
            if churn_prob > 0.5:  # Only show rules that lead to churn
                rules.append({
                    'rule': " AND ".join(rule),
                    'churn_probability': churn_prob
                })

    recurse(0, [])
    return sorted(rules, key=lambda x: x['churn_probability'], reverse=True)

# Extract the churn rules, passing the scaler so thresholds are shown in
# approximate original units
business_rules = get_business_rules(dt, X_train.columns, scaler)

print("Top Business Rules Leading to Churn:")
for rank, entry in enumerate(business_rules[:5], start=1):
    print(f"\nRule {rank}: {entry['rule']}")
    print(f"Churn probability: {entry['churn_probability']:.2%}")

In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Reload the artefacts written earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open("encoders.pkl", "rb") as f:
    label_encoders = pickle.load(f)

with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

with open("churn_model.pkl", "rb") as f:
    dt = pickle.load(f)

# Identify categorical and numerical columns
categorical_cols = ['Gender']
numerical_cols = ['Age', 'Support Calls', 'Payment Delay', 'Total Spend', 'Last Interaction']

# Raw input
custom_input = {
    "Age": 30,
    "Gender": "Female",
    "Support Calls": 5,
    "Payment Delay": 18,
    "Total Spend": 932,
    "Last Interaction": 17
}

# Encode categorical inputs on a COPY so custom_input keeps its raw,
# human-readable values (encoding in place made the final printout show the
# encoded gender instead of "Female").
encoded_input = custom_input.copy()
for col in categorical_cols:
    encoded_input[col] = label_encoders[col].transform([encoded_input[col]])[0]

# Convert to DataFrame
custom_input_df = pd.DataFrame([encoded_input])

# Scale numerical features using pre-trained scaler
custom_input_df[numerical_cols] = scaler.transform(custom_input_df[numerical_cols])

# Ensure column order matches training data
custom_input_df = custom_input_df[dt.feature_names_in_]

# Make prediction
predicted_churn = dt.predict(custom_input_df)[0]

# Output result
print("Custom Input:", custom_input)
print("Predicted Churn:", "Yes" if predicted_churn == 1 else "No")


In [None]:
def get_retention_recommendation(customer_data, prediction):
    """
    Build retention recommendations for one customer from their raw profile
    and the model's churn prediction.

    Args:
        customer_data (dict): Raw (unscaled) customer features; reads the keys
            'Support Calls', 'Payment Delay', 'Total Spend', 'Age' and
            'Last Interaction'
        prediction (int): 1 if churn predicted, 0 otherwise

    Returns:
        dict: 'action' and 'message' always; 'recommendations' (a prioritised
        list of feature/description/rationale dicts) only when churn is predicted
    """
    # Guard clause: nothing to recommend for customers predicted to stay
    if prediction == 0:
        return {
            "action": "No immediate action needed",
            "message": "Customer is not predicted to churn. Maintain current engagement."
        }

    support_calls = customer_data['Support Calls']
    payment_delay = customer_data['Payment Delay']
    total_spend = customer_data['Total Spend']
    age = customer_data['Age']
    days_since_contact = customer_data['Last Interaction']

    recommendations = []

    def suggest(feature, description, rationale):
        """Append one recommendation entry."""
        recommendations.append({
            "feature": feature,
            "description": description,
            "rationale": rationale
        })

    # Support calls: escalate harder the more calls were made
    if support_calls >= 15:
        suggest(
            "VIP Support",
            "Immediate escalation to senior support team with 24hr resolution SLA",
            f"Extremely high support calls ({support_calls}) indicate serious unresolved issues",
        )
    elif support_calls >= 8:
        suggest(
            "Dedicated Support",
            "Assign a dedicated account manager for immediate issue resolution",
            f"Multiple support calls ({support_calls}) suggest recurring problems",
        )

    # Payment delay: relief for severe delays, flexibility for moderate ones
    if payment_delay > 30:
        suggest(
            "Payment Relief",
            "Offer payment plan with first month free and reduced installments",
            f"Severe payment delay ({payment_delay} days) indicates financial distress",
        )
    elif payment_delay > 10:
        suggest(
            "Payment Flexibility",
            "Waive late fees and extend due date by 2 weeks",
            f"Payment delay ({payment_delay} days) may indicate temporary cash flow issues",
        )

    # Total spend: exactly one of these two is always added
    if total_spend < 1000:
        suggest(
            "Value Boost",
            "Free upgrade to premium features for 60 days",
            f"Mid-range spending (${total_spend}) suggests opportunity to demonstrate value",
        )
    else:
        suggest(
            "Elite Retention",
            "Personalized account review with executive team and custom benefits package",
            f"High-value customer (${total_spend}) worth exceptional retention efforts",
        )

    # Age
    if age <= 44:
        suggest(
            "Next-Gen Engagement",
            "Access to beta features and innovation community",
            f"Younger customer (age {age}) may value cutting-edge features",
        )

    # Days since last interaction
    if days_since_contact > 20:
        suggest(
            "Reactivation Campaign",
            "Personalized We want you back offer with time-sensitive benefits",
            f"{days_since_contact} days since last interaction indicates disengagement",
        )

    def priority(recommendation):
        """Rank by the first matching keyword in the feature name; 4 if none match."""
        label = recommendation["feature"]
        for rank, keyword in enumerate(("Crisis", "VIP", "Payment"), start=1):
            if keyword in label:
                return rank
        return 4

    # sorted() is stable, so entries with equal priority keep insertion order
    prioritized_recommendations = sorted(recommendations, key=priority)

    return {
        "action": "Immediate retention action required",
        "message": f"Customer matches {len(recommendations)} key churn indicators",
        "recommendations": prioritized_recommendations
    }

# Enhanced prediction function with better formatting
def predict_churn_with_recommendations(custom_input):
    """
    Predict churn for one raw customer record and attach retention advice.

    Uses the notebook-level label_encoders, scaler, dt, categorical_cols and
    numerical_cols. The input is encoded on a copy, so custom_input itself is
    returned unmodified under 'customer_data'.

    Args:
        custom_input (dict): Raw customer features.

    Returns:
        dict: customer_data, prediction ("Yes"/"No"), churn_probability
        (formatted percentage string), retention_advice and key_risk_factors.
    """
    # Encode categoricals on a copy of the raw record
    encoded_input = custom_input.copy()
    encoded_input.update(
        (col, label_encoders[col].transform([encoded_input[col]])[0])
        for col in categorical_cols
    )

    # Single-row frame, scaled and reordered to the model's training columns
    features = pd.DataFrame([encoded_input])
    features[numerical_cols] = scaler.transform(features[numerical_cols])
    features = features[dt.feature_names_in_]

    # Hard label plus the probability of the churn class (index 1)
    predicted_churn = dt.predict(features)[0]
    churn_prob = dt.predict_proba(features)[0][1]

    # Recommendations are driven by the RAW values, not the scaled ones
    recommendations = get_retention_recommendation(custom_input, predicted_churn)

    # Collect only the risk factors whose threshold is exceeded
    key_risk_factors = []
    if custom_input['Support Calls'] >= 8:
        key_risk_factors.append(f"High support calls ({custom_input['Support Calls']})")
    if custom_input['Payment Delay'] > 10:
        key_risk_factors.append(f"Payment delay ({custom_input['Payment Delay']} days)")
    if custom_input['Last Interaction'] > 20:
        key_risk_factors.append(f"Recent inactivity ({custom_input['Last Interaction']} days)")

    if predicted_churn == 1:
        prediction_label = "Yes"
    else:
        prediction_label = "No"

    return {
        "customer_data": custom_input,
        "prediction": prediction_label,
        "churn_probability": f"{churn_prob:.1%}",
        "retention_advice": recommendations,
        "key_risk_factors": key_risk_factors,
    }

# Demo: run the full prediction + recommendation flow for one sample customer
sample_customer = dict([
    ("Age", 30),
    ("Gender", "Female"),
    ("Support Calls", 5),
    ("Payment Delay", 18),
    ("Total Spend", 932),
    ("Last Interaction", 17),
])

result = predict_churn_with_recommendations(sample_customer)
advice = result['retention_advice']

# Report header and customer profile
print("\n🚀 Customer Churn Analysis Report")
print("=" * 60)
print("\n🔍 Customer Profile:")
for field, value in result['customer_data'].items():
    print(f"  - {field}: {value}")

print(f"\n⚠️ Churn Prediction: {result['prediction']} (Probability: {result['churn_probability']})")

# The detail sections are only shown for customers predicted to churn
if result['prediction'] == "Yes":
    print("\n🔑 Key Risk Factors Identified:")
    for risk in result['key_risk_factors']:
        print(f"  - {risk}")

    print("\n🛡️ Recommended Retention Actions:")
    for number, action in enumerate(advice['recommendations'], start=1):
        print(f"\n{number}. {action['feature']}")
        print(f"   💡 Offer: {action['description']}")
        print(f"   📊 Rationale: {action['rationale']}")

    print("\n" + "=" * 60)
    print(f"💼 Business Impact: {advice['message']}")
    print(f"🚨 Action Required: {advice['action']}")