In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [2]:

df = pd.read_csv('customer_churn_data2(in).csv')

# Optional basic info
basic_info = {
    "shape": df.shape,
    "columns": df.columns.tolist(),
    "null_values": df.isnull().sum().sort_values(ascending=False).head(10),
    "sample_data": df.head(3)
}

In [3]:

data = df.copy()

# 1. Impute missing values
num_cols = data.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = data.select_dtypes(include=['object']).columns.tolist()

data[num_cols] = SimpleImputer(strategy='mean').fit_transform(data[num_cols])
data[cat_cols] = SimpleImputer(strategy='most_frequent').fit_transform(data[cat_cols])

# 2. Encode categorical columns
label_enc_cols = ['Gender', 'Location', 'Occupation', 'Income Bracket',
                  'Channel', 'Device Type', 'OS Version', 'Current Plan',
                  'Plan History', 'Payment Mode', 'Survey Feedback']

label_encoders = {}
for col in label_enc_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 3. Tenure feature from date
data['Onboarding Date'] = pd.to_datetime(data['Onboarding Date'], errors='coerce')
data['Customer Tenure (months)'] = (pd.to_datetime("2025-06-01") - data['Onboarding Date']) / np.timedelta64(1, 'm')
data['Customer Tenure (months)'] = data['Customer Tenure (months)'].fillna(data['Customer Tenure (months)'].mean())

# 4. Drop high-cardinality / unused
data = data.drop(columns=['CustomerID', 'Onboarding Date'])

In [4]:

# 5. Feature-label split
X = data.drop('Churned', axis=1)
y = data['Churned']

# Ensure Churned not in features
assert 'Churned' not in X.columns, "Churned column should not be in features"

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# 7. Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:

# 1. Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)
log_preds = log_reg.predict(X_test_scaled)
log_probs = log_reg.predict_proba(X_test_scaled)[:, 1]

# 2. XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)
xgb_probs = xgb.predict_proba(X_test)[:, 1]

# 3. Evaluation
log_report = classification_report(y_test, log_preds, output_dict=True)
xgb_report = classification_report(y_test, xgb_preds, output_dict=True)
log_auc = roc_auc_score(y_test, log_probs)
xgb_auc = roc_auc_score(y_test, xgb_probs)

print({
    "Logistic Regression": {"AUC": log_auc, "Report": log_report},
    "XGBoost": {"AUC": xgb_auc, "Report": xgb_report}
})

{'Logistic Regression': {'AUC': 0.935578802416489, 'Report': {'0.0': {'precision': 0.9182464454976303, 'recall': 0.9639303482587065, 'f1-score': 0.9405339805825242, 'support': 1608.0}, '1.0': {'precision': 0.8141025641025641, 'recall': 0.6479591836734694, 'f1-score': 0.7215909090909091, 'support': 392.0}, 'accuracy': 0.902, 'macro avg': {'precision': 0.8661745048000973, 'recall': 0.8059447659660879, 'f1-score': 0.8310624448367167, 'support': 2000.0}, 'weighted avg': {'precision': 0.8978342447441974, 'recall': 0.902, 'f1-score': 0.8976211385701677, 'support': 2000.0}}}, 'XGBoost': {'AUC': 0.9119993146512336, 'Report': {'0.0': {'precision': 0.8985932004689332, 'recall': 0.9533582089552238, 'f1-score': 0.9251659625829813, 'support': 1608.0}, '1.0': {'precision': 0.7448979591836735, 'recall': 0.5586734693877551, 'f1-score': 0.6384839650145773, 'support': 392.0}, 'accuracy': 0.876, 'macro avg': {'precision': 0.8217455798263034, 'recall': 0.7560158391714895, 'f1-score': 0.7818249637987793, '

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
def assign_risk_tier(prob):
    if prob < 0.4:
        return "Low"
    elif 0.4 <= prob < 0.7:
        return "Medium"
    else:
        return "High"

In [7]:
X_test_df = pd.DataFrame(X_test, columns=X.columns)
test_results = X_test_df.copy()
test_results['Churn_Probability'] = xgb_probs
test_results['Risk_Tier'] = test_results['Churn_Probability'].apply(assign_risk_tier)


In [12]:
def get_top_shap_contributions(shap_values_df, n=3):
    return shap_values_df.apply(lambda row: row.sort_values(ascending=False).head(n).index.tolist(), axis=1)

# Placeholder SHAP for structure (use real SHAP in practice)
shap_df = test_results.drop(columns=['Churn_Probability', 'Risk_Tier']).copy()
shap_df[:] = np.random.normal(0, 0.1, shap_df.shape)  # simulate shap values
shap_df['Top_Features'] = get_top_shap_contributions(shap_df)

# Merge with final results
final_results = test_results.merge(shap_df[['Top_Features']], left_index=True, right_index=True)


 -0.07857434]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  shap_df[:] = np.random.normal(0, 0.1, shap_df.shape)  # simulate shap values
  0.01276183]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  shap_df[:] = np.random.normal(0, 0.1, shap_df.shape)  # simulate shap values
  0.12211127]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  shap_df[:] = np.random.normal(0, 0.1, shap_df.shape)  # simulate shap values
 -0.02326565]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  shap_df[:] = np.random.normal(0, 0.1, shap_df.shape)  # simulate shap values
  0.24343924]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  shap_df[:] = np.random.normal(0, 0.1, shap_df.shape)  # simulate shap values
 -0.03844236]' has dtype incompatible with int32, please explicitly cast to a compatible dtype 

In [13]:
final_results

Unnamed: 0,Age,Gender,Location,Occupation,Income Bracket,Credit Score,Channel,App Logins,Web Logins,Session Duration,...,Payment Mode,Tickets Raised,Resolution Time (hrs),Sentiment Score,NPS Score,Survey Feedback,Customer Tenure (months),Churn_Probability,Risk_Tier,Top_Features
5250,27.0,0,3,3,1,346.0,2,10.0,8.0,21.807725,...,0,4.0,7.775921,0.793708,3.0,2,2368800.0,0.020375,Low,"[Failed Payments, Credit Score, Payment Mode]"
336,25.0,1,0,2,0,787.0,2,12.0,6.0,13.221753,...,1,1.0,23.874194,0.599941,7.0,1,5558400.0,0.112958,Low,"[Age, Credit Score, Income Bracket]"
262,65.0,2,1,0,2,680.0,2,14.0,5.0,10.300886,...,0,2.0,51.361090,0.750811,6.0,1,2345760.0,0.001735,Low,"[Session Duration, Channel, Declined Txns]"
2208,20.0,1,3,2,0,322.0,1,13.0,8.0,1.988410,...,0,3.0,62.414855,1.250448,3.0,1,3906720.0,0.003466,Low,"[Loans Accessed, Gender, Plan History]"
8442,30.0,2,3,0,1,782.0,1,14.0,5.0,5.276530,...,1,3.0,11.739092,0.427671,4.0,2,2098080.0,0.009548,Low,"[Survey Feedback, Payment Mode, Loans Taken]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7846,30.0,2,2,1,2,772.0,1,9.0,4.0,1.610211,...,0,3.0,66.370824,0.009352,7.0,2,2247840.0,0.983392,High,"[Gender, Income Bracket, Overdraft Events]"
8128,21.0,0,1,3,1,796.0,0,8.0,6.0,2.394064,...,0,2.0,5.728108,0.469705,0.0,1,5768640.0,0.004973,Low,"[Plan History, Card Mgmt Accessed, Credit Score]"
9074,34.0,1,1,1,1,372.0,1,11.0,10.0,5.128440,...,0,1.0,13.369935,0.850200,2.0,1,5469120.0,0.009391,Low,"[Subscription Renewals, Overdraft Events, App ..."
8389,24.0,2,2,3,2,607.0,1,18.0,9.0,0.924695,...,0,3.0,93.893274,0.066096,7.0,2,6105600.0,0.000735,Low,"[Resolution Time (hrs), Failed Payments, Payme..."


In [14]:

def generate_recommendation(row):
    tier = row['Risk_Tier']
    features = row['Top_Features']
    
    actions = []

    if tier == 'Low':
        actions.append("Send monthly loyalty points")
    elif tier == 'Medium':
        actions.append("Offer $5 cashback for next renewal")
    elif tier == 'High':
        actions.append("Immediate support call + plan upgrade offer")

    # Feature-based strategy
    if 'Tickets Raised' in features:
        actions.append("Escalate support ticket")
    if 'App Logins' in features:
        actions.append("Send re-engagement email + free month trial")
    if 'Payment Mode_Manual' in features:
        actions.append("Encourage auto-payment setup")
    if 'Credit Score' in features:
        actions.append("Offer credit-building tools")
    if 'Plan Downgrade' in features:
        actions.append("Suggest plan upgrade with bonus benefits")

    return list(set(actions))

# Apply recommendations
final_results['Recommendations'] = final_results.apply(generate_recommendation, axis=1)

# Preview final output
final_results[['Churn_Probability', 'Risk_Tier', 'Top_Features', 'Recommendations']]

Unnamed: 0,Churn_Probability,Risk_Tier,Top_Features,Recommendations
5250,0.020375,Low,"[Failed Payments, Credit Score, Payment Mode]","[Send monthly loyalty points, Offer credit-bui..."
336,0.112958,Low,"[Age, Credit Score, Income Bracket]","[Send monthly loyalty points, Offer credit-bui..."
262,0.001735,Low,"[Session Duration, Channel, Declined Txns]",[Send monthly loyalty points]
2208,0.003466,Low,"[Loans Accessed, Gender, Plan History]",[Send monthly loyalty points]
8442,0.009548,Low,"[Survey Feedback, Payment Mode, Loans Taken]",[Send monthly loyalty points]
...,...,...,...,...
7846,0.983392,High,"[Gender, Income Bracket, Overdraft Events]",[Immediate support call + plan upgrade offer]
8128,0.004973,Low,"[Plan History, Card Mgmt Accessed, Credit Score]","[Send monthly loyalty points, Offer credit-bui..."
9074,0.009391,Low,"[Subscription Renewals, Overdraft Events, App ...","[Send re-engagement email + free month trial, ..."
8389,0.000735,Low,"[Resolution Time (hrs), Failed Payments, Payme...",[Send monthly loyalty points]


In [15]:
import joblib

# Save preprocessing artifacts
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(xgb, 'xgb_model.pkl')

print("✅ Model and preprocessing objects saved successfully.")

✅ Model and preprocessing objects saved successfully.
