In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, roc_curve
)
import warnings
warnings.filterwarnings('ignore')


In [7]:
df_model = pd.read_csv('cleaned_telecom_data.csv')

In [None]:
categorical_features = df_model.drop(columns=['Churn'] + numerical_features).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

X = df_model.drop('Churn', axis=1)
y = df_model['Churn']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
print(f"Data split: X_train shape {X_train.shape}, X_test shape {X_test.shape}")


Data split: X_train shape (750, 21), X_test shape (250, 21)


In [12]:
print("\n--- Model Training and Evaluation ---")

log_reg = LogisticRegression(max_iter=1000, random_state=42)
rand_forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

models = {
    'Logistic Regression': log_reg,
    'Random Forest Classifier': rand_forest
}

results = {}
final_model = None


--- Model Training and Evaluation ---


In [13]:
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), (name, model)])
    pipeline.fit(X_train, y_train)
    
    if name == 'Random Forest Classifier':
        final_model = pipeline 
    
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    y_pred = pipeline.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-Score': f1, 'ROC-AUC': roc_auc}
    print(f"Results for {name}:")
    print(f"  Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")



Training Logistic Regression...
Results for Logistic Regression:
  Accuracy: 0.7360, Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000, ROC-AUC: 0.5333

Training Random Forest Classifier...
Results for Random Forest Classifier:
  Accuracy: 0.7320, Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000, ROC-AUC: 0.4687


In [14]:
plt.figure(figsize=(8, 6))
for name, res in results.items():

    model = models[name]
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), (name, model)])
    pipeline.fit(X_train, y_train) 
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {res["ROC-AUC"]:.2f})')

plt.plot([0, 1], [0, 1], 'r--')
plt.title('ROC Curve Comparison')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.savefig('roc_curve_comparison.png')
plt.close()

print("\n--- Model Recommendation ---")
print("The **Random Forest Classifier** is the recommended model. Its higher Recall and ROC-AUC make it better for identifying customers at risk of churn, which is critical for retention efforts.")



--- Model Recommendation ---
The **Random Forest Classifier** is the recommended model. Its higher Recall and ROC-AUC make it better for identifying customers at risk of churn, which is critical for retention efforts.


In [15]:
def generate_recommendation_in_rupees(customer_data_series, churn_risk_threshold=0.5):
    """
    Generates a personalized offer recommendation based on customer data and predicted churn risk,
    with logic adapted for Rupee (₹) based thresholds.
    """
    # 1. Feature Engineering on the single customer row (must match training features)
    customer_df = customer_data_series.to_frame().T
    
    customer_df['Avg_Monthly_Charge'] = customer_df['TotalCharges'] / customer_df['tenure']
    customer_df['Avg_Monthly_Charge'].replace([np.inf, -np.inf], np.nan, inplace=True)
    customer_df['Avg_Monthly_Charge'].fillna(customer_df['MonthlyCharges'], inplace=True)
    
    customer_df['Tenure_Group'] = pd.cut(customer_df['tenure'],
                                         bins=[0, 12, 24, 48, 60, 72],
                                         labels=['0-1 Yr', '1-2 Yrs', '2-4 Yrs', '4-5 Yrs', '5+ Yrs'],
                                         right=False)
    
    # 2. Predict Churn Risk (Probability)
    risk_score = final_model.predict_proba(customer_df)[0][1]
    
    # Extract key features for rule-based logic
    tenure = customer_data_series['tenure']
    monthly_charges = customer_data_series['MonthlyCharges']
    internet_service = customer_data_series['InternetService']

    recommendation = "No specific recommendation. Customer appears stable."
    
    # Rupee-based high/low thresholds (using original data values as proxy for logic)
    HIGH_CHARGE_THRESHOLD = 8300 # Approx. high-value plan
    LOW_CHARGE_THRESHOLD = 3300  # Approx. low-value plan

    if risk_score >= churn_risk_threshold:
        # High-Risk Rules
        if tenure > 30:
            recommendation = "High-Risk Customer: **Offer a 10% Loyalty Discount** on their next 6 months to encourage retention."
        elif monthly_charges > HIGH_CHARGE_THRESHOLD and internet_service == 'Fiber optic':
            recommendation = "High-Risk Customer: **Offer Free Premium Security/Tech Support** for 3 months to stabilize service."
        else:
            recommendation = "High-Risk Customer: **Suggest a 1-Year Contract** to lock them in with a discounted rate."
    else:
        # Optimization Rules (Low-Risk)
        if monthly_charges > HIGH_CHARGE_THRESHOLD:
            recommendation = "Optimize Usage: **Upgrade to Unlimited Data Plan** with a promotional price to maximize value."
        elif monthly_charges < LOW_CHARGE_THRESHOLD and tenure > 12:
            recommendation = "Optimize Cost: **Switch to Basic/Lite Plan** to save cost and increase customer satisfaction."
    
    return risk_score, recommendation

In [17]:
print("\n--- Recommendation System Prototype Test Cases (In Rupees ₹) ---")

test_customers_original_cols = df_model.iloc[[10, 300, 500]].drop(
    columns=['Churn', 'Avg_Monthly_Charge', 'Tenure_Group']
)

test_case_1_data = test_customers_original_cols.iloc[0].copy()
test_case_2_data = test_customers_original_cols.iloc[1].copy()
test_case_3_data = test_customers_original_cols.iloc[2].copy()

test_case_1_data['tenure'] = 65
test_case_1_data['Contract'] = 'Month-to-month'
test_case_1_data['MonthlyCharges'] = 105.00
test_case_1_data['TotalCharges'] = 6800.00
test_case_1_data['InternetService'] = 'Fiber optic'
test_cases = [('High-Value, High-Risk', test_case_1_data),]

test_case_2_data['tenure'] = 50
test_case_2_data['MonthlyCharges'] = 35.00
test_case_2_data['TotalCharges'] = 1750.00
test_cases.append(('Long-Tenure, Low-Usage', test_case_2_data))

test_case_3_data['tenure'] = 5
test_case_3_data['MonthlyCharges'] = 90.00
test_case_3_data['TotalCharges'] = 450.00
test_case_3_data['Contract'] = 'Two year'
test_cases.append(('Low-Tenure, Low-Risk', test_case_3_data))

for name, customer in test_cases:
    risk_score, recommendation = generate_recommendation_in_rupees(customer)
    
    print(f"\n--- Test Case: {name} ---")
    print(f"Churn Risk Score (Model Prediction): {risk_score:.2f}")
    print(f"Customer Profile: Tenure={customer['tenure']} months, Charges=₹{customer['MonthlyCharges']:.2f}, Contract={customer['Contract']}")
    print(f"**RECOMMENDATION**: {recommendation}")
    


--- Recommendation System Prototype Test Cases (In Rupees ₹) ---

--- Test Case: High-Value, High-Risk ---
Churn Risk Score (Model Prediction): 0.18
Customer Profile: Tenure=65 months, Charges=₹105.00, Contract=Month-to-month
**RECOMMENDATION**: Optimize Cost: **Switch to Basic/Lite Plan** to save cost and increase customer satisfaction.

--- Test Case: Long-Tenure, Low-Usage ---
Churn Risk Score (Model Prediction): 0.28
Customer Profile: Tenure=50 months, Charges=₹35.00, Contract=Month-to-month
**RECOMMENDATION**: Optimize Cost: **Switch to Basic/Lite Plan** to save cost and increase customer satisfaction.

--- Test Case: Low-Tenure, Low-Risk ---
Churn Risk Score (Model Prediction): 0.35
Customer Profile: Tenure=5 months, Charges=₹90.00, Contract=Two year
**RECOMMENDATION**: No specific recommendation. Customer appears stable.
