In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Read both CSV splits from the mounted Drive folder, in (train, test) order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Customer churn/Train.csv',
        '/content/drive/MyDrive/Customer churn/Test.csv',
    )
)

In [None]:
# Report the (rows, columns) of each split, one per line.
print(
    f"Train Shape : {train.shape}",
    f"Test Shape : {test.shape}",
    sep="\n",
)

In [None]:
# Stack the train and test splits row-wise into one frame so cleaning and EDA
# run on all rows at once.
# ignore_index=True assigns a fresh 0..n-1 index: each CSV is read with its own
# default index, so without it the combined frame would carry repeated row
# labels, which makes label-based selection ambiguous downstream.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.head()

In [None]:
# Show the (rows, columns) of the combined frame.
df.shape

In [None]:
# List the column labels of the combined frame.
print(df.columns)

In [None]:
# Remove the CustomerID column before analysis and modeling.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
df = df.drop(columns='CustomerID', errors='ignore')

In [None]:
# Print column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for the numeric columns (pandas' default selection).
df.describe()

In [None]:
# Summary statistics restricted to the object-dtype columns.
df.describe(include= 'object')

In [None]:
# Walk every column and dump its distinct values, with a separator line
# after each one.
for column_name, column_values in df.items():
    print(column_name)
    print(column_values.unique())
    print("*******************")

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Partition the column labels by dtype: numeric first, then object (string).
num_col, cat_col = (
    df.select_dtypes(include=dtype_kind).columns
    for dtype_kind in ('number', 'object')
)

In [None]:
# One box per numeric column, drawn on an explicit Axes, to eyeball outliers.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df[num_col], palette='Greens', ax=ax)

ax.set_title('Boxplot for Outlier Detection')
# Tilt the column names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Bar chart of how many rows fall into each Churn class.
fig, ax = plt.subplots(figsize=(5, 5))

sns.countplot(data=df, x="Churn", width=.4, ax=ax)
ax.set(title="Churn Distribution", xlabel="Churn", ylabel="Count")

plt.show()

In [None]:
# Histogram (with KDE overlay) for every numeric column, laid out on a grid.
# The row count is derived from the number of columns so the grid always has
# enough cells; a fixed 2x4 layout would fail as soon as a ninth numeric
# column appears.
hist_grid_cols = 4
hist_grid_rows = -(-len(num_col) // hist_grid_cols)  # ceiling division

plt.figure(figsize=(15, 10))

for i, col in enumerate(num_col, 1):
    plt.subplot(hist_grid_rows, hist_grid_cols, i)
    # No `palette` here: seaborn ignores it (with a warning) when no `hue`
    # variable is assigned, so it never affected the plot.
    sns.histplot(data=df, x=col, kde=True, bins=30)
    plt.title(f'Distribution of {col}')

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side count plots of each categorical column, split by Churn.
plt.figure(figsize=(10, 6))

for position, cat_name in enumerate(cat_col, start=1):
    panel = plt.subplot(1, 3, position)
    sns.countplot(data=df, x=cat_name, hue='Churn', palette="Greens", ax=panel)
    panel.set_title(f'Distribution of {cat_name}')

plt.tight_layout()
plt.show()

In [None]:
# Compare the Tenure distribution between the Churn classes.
plt.figure(figsize=(6, 5))
# `palette` without `hue` is deprecated in seaborn 0.13, so the grouping
# variable is assigned to `hue` explicitly; dodge=False keeps one box per
# x position instead of shifting boxes per hue level.
ax = sns.boxplot(data=df, x='Churn', y='Tenure', hue='Churn',
                 palette="Blues", dodge=False)
# The hue legend only repeats the x-axis labels, so drop it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Tenure by Churn')
plt.show()

In [None]:
# Single-column heatmap of each numeric column's correlation with Churn,
# ordered from the lowest to the highest coefficient.
fig, ax = plt.subplots(figsize=(6, 6))

churn_corr = df[num_col].corr()['Churn']
corr_with_target = churn_corr.sort_values(ascending=True).to_frame()

heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=True,
    linewidths=0.5,
    linecolor='white',
    square=True,
)
sns.heatmap(data=corr_with_target, ax=ax, **heatmap_style)

ax.set_title("Correlation  With Target", fontsize=18)
plt.show()

In [None]:
print("\nStarting Modeling Phase...\n")

# Column groups consumed by the preprocessing ColumnTransformer. They are
# spelled out by name so each transformer receives exactly the intended columns.
nominal_features = ['Gender']
ordinal_features = ['Subscription Type', 'Contract Length']
numeric_features = [
    'Age',
    'Tenure',
    'Usage Frequency',
    'Support Calls',
    'Payment Delay',
    'Total Spend',
    'Last Interaction',
]

In [None]:
# Separate the Churn target from the feature columns.
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows; stratifying on y keeps the class ratio equal in
# both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Explicit category orders for the ordinal columns; OrdinalEncoder maps each
# list to integers in the order given here.
subscription_order = ['Basic', 'Standard', 'Premium']
contract_order = ['Monthly', 'Quarterly', 'Annual']

# One (name, transformer, columns) triple per feature group.
numeric_step = ('num', StandardScaler(), numeric_features)
nominal_step = (
    'cat_nom',
    OneHotEncoder(drop='first', sparse_output=False),
    nominal_features,
)
ordinal_step = (
    'cat_ord',
    OrdinalEncoder(categories=[subscription_order, contract_order]),
    ordinal_features,
)

# verbose_feature_names_out=False keeps the output column names free of the
# transformer-name prefix.
preprocessor = ColumnTransformer(
    transformers=[numeric_step, nominal_step, ordinal_step],
    verbose_feature_names_out=False,
)

In [None]:
# Candidate classifiers, keyed by display name. Dict order is the training
# order, so XGBoost is trained last.
models = {
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: the XGBoost releases
    # that accept `device=` no longer have that option and only emit an
    # "unused parameter" warning for it.
    "XGBoost": XGBClassifier(
        eval_metric='logloss',
        device="cuda",          # Run training on the GPU (requires a CUDA device)
        tree_method="hist"      # Histogram-based tree construction
    )
}

# Hyper-parameter grids, keyed by the same names as `models`. The
# "classifier__" prefix targets the pipeline step named 'classifier'.
param_grids = {
    "Logistic Regression": {
        "classifier__C": [0.01, 0.1, 1, 10]
    },
    "Decision Tree": {
        "classifier__max_depth": [3, 5, 10, None],
        "classifier__min_samples_split": [2, 5, 10]
    },
    "Random Forest": {
        "classifier__n_estimators": [50, 100, 150],
        "classifier__max_depth": [3, 5, 10, None]
    },
    "XGBoost": {
        "classifier__n_estimators": [50, 100, 150],
        "classifier__learning_rate": [0.01, 0.1, 0.2],
        "classifier__max_depth": [3, 5, 7]
    }
}

In [None]:
# Train every candidate model and record its held-out test metrics.
# `results` collects one dict per model; its keys (including "F1-score")
# become the columns of the summary table built from this list.
results = []

for name, model in models.items():
    print(f"Training {name}...")

    # Pipeline: Preprocess -> Undersample -> Model
    # (the imblearn Pipeline applies the sampler during fit only, so
    # predictions are made on un-resampled data)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('sampler', RandomUnderSampler(random_state=42)),
        ('classifier', model)
    ])

    if name in param_grids:
        # 3-fold grid search on the training split, selecting by F1.
        grid = GridSearchCV(
            pipeline,
            param_grid=param_grids[name],
            scoring='f1',
            cv=3,
            n_jobs=-1
        )
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
    else:
        best_model = pipeline.fit(X_train, y_train)
        best_params = "Default"

    # Score this model on the test split and store the row. Previously nothing
    # was appended, so `results` stayed empty and no model was ever evaluated.
    test_pred = best_model.predict(X_test)
    results.append({
        "Model": name,
        "Best Params": best_params,
        "Accuracy": accuracy_score(y_test, test_pred),
        "Precision": precision_score(y_test, test_pred),
        "Recall": recall_score(y_test, test_pred),
        "F1-score": f1_score(y_test, test_pred),
    })

# NOTE: after the loop, `best_model` is the tuned pipeline of the LAST entry
# in `models`, not necessarily the one with the highest F1-score.

In [None]:
# Predict on the held-out test split with `best_model`, i.e. whichever
# pipeline the training loop assigned last.
y_pred = best_model.predict(X_test)

In [None]:
# Build the summary table from the collected result rows, best F1-score first,
# with a clean 0..n-1 index.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="F1-score", ascending=False)
    .reset_index(drop=True)
)

# Show the table under a short heading.
print("Final Model Evaluation Summary:")
display(results_df)

In [None]:
import shap

# Pull the fitted preprocessing step and the fitted classifier out of the
# pipeline. The classifier is expected to be the tree-based XGBoost model,
# since TreeExplainer is used below.
preprocessor_step = best_model.named_steps['preprocessor']

xgboost_step = best_model.named_steps['classifier']

# SHAP has to see the same transformed matrix the classifier was trained on.
X_test_transformed = preprocessor_step.transform(X_test)

# Ask the fitted ColumnTransformer for its output column names instead of
# concatenating the per-transformer names by hand: this is guaranteed to match
# the column order of `X_test_transformed`.
feature_names = preprocessor_step.get_feature_names_out().tolist()

explainer = shap.TreeExplainer(xgboost_step)
shap_values = explainer.shap_values(X_test_transformed)

plt.figure()
# The size is passed via `plot_size`: summary_plot resizes the current figure
# itself (default plot_size="auto"), which would override a figsize given to
# plt.figure().
shap.summary_plot(
    shap_values,
    X_test_transformed,
    feature_names=feature_names,
    plot_size=(12, 10),
    show=False,
)
plt.title("SHAP Summary Plot (Feature Impact on Churn)", fontsize=16)
plt.show()

In [None]:
import joblib
import os

# Google Drive must already be mounted at /content/drive. The mount call is
# kept disabled here:
#   from google.colab import drive
#   drive.mount('/content/drive')

# Make sure the target folder exists, then pickle the fitted pipeline into it.
folder_path = '/content/drive/MyDrive/Customer churn'
file_path = os.path.join(folder_path, 'Best_Model.pkl')

os.makedirs(folder_path, exist_ok=True)
joblib.dump(best_model, file_path)

In [None]:
import os

from huggingface_hub import login, HfApi

# Authenticate without embedding the access token in the notebook: the token
# is read from the HF_TOKEN environment variable. When the variable is unset,
# token=None makes login() fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))
api = HfApi()

# Upload the pickled pipeline to the model repository.
model_repo = "Ravichandrachilde/Churn-with-SHAP"
api.upload_file(
    path_or_fileobj="/content/drive/MyDrive/Customer churn/Best_Model.pkl",
    path_in_repo="Best_Model.pkl",
    repo_id=model_repo,
    repo_type="model"
)

# Upload both CSV splits to the dataset repository.
dataset_repo = "Ravichandrachilde/Churn"
api.upload_file(
    path_or_fileobj="/content/drive/MyDrive/Customer churn/Train.csv",
    path_in_repo="Train.csv",
    repo_id=dataset_repo,
    repo_type="dataset"
)
api.upload_file(
    path_or_fileobj="/content/drive/MyDrive/Customer churn/Test.csv",
    path_in_repo="Test.csv",
    repo_id=dataset_repo,
    repo_type="dataset"
)

In [None]:
# 1. Get the fitted preprocessor from inside the pipeline
# 'best_model' is the last model trained (XGBoost)
fitted_preprocessor = best_model.named_steps['preprocessor']

# 2. Take the output column names straight from the FITTED ColumnTransformer.
# This always matches the order of the columns the classifier was trained on,
# unlike concatenating the per-transformer names by hand.
feature_names = fitted_preprocessor.get_feature_names_out().tolist()

# 3. Extract importance from the classifier step
importances = best_model.named_steps['classifier'].feature_importances_

# 4. Create DataFrame (most important feature first) and plot
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 8))
# `palette` without `hue` is deprecated in seaborn 0.13, so the bar variable is
# assigned to `hue` explicitly; dodge=False keeps each bar at full width.
ax = sns.barplot(data=importance_df, x='Importance', y='Feature',
                 hue='Feature', palette='viridis', dodge=False)
# The hue legend only repeats the y-axis labels, so drop it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('XGBoost Feature Importance')
plt.show()