In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [None]:
# Load the raw churn data. `df` keeps the frame exactly as read from disk;
# every later cell works on `data`, which was previously never defined and
# made the notebook fail with a NameError on a fresh kernel. A copy is used
# so that later cleaning steps never alter `df`.
df = pd.read_csv('churn_train.csv')
data = df.copy()

In [None]:
# Exploratory Data Analysis (EDA)
# Preview the first few rows of the dataset
print("Dataset Overview:")
print(data.head())

# Summarize the dataset structure.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
data.info()

# Count missing values per column
print("\nMissing Values:\n", data.isnull().sum())

# Basic statistics of the numerical features
print("\nBasic Statistics:\n", data.describe())

In [None]:
# Step 1: Data Cleaning and Exploration
# Drop unnecessary columns (reassigned instead of mutated in place)
data = data.drop(columns=['rownumber', 'customerid', 'surname'])

# Handle missing values by forward-filling from the previous row.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in pandas 2.1+.
# NOTE(review): NaNs in the very first row have no previous value to copy and
# stay NaN after a forward fill -- confirm this is acceptable for this data.
data = data.ffill()

# Encode categorical variables as dummy columns; drop_first=True omits the
# first level of each column so the dummies are not redundant
data = pd.get_dummies(data, columns=['geography', 'gender'], drop_first=True)

In [None]:
# Step 2: Feature Engineering
# Add two ratio features. Each denominator is shifted by one
# (presumably to avoid dividing by zero, e.g. for zero tenure -- TODO confirm).
data['balance_to_product_ratio'] = data['balance'].div(data['numofproducts'].add(1))
data['age_to_tenure_ratio'] = data['age'].div(data['tenure'].add(1))

# Separate the target ('exited') from the feature matrix
y = data['exited']
X = data.drop(columns='exited')


In [None]:
# Split the dataset into training (80%) and testing (20%) sets.
# stratify=y keeps the proportion of each target class the same in both
# splits, so the test-set precision/recall are not skewed by an unlucky
# draw of the positive class. random_state fixes the shuffle for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Step 3: Model Selection and Tuning
# Initialize the XGBClassifier.
# `use_label_encoder` is intentionally not passed: it is deprecated since
# XGBoost 1.6 and on 2.x only triggers a "Parameters ... are not used"
# warning on every fit, which floods the output during a grid search.
xgb = XGBClassifier(eval_metric='logloss')

# Define parameter grid for tuning (3 * 3 * 3 * 2 * 2 = 108 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}


In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation, ranking
# candidates by ROC AUC and using all available cores. fit() returns the
# fitted search object itself, so construction and fitting are chained.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Report the winning hyper-parameter combination
print("Best parameters:", grid_search.best_params_)

# Keep the estimator fitted with the best parameters for evaluation
best_xgb = grid_search.best_estimator_

In [None]:
# Step 4: Model Evaluation
# Score the held-out test set with the tuned model:
#   y_prob -- second column of predict_proba (used for ROC AUC / ROC curve)
#   y_pred -- predicted class labels (used for the threshold-based metrics)
y_prob = best_xgb.predict_proba(X_test)[:, 1]
y_pred = best_xgb.predict(X_test)

In [None]:
# Evaluation metrics
# Label-based metrics are computed from the predicted classes ...
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ... while ROC AUC is computed from the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_prob)

# Print every metric rounded to two decimals
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC Score", roc_auc),
):
    print(f"{label}: {value:.2f}")

In [None]:
# Plot ROC Curve
# Compute the curve points from the predicted probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Draw on an explicit figure/axes pair instead of the pyplot state machine
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="best")
plt.show()