In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load data: read the train and test CSV files into DataFrames (in that order).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/test-task-for-ds-churn-prediction-2025-06/train.csv',
        '/kaggle/input/test-task-for-ds-churn-prediction-2025-06/test.csv',
    )
)

In [None]:
# EDA: bar chart of how many training rows fall into each value of 'target_class'.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train, x='target_class', ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Churn (1) / Not Churned (0)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Separate the label from the features. 'Unnamed: 0' is removed from the
# features together with the label column.
y = train['target_class']
X = train.drop(columns=['Unnamed: 0', 'target_class'])

# Hold out 20% of the rows for validation, stratified on the label, with a
# fixed seed so that reruns produce the same split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Train the XGBoost model
xgb_model = xgb.XGBClassifier(
    random_state=42,
    n_estimators=100,        # fewer trees to avoid timeout
    max_depth=7,             # slightly deeper
    learning_rate=0.05,      # smaller learning rate
    subsample=0.8,           # row subsampling per tree to reduce overfitting
    colsample_bytree=0.8,    # feature subsampling per tree to reduce overfitting
    # Weight applied to the positive class to handle class imbalance.
    # NOTE(review): XGBoost's guidance is sum(negative) / sum(positive);
    # confirm that 4.0 matches the class ratio in y_train.
    scale_pos_weight=4.0,
    eval_metric='logloss',
    # `use_label_encoder` is deliberately not passed: it is deprecated and, in
    # XGBoost 2.x, is ignored with a "parameters are not used" warning.
)

# Fit on the training split only; the validation split is kept for scoring.
xgb_model.fit(X_train, y_train)


In [None]:
# Score the model on the held-out validation split: predict labels for X_val
# and compare them with y_val using the Matthews correlation coefficient (MCC).
y_pred_val = xgb_model.predict(X_val)
mcc_score = matthews_corrcoef(y_true=y_val, y_pred=y_pred_val)
print(f"Validation MCC: {mcc_score:.4f}")

In [None]:
# Pair each feature column with the importance reported by the fitted model
# and order the table from most to least important.
importances = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': xgb_model.feature_importances_,
    }
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the 20 highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=importances.iloc[:20], x='Importance', y='Feature', ax=ax)
ax.set_title('Top 20 Feature Importances (XGBoost)')
plt.show()


In [None]:
# Feature importance plot using XGBoost's built-in helper.
# plot_importance() creates its own figure unless an Axes is passed in, so the
# Axes is created here and handed over. Calling plt.figure() beforehand only
# leaves an extra empty figure, and its figsize never reaches the actual plot.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(xgb_model, max_num_features=15, ax=ax)
ax.set_title('Top 15 Feature Importances (XGBoost)')
plt.show()

In [None]:
# Generate test predictions: remove the 'Unnamed: 0' column from the test
# frame, as was done for the training features, then predict class labels.
X_test = test.drop(labels='Unnamed: 0', axis=1)
test_preds = xgb_model.predict(X_test)


In [None]:
# Save the results: one row per test record, with the test file's
# 'Unnamed: 0' column as the id and the predicted label as 'target_class'.
submission = pd.DataFrame({
    'id': test['Unnamed: 0'],
    'target_class': test_preds
})
# Write relative to the working directory instead of a hard-coded '/content/'
# path, which only exists on Colab. The relative path resolves to /content on
# Colab and to /kaggle/working on Kaggle, so the cell runs in both.
submission.to_csv('churn_predictions.csv', index=False)
print("Done! Results saved to churn_predictions.csv")

In this project, I tackled the problem of predicting customer churn using machine learning on tabular data. My goal was to identify customers who are likely to stop using a product or service within a given period.

I started by performing exploratory data analysis (EDA) to understand the dataset’s structure, class imbalance, and feature characteristics. I observed that the target classes were highly imbalanced (approximately 80% churned customers vs. 20% retained), which required careful modeling and evaluation strategies.

I trained an XGBoost classifier with the following hyperparameters:

n_estimators = 100

max_depth = 7

learning_rate = 0.05

scale_pos_weight = 4.0 (to address class imbalance)

subsample = 0.8 and colsample_bytree = 0.8 (to reduce overfitting)

Class Distribution Plot — uses seaborn’s countplot to show the imbalance in churned vs. non-churned customers.

Feature Importance Plot — uses XGBoost’s built-in plot_importance to visualize the top features driving model predictions.

I then used the trained XGBoost model to generate predictions on the test set. These predictions were saved in a submission file (churn_predictions.csv) containing the customer IDs and the predicted churn labels.

Overall, the model I developed provides a solid baseline for churn prediction on this dataset. With further improvements such as hyperparameter tuning (e.g., Optuna) or advanced ensembling techniques, the model’s performance could potentially be enhanced even further.

This project demonstrates the effectiveness of gradient-boosted tree techniques in predicting customer churn from structured tabular data, offering valuable insights for businesses to proactively retain customers and improve service quality.