In [8]:
# ============================================================
# SCRIPT 2: xgboost_trainer.py
# Train XGBoost on rule-generated segments
# ============================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from xgboost import XGBClassifier
import joblib

# ------------------------------------------------------------
# 1. READ THE SEGMENT-LABELED CSV (OUTPUT OF SCRIPT 1)
# ------------------------------------------------------------
df = pd.read_csv("/kaggle/input/hcl-ml/final.csv")

# ------------------------------------------------------------
# 2. SPLIT OFF THE TARGET AND STRIP LEAKAGE COLUMNS
# ------------------------------------------------------------
# Target: the rule-generated segment name for each row.
y = df["segment"]

# Columns withheld from the feature matrix: the target (raw and encoded),
# the churn-related fields, and the ID / location-key columns.
leakage_cols = [
    "segment",
    "segment_encoded",
    "Churn Label",
    "Churn Value",
    "Churn Score",
    "Churn Category",
    "Churn Reason",
    "Customer Status",
    "Customer ID",
    "Service ID",
    "Status ID",
    "Location ID",
    "ID",
    "Lat Long",
]

# errors="ignore" skips any listed column that is absent from the CSV,
# so only the columns actually present are removed.
X = df.drop(columns=leakage_cols, errors="ignore")

# ------------------------------------------------------------
# 3. MAP SEGMENT NAMES TO INTEGER CLASS CODES
# ------------------------------------------------------------
# Learn the set of segment names first, then replace each name in y
# with its integer code. The fitted encoder is kept in `le` so the
# codes can be translated back to names later.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

# Show which integer code each segment name received.
class_codes = le.transform(le.classes_)
print("Label mapping:", dict(zip(le.classes_, class_codes)))

# ------------------------------------------------------------
# 4. COLUMN TYPES
# ------------------------------------------------------------
# Partition EVERY feature column into exactly one of two groups.
# The ColumnTransformer built below uses the default remainder="drop",
# so any column that appears in neither list is silently discarded.
# Selecting only "object" / "int64" / "float64" would lose bool,
# category, int32/float32 and newer string-dtype columns that way.
#   numeric_cols     -> every numeric dtype (any int/float width)
#   categorical_cols -> everything else (object, string, category, bool)
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

# ------------------------------------------------------------
# 5. FEATURE PREPROCESSING
# ------------------------------------------------------------
# Each entry is (step name, transformer, columns it applies to):
#   "cat" - one-hot encode the categorical columns; handle_unknown="ignore"
#           lets transform() accept categories not seen during fit
#   "num" - standardize the numeric columns
transformer_specs = [
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ("num", StandardScaler(), numeric_cols),
]

preprocessor = ColumnTransformer(transformers=transformer_specs)

# ------------------------------------------------------------
# 6. XGBOOST CLASSIFIER + END-TO-END PIPELINE
# ------------------------------------------------------------
# Hyperparameters are collected in one place so they are easy to scan.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
}
xgb_model = XGBClassifier(**xgb_params)

# Chain preprocessing and the classifier so that fit/predict always
# apply the same feature transformations before the model sees the data.
pipeline = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", xgb_model),
    ]
)

# ------------------------------------------------------------
# 7. HOLD OUT A TEST SET
# ------------------------------------------------------------
# 25% of the rows are held out for testing; stratifying on y keeps the
# class proportions the same in both parts, and the fixed seed makes
# the split repeatable.
split_options = {
    "test_size": 0.25,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ------------------------------------------------------------
# 8. FIT THE PIPELINE ON THE TRAINING PART
# ------------------------------------------------------------
pipeline.fit(X_train, y_train)

# ------------------------------------------------------------
# 9. SCORE THE MODEL ON THE HELD-OUT TEST SET
# ------------------------------------------------------------
y_pred = pipeline.predict(X_test)

# Compute all metrics up front, then print them in a fixed order.
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("\n=========== MODEL SCORES ===========")
headline_scores = (
    ("Accuracy:", accuracy),
    ("Macro F1:", macro_f1),
    ("Weighted F1:", weighted_f1),
)
for title, value in headline_scores:
    print(title, value)

# Per-class precision / recall / F1; rows are the integer class codes.
print("\nClassification Report:")
print(report_text)

# Confusion matrix as returned by sklearn for (y_test, y_pred).
print("\nConfusion Matrix:")
print(matrix)

# ------------------------------------------------------------
# 10. PERSIST THE FITTED PIPELINE AND THE LABEL ENCODER
# ------------------------------------------------------------
# The encoder is stored alongside the pipeline so predicted integer
# codes can be mapped back to segment names after loading.
artifacts = (
    ("/kaggle/working/telco_xgb_model.pkl", pipeline),
    ("/kaggle/working/label_encoder.pkl", le),
)
for artifact_path, artifact in artifacts:
    joblib.dump(artifact, artifact_path)

print("\nModel saved successfully!")


Label mapping: {'critical': 0, 'habitual_defaulter': 1, 'occasional_defaulter': 2}

Accuracy: 0.8580352072685974
Macro F1: 0.8176682326367466
Weighted F1: 0.8529139192032125

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.97      0.91      1015
           1       0.91      0.71      0.80       429
           2       0.81      0.69      0.74       317

    accuracy                           0.86      1761
   macro avg       0.86      0.79      0.82      1761
weighted avg       0.86      0.86      0.85      1761


Confusion Matrix:
[[987  21   7]
 [ 79 305  45]
 [ 89   9 219]]

Model saved successfully!
