In [None]:
# Load the training data and derive a binary target from the "churn" column.
df = pd.read_csv("archive/train.csv")

if "churn" not in df.columns:
    raise ValueError("Expected a 'churn' column in the training CSV.")

# Map target to binary. Series.map yields NaN for every value that is not a
# key of the dict, so validate the result instead of letting NaNs leak into y.
df["churn_binary"] = df["churn"].map({"yes": 1, "no": 0})
unmapped_labels = df.loc[df["churn_binary"].isna(), "churn"].unique().tolist()
if unmapped_labels:
    raise ValueError(
        "Unexpected values in 'churn' column (expected 'yes' or 'no'): "
        f"{unmapped_labels}"
    )

# Features are every column except the raw and the encoded target.
X = df.drop(columns=["churn", "churn_binary"])
y = df["churn_binary"]

display(df.head())
print("Shape:", df.shape)
print("Target positive rate:", y.mean().round(4))


In [2]:
# Hold out 20% of the rows as a test set. The split is stratified on y so
# both parts keep the same class balance, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Object-dtype columns are treated as categorical; all remaining columns
# (in their original order) are treated as numerical.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
num_cols = list(X.select_dtypes(exclude=["object"]).columns)

print("Categorical:", cat_cols)
print("Numerical:", num_cols)

In [None]:
# Preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones. handle_unknown="ignore" is set on the encoder so that
# categories not seen during fit do not raise at transform time.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    steps=[("ohe", OneHotEncoder(handle_unknown="ignore"))]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)


In [None]:
# Train two candidate classifiers on the training split: a logistic
# regression and a random forest, both with class_weight="balanced".
from sklearn.base import clone  # local import: only this cell needs it

# Each pipeline gets its own unfitted copy of the preprocessing step.
# Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance would let fitting one pipeline overwrite the fitted state that
# the other pipeline relies on.
pipe_lr = Pipeline([
    ("preprocess", clone(preprocess)),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced")),
])

pipe_rf = Pipeline([
    ("preprocess", clone(preprocess)),
    ("clf", RandomForestClassifier(
        n_estimators=300, class_weight="balanced", random_state=42
    )),
])

pipe_lr.fit(X_train, y_train)
pipe_rf.fit(X_train, y_train)
print("Models trained.")


In [None]:
# Pick the better of the two fitted pipelines by ROC-AUC on the test split.
# NOTE(review): the winner is chosen on the same test set that is later used
# for reporting, so the reported metrics may be optimistic - consider
# selecting on a validation split or via cross-validation.

# Column 1 of predict_proba is the probability of the positive class (1).
proba_lr = pipe_lr.predict_proba(X_test)[:, 1]
auc_lr = roc_auc_score(y_test, proba_lr)

proba_rf = pipe_rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, proba_rf)

# Ties go to the random forest.
rf_wins = auc_rf >= auc_lr
best_name = "RandomForest" if rf_wins else "LogisticRegression"
best_model = pipe_rf if rf_wins else pipe_lr
y_proba = proba_rf if rf_wins else proba_lr

print(f"Best model: {best_name} | AUC_LR={auc_lr:.3f} | AUC_RF={auc_rf:.3f}")


In [None]:
# Evaluate the selected model on the test split using a 0.5 decision threshold.
y_pred = (y_proba >= 0.5).astype(int)

# Metrics computed from the thresholded predictions. zero_division=0 makes
# an undefined precision/recall/F1 evaluate to 0 instead of warning.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Metrics computed from the raw probabilities (threshold-independent).
roc_auc = roc_auc_score(y_test, y_proba)
avg_prec = average_precision_score(y_test, y_proba)

summary = {
    "Accuracy": acc,
    "Precision": prec,
    "Recall": rec,
    "F1": f1,
    "ROC-AUC": roc_auc,
    "PR-AUC(AP)": avg_prec,
}
print(", ".join(f"{label}={value:.3f}" for label, value in summary.items()))

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", matrix)

In [None]:
# Plot the ROC and precision-recall curves of the selected model on the test split.

# ROC curve, with the diagonal drawn as a dashed reference line.
fpr, tpr, _ = roc_curve(y_test, y_proba)
fig_roc, ax_roc = plt.subplots(figsize=(6, 4))
ax_roc.plot(fpr, tpr, label=f"{best_name} (AUC={roc_auc:.3f})")
ax_roc.plot([0, 1], [0, 1], linestyle="--")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve")
ax_roc.legend(loc="lower right")
fig_roc.tight_layout()
plt.show()

# Precision-recall curve (recall on the x-axis, precision on the y-axis).
pr_prec, pr_rec, _ = precision_recall_curve(y_test, y_proba)
fig_pr, ax_pr = plt.subplots(figsize=(6, 4))
ax_pr.plot(pr_rec, pr_prec, label=f"{best_name} (AP={avg_prec:.3f})")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.legend(loc="lower left")
fig_pr.tight_layout()
plt.show()


In [None]:
# Persist the selected fitted pipeline (preprocessing + classifier) to disk.
import joblib

MODEL_PATH = "churn_model_pipeline.joblib"

joblib.dump(best_model, MODEL_PATH)
print("Model saved -> " + MODEL_PATH)

In [None]:
# Score the rows of test.csv with the selected pipeline and save the results.
NEW_DATA_CSV = "archive/test.csv"
new_data = pd.read_csv(NEW_DATA_CSV)

# Probability of the positive class, then a hard label at the 0.5 threshold.
proba_new = best_model.predict_proba(new_data)[:, 1]
pred_new = (proba_new >= 0.5).astype(int)

# Append both as new columns on a copy, leaving new_data itself unchanged.
out = new_data.assign(churn_proba=proba_new, churn_pred=pred_new)
out.to_csv("predictions.csv", index=False)
print("Predictions saved to predictions.csv")

In [None]:
# Sweep 17 evenly spaced decision thresholds from 0.1 to 0.9 and report the
# one with the highest F1 on the test split (first one wins on ties).
# NOTE(review): the threshold is tuned on the test set, so the reported
# best F1 may be optimistic - confirm whether a validation split is intended.
thresholds = np.linspace(0.1, 0.9, 17)

f1_by_threshold = [
    f1_score(y_test, (y_proba >= candidate).astype(int), zero_division=0)
    for candidate in thresholds
]

best_idx = int(np.argmax(f1_by_threshold))  # argmax returns the first maximum
best_thr = thresholds[best_idx]
best_f1 = f1_by_threshold[best_idx]
print(f"Best F1={best_f1:.3f} at threshold={best_thr:.2f}")
