# Logistic Regression Binary Classification

- Logistic Regression Visualizer: https://regression-logistic.streamlit.app/
- Churn Prediction App: https://churnr.streamlit.app/

### 1. Loading packages and data

In [2]:
# Load packages 
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    RocCurveDisplay,
    roc_auc_score,
    roc_curve,
    classification_report
)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from scipy.stats import loguniform, uniform

import warnings

# Suppress every warning category for the remainder of the notebook
warnings.simplefilter("ignore")

# For local W&B use: set the agent's WANDB_AGENT_DISABLE_FLAPPING switch
os.environ.update({"WANDB_AGENT_DISABLE_FLAPPING": "true"})




In [3]:
# Read the churn CSV, then report its dimensions and column names
df = pd.read_csv("customer_churn.csv")
dataset_shape = df.shape
column_names = df.columns.tolist()
print(f"Dataset shape: {dataset_shape}")
print(f"Columns: {column_names}")
df.head()

Dataset shape: (3150, 9)
Columns: ['Call  Failure', 'Complains', 'Subscription  Length', 'Charge  Amount', 'Seconds of Use', 'Frequency of SMS', 'Distinct Called Numbers', 'Age', 'Churn']


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of SMS,Distinct Called Numbers,Age,Churn
0,8,0,38,0,4370,5,17,30,0
1,0,0,39,0,318,7,4,25,0
2,10,0,37,0,2453,359,24,30,0
3,10,0,38,0,4198,1,35,15,0
4,3,0,38,0,2393,2,33,15,0


In [4]:
# Separate the "Churn" target column from the predictor columns
y = df["Churn"]
X = df.drop("Churn", axis=1)

In [5]:
# Standardize feature names: trim leading/trailing whitespace and collapse any
# run of internal whitespace (two or more spaces, tabs) into a single space.
# A literal two-space replace leaves a double space behind when a header
# contains three or more consecutive spaces. `regex=True` is passed explicitly
# because the default of `str.replace` differs between pandas versions.
X.columns = X.columns.str.strip().str.replace(r"\s+", " ", regex=True)
print("Feature names after cleaning:", X.columns.tolist())

Feature names after cleaning: ['Call Failure', 'Complains', 'Subscription Length', 'Charge Amount', 'Seconds of Use', 'Frequency of SMS', 'Distinct Called Numbers', 'Age']


### Initialize W&B
Log in to Weights & Biases with `wandb.login()` and set the project name. Each stage below starts its own run with `wandb.init()`.

In [None]:
import wandb
wandb.login()  # Authenticate with W&B; the API key only needs to be pasted once
PROJECT = "mlops-project-v1"  # W&B project name passed to wandb.init / wandb.sweep below

[34m[1mwandb[0m: Currently logged in as: [33msagar-chhabriya[0m ([33msagar-chhabriya-sukkur-iba-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


#### Log dataset as W&B Artifact


In [7]:
# Describe the raw CSV as a versioned W&B dataset artifact.
# NOTE(review): "features" lists the column names of X, while "missing_values"
# and "data_types" are keyed by df's own column names (which include "Churn").
dataset_artifact = wandb.Artifact(
    name="customer-churn-dataset",
    type="dataset",
    description="Telecom churn dataset for classification project",
    metadata={
        "rows": df.shape[0],
        "features": list(X.columns),
        "target": "Churn",
        "missing_values": df.isnull().sum().to_dict(),
        "data_types": df.dtypes.astype(str).to_dict(),
        "source": "customer_churn.csv"
    }
)
dataset_artifact.add_file("customer_churn.csv")

# Using the run as a context manager finishes it even when logging the
# artifact raises, so a failed cell cannot leave a dangling open run.
with wandb.init(project=PROJECT, job_type="data-versioning") as run:
    run.log_artifact(dataset_artifact)


### Baseline Model

In [8]:
# Train, evaluate and version the baseline model inside a single W&B run.
# The context manager finishes the run even if a step below raises.
with wandb.init(project=PROJECT, job_type="baseline") as run:

    # Split and scale. X_train / X_test stay bound to the *scaled* arrays at
    # notebook level because the tuned-model cell later refits on X_train.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)  # fit the scaler on training data only
    X_test = scaler.transform(X_test)

    # Train baseline model
    baseline_model = LogisticRegression(max_iter=1000, random_state=42)
    baseline_model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = baseline_model.predict(X_test)
    y_prob = baseline_model.predict_proba(X_test)[:, 1]  # probability of class 1

    metrics = {
        "baseline_accuracy": accuracy_score(y_test, y_pred),
        "baseline_roc_auc": roc_auc_score(y_test, y_prob),
        "baseline_precision": precision_score(y_test, y_pred),
        "baseline_recall": recall_score(y_test, y_pred)
    }
    run.log(metrics)

    print("Baseline Performance")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    # Save and log as model artifact
    os.makedirs("models", exist_ok=True)
    baseline_objects = {
        "logistic_regression_baseline.pkl": baseline_model,
        "scaler.pkl": scaler,
        "feature_names.pkl": list(X.columns),
    }

    baseline_artifact = wandb.Artifact(
        name="logistic-regression-baseline",
        type="model",
        description="Baseline Logistic Regression for churn prediction",
        metadata=metrics
    )
    # Add the three files explicitly instead of the whole "models" directory:
    # on a re-run the directory may already hold files written by other cells
    # (e.g. the tuned model), which would otherwise end up in this artifact.
    for filename, obj in baseline_objects.items():
        path = os.path.join("models", filename)
        joblib.dump(obj, path)
        baseline_artifact.add_file(path)
    run.log_artifact(baseline_artifact)


[34m[1mwandb[0m: Adding directory to artifact (models)... Done. 0.1s


Baseline Performance
baseline_accuracy: 0.9032
baseline_roc_auc: 0.9052
baseline_precision: 0.9130
baseline_recall: 0.4242


0,1
baseline_accuracy,▁
baseline_precision,▁
baseline_recall,▁
baseline_roc_auc,▁

0,1
baseline_accuracy,0.90317
baseline_precision,0.91304
baseline_recall,0.42424
baseline_roc_auc,0.90521


### Define Sweep Configuration

In [9]:
# Hyperparameter search space for the sweep
search_space = {
    "C": {"distribution": "log_uniform_values", "min": 1e-4, "max": 1e4},
    "penalty": {"values": ["elasticnet"]},  # "l1", "l2"
    "solver": {"values": ["saga"]},  # "liblinear", "lbfgs"
    "max_iter": {"values": [100, 200, 500]},
    "l1_ratio": {"distribution": "uniform", "min": 0.0, "max": 1.0},
}

# Random search that maximizes the "val_roc_auc" metric
sweep_config = dict(
    method="random",
    metric={"name": "val_roc_auc", "goal": "maximize"},
    parameters=search_space,
)

# Register the sweep under the project and keep its id for the agent
sweep_id = wandb.sweep(sweep_config, project=PROJECT)
print(f"Created Sweep: {sweep_id}")


Create sweep with ID: 1a4hnfg7
Sweep URL: https://wandb.ai/sagar-chhabriya-sukkur-iba-university/mlops-project-v1/sweeps/1a4hnfg7
Created Sweep: 1a4hnfg7


### Sweep Training Function

In [10]:
# Settings for local environments (VSCode, etc.)
os.environ.update(
    {
        "WANDB_NOTEBOOK_NAME": "ignore",  # Disable IPython display mode
        "WANDB_SILENT": "true",           # Suppress HTML output
    }
)

def sweep_train():
    """Train and score one logistic-regression configuration for the W&B sweep.

    Intended to be passed to ``wandb.agent``. Reads ``C``, ``penalty``,
    ``solver``, ``max_iter`` and ``l1_ratio`` from the run config, fits a
    ``LogisticRegression`` and logs ``val_accuracy``, ``val_roc_auc``,
    ``val_precision`` and ``val_recall``.

    The metrics are computed on a validation split carved out of the training
    portion. The 20% test split (same arguments as the baseline cell) is not
    used here, so it is not consumed by hyperparameter selection.

    Reads the notebook-level ``X``, ``y`` and ``PROJECT``. Returns ``None``.
    """
    # The context manager finishes the run even when fitting or scoring raises.
    with wandb.init(project=PROJECT) as run:
        config = run.config

        # Reproduce the 80/20 stratified split and discard the test part.
        X_dev, _, y_dev, _ = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        # 25% of the remaining 80% -> 60/20/20 train/validation/test overall.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_dev, y_dev, test_size=0.25, random_state=42, stratify=y_dev
        )

        scaler = StandardScaler()
        X_fit = scaler.fit_transform(X_fit)  # fit on the training part only
        X_val = scaler.transform(X_val)

        # Exact comparison: a substring test would also match other strings
        # that merely contain "elasticnet".
        uses_elasticnet = config.penalty == "elasticnet"

        model = LogisticRegression(
            C=config.C,
            penalty=config.penalty,
            solver=config.solver,
            max_iter=config.max_iter,
            l1_ratio=config.l1_ratio if uses_elasticnet else None,
            random_state=42
        )

        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_val)
        y_prob = model.predict_proba(X_val)[:, 1]  # probability of class 1

        run.log({
            "val_accuracy": accuracy_score(y_val, y_pred),
            "val_roc_auc": roc_auc_score(y_val, y_prob),
            # Strongly regularized models may predict no positives at all;
            # report precision 0 in that case without emitting a warning.
            "val_precision": precision_score(y_val, y_pred, zero_division=0),
            "val_recall": recall_score(y_val, y_pred)
        })


### Run the Sweep

In [11]:
# Launch a local agent that executes sweep_train for 5 sweep runs
wandb.agent(sweep_id=sweep_id, function=sweep_train, count=5)

[34m[1mwandb[0m: Agent Starting Run: p772cqv1 with config:
[34m[1mwandb[0m: 	C: 378.302698965983
[34m[1mwandb[0m: 	l1_ratio: 0.2624570977527203
[34m[1mwandb[0m: 	max_iter: 500
[34m[1mwandb[0m: 	penalty: elasticnet
[34m[1mwandb[0m: 	solver: saga


### Log Best Model (After Sweep)

In [12]:
# Hyperparameters entered by hand from the sweep results (assumed to be the
# best configuration reported in the W&B summary). Update them after
# re-running the sweep.
best_params = {
    "C": 378.30,
    "penalty": "elasticnet",
    "solver": "saga",
    "l1_ratio": 0.2624,
    "max_iter": 500,
}

# The context manager finishes the run even if a step below raises.
# Passing the parameters as config records them with the run.
with wandb.init(project=PROJECT, job_type="model-tuning", config=best_params) as run:

    best_model = LogisticRegression(random_state=42, **best_params)
    # X_train / X_test are the scaled splits created in the baseline cell.
    best_model.fit(X_train, y_train)

    # Evaluate the tuned model the same way the baseline is evaluated, so the
    # two artifacts carry comparable metrics.
    tuned_pred = best_model.predict(X_test)
    tuned_prob = best_model.predict_proba(X_test)[:, 1]  # probability of class 1
    tuned_metrics = {
        "tuned_accuracy": accuracy_score(y_test, tuned_pred),
        "tuned_roc_auc": roc_auc_score(y_test, tuned_prob),
        "tuned_precision": precision_score(y_test, tuned_pred),
        "tuned_recall": recall_score(y_test, tuned_pred)
    }
    run.log(tuned_metrics)

    print("Tuned Performance")
    for k, v in tuned_metrics.items():
        print(f"{k}: {v:.4f}")

    # Save and log tuned model
    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, os.path.join("models", "logistic_regression_tuned.pkl"))

    tuned_artifact = wandb.Artifact(
        name="logistic-regression-tuned",
        type="model",
        description="Tuned Logistic Regression via W&B Sweep",
        metadata={"source_sweep": sweep_id, **tuned_metrics}
    )
    # Add only what is needed to serve the tuned model. Adding the whole
    # "models" directory would also bundle the baseline pickle.
    # scaler.pkl and feature_names.pkl are written by the baseline cell.
    for filename in ("logistic_regression_tuned.pkl", "scaler.pkl", "feature_names.pkl"):
        tuned_artifact.add_file(os.path.join("models", filename))
    run.log_artifact(tuned_artifact)

    # The target path names the collection only; aliases are passed through the
    # separate `aliases` argument, so the ":latest" suffix is dropped here.
    # NOTE(review): the "mlops-capstone" prefix differs from PROJECT - confirm
    # it is the intended destination.
    run.link_artifact(tuned_artifact, "mlops-capstone/logistic-regression-model")
