In [1]:
import os

# Scaffold the churn project: a root folder holding data/ and model/
# sub-folders plus empty placeholder files for the training script,
# dependency list and README.
ROOT = "CHURN_PREDICTION"

folders = [f"{ROOT}/data", f"{ROOT}/model"]
files = [f"{ROOT}/train.py", f"{ROOT}/requirements.txt", f"{ROOT}/README.md"]

# exist_ok=True makes re-running the cell harmless.
for directory in folders:
    os.makedirs(directory, exist_ok=True)

# Append mode creates a missing file but never truncates an existing one.
for placeholder in files:
    with open(placeholder, "a"):
        pass

print("Churn project structure created.")

Churn project structure created.


In [3]:
import pandas as pd
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# ======================
# AUTO-DETECT DATA PATH
# ======================
# Prefer the CSV relative to the current directory (cwd is the project
# folder); otherwise fall back to the copy under CHURN_PREDICTION/ (cwd is
# the project's parent). The fallback is used without checking it exists.
_CWD_CSV = "data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
_PROJECT_CSV = "CHURN_PREDICTION/data/WA_Fn-UseC_-Telco-Customer-Churn.csv"

DATA_PATH = _CWD_CSV if os.path.exists(_CWD_CSV) else _PROJECT_CSV

print("Using data from:", DATA_PATH)

# ======================
# LOAD DATA
# ======================
df = pd.read_csv(DATA_PATH)
print("Dataset shape:", df.shape)  # shape of the raw file, before cleaning

# ======================
# CLEAN DATA
# ======================
# 1. customerID is dropped so it is not used as a feature.
# 2. TotalCharges is converted to numbers; values that cannot be parsed
#    become NaN (errors="coerce").
# 3. Any row that still contains a missing value is discarded.
df = (
    df.drop(columns=["customerID"])
    .assign(
        TotalCharges=lambda frame: pd.to_numeric(
            frame["TotalCharges"], errors="coerce"
        )
    )
    .dropna()
)

# ======================
# TARGET
# ======================
# "Churn" is the label to predict; every remaining column is a feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# ======================
# COLUMN TYPES
# ======================
# Numeric features are picked by numeric dtype and everything else is
# treated as categorical. Splitting on the "object" dtype is fragile: when
# pandas stores text in a dedicated string dtype those columns are not
# "object", so excluding "object" would hand them to StandardScaler, which
# cannot scale text.
numeric_cols = X.select_dtypes(include=["number"]).columns
# Index.drop keeps the original column order of the remaining features.
categorical_cols = X.columns.drop(numeric_cols)

# ======================
# PREPROCESSOR
# ======================
# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown="ignore" lets the encoder accept categories at predict time
# that were not present when it was fitted.
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# ======================
# PIPELINE
# ======================
# Preprocessing and classifier are chained so both are fitted and applied
# together as a single estimator.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
)

# ======================
# SPLIT
# ======================
# 80/20 hold-out split with a fixed seed; stratifying on y keeps the class
# proportions the same in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ======================
# TRAIN
# ======================
print("\nTraining model...")
pipeline.fit(X_train, y_train)

# ======================
# EVALUATE
# ======================
print("\nEvaluating model...")
preds = pipeline.predict(X_test)
acc = accuracy_score(y_test, preds)

print(f"\nAccuracy: {acc:.4f}\n")
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, preds))

# ======================
# SAVE
# ======================
# Write the model into the project's own model/ folder, i.e. the sibling of
# the data/ folder the CSV was read from. A fixed "model/model.pkl" would
# land in the current working directory even when the data was found under
# CHURN_PREDICTION/, leaving the project's model/ folder empty.
PROJECT_DIR = os.path.dirname(os.path.dirname(DATA_PATH))  # "" when cwd is the project folder
MODEL_DIR = f"{PROJECT_DIR}/model" if PROJECT_DIR else "model"
MODEL_PATH = f"{MODEL_DIR}/model.pkl"

os.makedirs(MODEL_DIR, exist_ok=True)
# The whole pipeline (preprocessing + classifier) is persisted as one object.
joblib.dump(pipeline, MODEL_PATH)

print(f"\nModel saved to {MODEL_PATH}")

Using data from: CHURN_PREDICTION/data/WA_Fn-UseC_-Telco-Customer-Churn.csv
Dataset shape: (7043, 21)

Training model...

Evaluating model...

Accuracy: 0.8038

              precision    recall  f1-score   support

          No       0.85      0.89      0.87      1033
         Yes       0.65      0.57      0.61       374

    accuracy                           0.80      1407
   macro avg       0.75      0.73      0.74      1407
weighted avg       0.80      0.80      0.80      1407


Model saved to model/model.pkl
