In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# The data paths used below are relative to the project root, so step up one
# level when the kernel's working directory ends in "notebooks".
if os.getcwd().endswith("notebooks"):
    os.chdir(os.pardir)

In [16]:
# Raw churn CSV, relative to the project root.
file_path = os.path.join(os.path.join("data", "raw"), "churn.csv")

In [17]:
def load_data(raw_path, output_path):
    """Load the raw CSV, normalise it, and write the cleaned copy to disk.

    Cleaning steps:
      * column names are lower-cased and spaces replaced with underscores;
      * ``totalcharges`` (when present) is coerced to numeric, with
        unparseable or missing entries replaced by 0;
      * ``churn`` (when present and not already numeric) is mapped from
        'Yes'/'No' to 1/0.

    Args:
        raw_path: Path of the CSV file to read.
        output_path: Path of the cleaned CSV to write. Its parent directory
            is created when it does not exist.

    Raises:
        FileNotFoundError: If ``raw_path`` does not exist.
    """
    if not os.path.exists(raw_path):
        raise FileNotFoundError(f"The file {raw_path} does not exist.")

    print(f"Loading data from {raw_path}...")
    data = pd.read_csv(raw_path)

    data.columns = [col.lower().replace(" ", "_") for col in data.columns]

    # Guarded like 'churn' below so a file without this column still loads.
    if 'totalcharges' in data.columns:
        data['totalcharges'] = pd.to_numeric(
            data['totalcharges'], errors='coerce'
        ).fillna(0)

    # Only map textual labels: mapping an already-numeric column through the
    # Yes/No dictionary would turn every label into NaN.
    if 'churn' in data.columns and not pd.api.types.is_numeric_dtype(data['churn']):
        data['churn'] = data['churn'].map({'Yes': 1, 'No': 0})

    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    data.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}.")

In [18]:
# Cleaned dataset path, followed by the train/test split outputs.
input_path = os.path.join("data", "raw", "customer_churn.csv")
output_train_path, output_test_path = (
    os.path.join("data", "processed", split_file)
    for split_file in ("train.csv", "test.csv")
)

In [19]:
def process_data(input_path, output_train_path, output_test_path):
    """One-hot encode the cleaned dataset and write an 80/20 train/test split.

    The ``customerid`` column is dropped when present, the remaining feature
    columns are one-hot encoded with ``drop_first=True``, and the rows are
    split with ``test_size=0.2`` and ``random_state=42``. Each output file
    holds the feature columns followed by the ``churn`` target.

    Args:
        input_path: Path of the cleaned CSV to read.
        output_train_path: Destination CSV for the training split.
        output_test_path: Destination CSV for the test split.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        KeyError: If the dataset has no ``churn`` column.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"The file {input_path} does not exist.")

    print(f"Loading data from {input_path}...")
    data = pd.read_csv(input_path)

    if 'customerid' in data.columns:
        data = data.drop('customerid', axis=1)

    if 'churn' not in data.columns:
        raise KeyError(f"Target column 'churn' not found in {input_path}.")

    # Separate the target before encoding so get_dummies can never replace a
    # textual 'churn' column with a 'churn_<label>' dummy column.
    y = data['churn']
    X = pd.get_dummies(data.drop('churn', axis=1), drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    train_data = pd.concat([X_train, y_train], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    # to_csv does not create missing directories; make sure both targets exist.
    for destination in (output_train_path, output_test_path):
        destination_dir = os.path.dirname(destination)
        if destination_dir:
            os.makedirs(destination_dir, exist_ok=True)

    train_data.to_csv(output_train_path, index=False)
    test_data.to_csv(output_test_path, index=False)

    print(f"Training data saved to {output_train_path}.")
    print(f"Test data saved to {output_test_path}.")

In [22]:
# Clean the raw file first, then encode and split the cleaned copy.
load_data(raw_path=file_path, output_path=input_path)
process_data(
    input_path=input_path,
    output_train_path=output_train_path,
    output_test_path=output_test_path,
)

Loading data from data/raw/churn.csv...
Data saved to data/raw/customer_churn.csv.
Loading data from data/raw/customer_churn.csv...
Training data saved to data/processed/train.csv.


In [23]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import os

In [27]:
def train(train_path=None, test_path=None, params=None):
    """Train a logistic-regression baseline and log the run to MLflow.

    Reads the train/test CSVs, fits ``LogisticRegression``, computes accuracy
    and ROC AUC on the test split, and logs parameters, metrics and the fitted
    model under the "churn-prediction-baseline" experiment.

    Args:
        train_path: CSV with the training split. Defaults to the
            notebook-level ``output_train_path``.
        test_path: CSV with the test split. Defaults to the notebook-level
            ``output_test_path``.
        params: Optional dict of ``LogisticRegression`` keyword arguments that
            override the baseline values (C=0.01, solver="liblinear",
            max_iter=1000).
    """
    # Fall back to the paths defined earlier in the notebook so a bare
    # train() call keeps working exactly as before.
    if train_path is None:
        train_path = output_train_path
    if test_path is None:
        test_path = output_test_path

    # 1. Load the data prepared in the previous step
    print("üöÄ ƒêang load d·ªØ li·ªáu train/test...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Split features (X) from the target (y):
    # the target is the 'churn' column, every other column is a feature.
    X_train = train_df.drop('churn', axis=1)
    y_train = train_df['churn']
    X_test = test_df.drop('churn', axis=1)
    y_test = test_df['churn']

    # --- MLFLOW STARTS HERE ---
    # set_experiment groups the runs together under one experiment name
    mlflow.set_experiment("churn-prediction-baseline")

    with mlflow.start_run():
        # 2. Define the model parameters (baseline values, then overrides)
        model_params = {
            "C": 0.01,           # Regularization strength
            "solver": "liblinear",
            "max_iter": 1000
        }
        if params:
            model_params.update(params)

        # Log the parameters to MLflow (a record of what was tuned)
        mlflow.log_params(model_params)

        # 3. Train the model
        print("üß† ƒêang training model...")
        model = LogisticRegression(**model_params)
        model.fit(X_train, y_train)

        # 4. Evaluate the model
        predictions = model.predict(X_test)
        predict_proba = model.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, predictions)
        auc = roc_auc_score(y_test, predict_proba)

        print(f"üìä K·∫øt qu·∫£: Accuracy={acc:.4f}, AUC={auc:.4f}")

        # Log the metrics to MLflow
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("roc_auc", auc)

        # 5. Save the model to MLflow
        # so it can be loaded again from anywhere
        mlflow.sklearn.log_model(model, "model")

        print("‚úÖ ƒê√£ log model v√† metrics l√™n MLflow!")

In [28]:
# Run the baseline training; parameters, metrics and the model are logged to MLflow.
train()



üöÄ ƒêang load d·ªØ li·ªáu train/test...
üß† ƒêang training model...
üìä K·∫øt qu·∫£: Accuracy=0.8148, AUC=0.8616




‚úÖ ƒê√£ log model v√† metrics l√™n MLflow!


In [None]:
# Start the MLflow UI from the shell; the server runs in the foreground until it is stopped (CTRL+C).
!mlflow ui

  return FileStore(store_uri, artifact_uri)
  return FileStore(store_uri)
[MLflow] Security middleware enabled with default settings (localhost-only). To allow connections from other hosts, use --host 0.0.0.0 and configure --allowed-hosts and --cors-allowed-origins.
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:5000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started parent process [[36m[1m14155[0m]
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:5000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started parent process [[36m[1m14155[0m]
[32mINFO[0m:     Started server process [[36m14159[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Started server process [[36m14159[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Started server process [[36m14157[0m]
[32mINFO[0m:     Waiting for application sta