In [0]:
# Databricks Notebook: Model Training & Registration

import mlflow
import mlflow.sklearn
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Turn on MLflow autologging for scikit-learn
mlflow.sklearn.autolog()

# Build the Spark session for this notebook (or fetch it via getOrCreate)
spark = (
    SparkSession.builder
    .appName("LoanClassification")
    .getOrCreate()
)

# Load the Databricks sample dataset and prepare it for classification.
#
# Only the columns needed for the model are selected, and the selection is
# done in Spark *before* .toPandas() so that the remaining columns of the
# dataset are never collected onto the driver.
DATA_PATH = "/databricks-datasets/samples/lending_club/parquet/"
MODEL_COLUMNS = [
    "loan_amnt", "funded_amnt", "term", "int_rate", "installment",
    "annual_inc", "dti", "delinq_2yrs", "loan_status",
]

# The cleaning steps are chained so each one returns a new frame; this avoids
# in-place mutation of a column-sliced frame (a SettingWithCopyWarning source)
# and keeps the cell idempotent on re-run.
df = (
    spark.read.parquet(DATA_PATH)
    .select(*MODEL_COLUMNS)
    .toPandas()
    # Drop rows with a missing value in any selected column
    .dropna()
    .assign(
        # "term" is text ending in " months"; keep only the integer part
        term=lambda d: d["term"].str.replace(" months", "", regex=False).astype(int),
        # Binary target: Default (1) when "Charged Off", otherwise Paid (0)
        loan_status=lambda d: (d["loan_status"] == "Charged Off").astype(int),
    )
)

# NOTE(review): "int_rate" appears to be stored as text such as "10.65%" in this
# sample dataset - confirm. Non-numeric values would make model fitting fail,
# so convert when needed; an already-numeric column is left untouched.
if not pd.api.types.is_numeric_dtype(df["int_rate"]):
    df["int_rate"] = df["int_rate"].str.strip().str.rstrip("%").astype(float)

# Define features & target
X = df.drop(columns=["loan_status"])
y = df["loan_status"]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model training
# Keep a handle on the run object: once the `with` block exits there is no
# active run any more, so calling mlflow.active_run() afterwards returns None
# and the run id could not be read at registration time.
with mlflow.start_run() as run:
    model = LogisticRegression(max_iter=500)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)

    # Log the fitted model as an artifact of this run; this does not register
    # it - registration happens after the run, below
    mlflow.sklearn.log_model(model, "loan_default_model")

    print(f"✅ Model trained with accuracy: {accuracy:.4f}")

# Register the logged model under a specific name, addressing the artifact
# through the id of the run captured above
model_uri = f"runs:/{run.info.run_id}/loan_default_model"
mlflow.register_model(model_uri, "loan_default_model")


In [0]:
# Sanity check: count entries of "term" that are not equal to themselves
# (a NaN is the typical value that compares unequal to itself)
term_values = df["term"]
term_values.ne(term_values).sum()