In [0]:
# Databricks Notebook: Model Training & Registration

import mlflow
import mlflow.sklearn
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from mlflow.models import infer_signature  
# Step 1: Obtain (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("LoanClassification")
    .getOrCreate()
)

# Step 2: Read the Lending Club sample dataset from the Databricks datasets mount
df = spark.read.parquet("/databricks-datasets/samples/lending_club/parquet/")

# Step 3: Persist the raw data as a Delta table
# mode("overwrite") replaces the contents of any existing "loan_data" table
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("loan_data")
)

# Step 4: Read the data back from the table that was just written
df_table = spark.read.table("loan_data")

# Step 5: Convert the Spark DataFrame to pandas for feature engineering
df_pandas = df_table.toPandas()

# Step 6: Feature Engineering
# Keep only the columns used for classification, then drop rows with a missing value in
# any of them. "term" is included in the null check because the integer conversion below
# fails on NaN. The explicit .copy() gives an independent frame, so the column assignments
# that follow do not trigger pandas' SettingWithCopyWarning.
df_pandas = (
    df_pandas[["loan_amnt", "funded_amnt", "term", "int_rate", "installment", "annual_inc", "dti", "delinq_2yrs", "loan_status"]]
    .dropna()
    .copy()
)

# Convert "term" (text ending in " months") to an integer number of months.
# regex=False makes the literal match explicit instead of relying on the pandas default.
df_pandas["term"] = (
    df_pandas["term"].str.replace(" months", "", regex=False).astype(int)
)

# Convert "int_rate" (text with a "%" sign) to a float.
df_pandas["int_rate"] = (
    df_pandas["int_rate"].str.replace("%", "", regex=False).astype(float)
)

# Create the binary target: 1 when the loan was "Charged Off", 0 otherwise.
# NOTE(review): every status other than "Charged Off" (not only fully paid loans) is
# labelled 0 here — confirm this is the intended definition of the negative class.
df_pandas["loan_status"] = (df_pandas["loan_status"] == "Charged Off").astype(int)

# Define features and target variable
X = df_pandas.drop(columns=["loan_status"])
y = df_pandas["loan_status"]

# Step 7: Train-Test Split
# Split BEFORE scaling so the held-out rows never influence the scaler's mean/variance
# (fitting the scaler on the full dataset leaks test-set statistics into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 8: Data Preprocessing (Normalization)
# Fit on the training rows only, then apply the same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code still referring to X_scaled keeps working; it now uses the
# statistics learned from the training split.
X_scaled = scaler.transform(X)

# Point MLflow's model registry at "databricks".
# The line below is an older workaround that monkey-patched a private MLflow helper; it is
# intentionally left disabled.
#mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"
mlflow.set_registry_uri("databricks")

# Step 9: Model Training (Logistic Regression)
with mlflow.start_run() as run:
    # Take the run id from the object returned by start_run instead of calling
    # mlflow.active_run() repeatedly.
    run_id = run.info.run_id

    model = LogisticRegression(max_iter=500, random_state=42)
    model.fit(X_train, y_train)

    # Step 10: Evaluate the Model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Log metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)

    # Infer the model signature from the training inputs and the model's predictions on them
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model as an artifact of this run under "loan_model5".
    # This does not register it; registration is a separate mlflow.register_model call.
    mlflow.sklearn.log_model(model, "loan_model5", signature=signature)

    print(f"✅ Model trained with accuracy: {accuracy:.4f}")

    # Step 11: URI of the logged model, usable later for registration
    model_uri = f"runs:/{run_id}/loan_model5"
# Leaving the `with` block ends the run, so no explicit mlflow.end_run() is needed.

In [0]:
# Point MLflow's model registry at "databricks".
# The line below is an older workaround that monkey-patched a private MLflow helper; it is
# intentionally left disabled.
#mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"
mlflow.set_registry_uri("databricks")

# Step 9: Model Training (Logistic Regression)
with mlflow.start_run() as run:
    # Take the run id from the object returned by start_run instead of calling
    # mlflow.active_run() repeatedly.
    run_id = run.info.run_id

    model = LogisticRegression(max_iter=500, random_state=42)
    model.fit(X_train, y_train)

    # Step 10: Evaluate the Model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Log metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)

    # Infer the model signature from the training inputs and the model's predictions on them
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model as an artifact of this run under "loan_model7".
    # This does not register it; registration is a separate mlflow.register_model call.
    mlflow.sklearn.log_model(model, "loan_model7", signature=signature)

    print(f"✅ Model trained with accuracy: {accuracy:.4f}")

    # Step 11: URI of the logged model, usable later for registration
    model_uri = f"runs:/{run_id}/loan_model7"
# Leaving the `with` block ends the run, so no explicit mlflow.end_run() is needed.

In [0]:
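# Optional (disabled): register the most recently logged model in the model registry.
# NOTE: `model_uri` is whatever the last executed training cell assigned, so make sure it
# points at the artifact that matches the registry name used below before enabling this.
# mlflow.register_model(model_uri, "loan_model5")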

In [0]:
# Reference only: artifact location of a previously logged "loan_model5" model.
# dbfs:/databricks/mlflow-tracking/e8e3a7829a0a4ab684162d1e8632903c/c3f7dcc2a134473c8f35be0cab06575c/artifacts/loan_model5

In [0]:
!pip install dagshub

In [0]:
import dagshub

# Initialise the DagsHub client for the target repository.
# NOTE(review): mlflow=True presumably wires MLflow tracking to this repository —
# confirm against the dagshub client documentation.
dagshub.init(
    repo_owner='SagarBansal7',
    repo_name='ML_Clf_Model',
    mlflow=True,
)

In [0]:
import os

# SECURITY: the DagsHub access token must never be written into the notebook source.
# Provide MLFLOW_TRACKING_PASSWORD from outside the notebook (for example a cluster
# environment variable backed by a secret store) before running this cell. Any token that
# has ever been committed in plain text should be treated as leaked and revoked.
os.environ['MLFLOW_TRACKING_USERNAME'] = 'SagarBansal7'
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    raise RuntimeError(
        "MLFLOW_TRACKING_PASSWORD is not set. Supply the DagsHub access token through the "
        "environment or a secret store; do not hardcode it in the notebook."
    )

# MLflow tracking endpoint of the DagsHub repository (<owner>/<repo>.mlflow).
# The S3 bucket endpoint (/api/v1/repo-buckets/s3/...) is a storage API, not an MLflow server.
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/SagarBansal7/ML_Clf_Model.mlflow'

In [0]:
# Use the server named by MLFLOW_TRACKING_URI as the model registry as well.
mlflow.set_registry_uri(os.environ['MLFLOW_TRACKING_URI'])

with mlflow.start_run() as run:
    # Take the run id from the object returned by start_run instead of calling
    # mlflow.active_run() repeatedly.
    run_id = run.info.run_id

    model = LogisticRegression(max_iter=500, random_state=42)
    model.fit(X_train, y_train)

    # Step 10: Evaluate the Model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Log metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)

    # Infer the model signature from the training inputs and the model's predictions on them
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model as an artifact of this run under "loan_model6".
    # This does not register it; registration is a separate mlflow.register_model call.
    mlflow.sklearn.log_model(model, "loan_model6", signature=signature)

    print(f"✅ Model trained with accuracy: {accuracy:.4f}")

    # Step 11: URI of the logged model, usable later for registration.
    # This must be a model URI (runs:/<run_id>/<artifact_path>), not the tracking-server URL.
    model_uri = f"runs:/{run_id}/loan_model6"
# Leaving the `with` block ends the run, so no explicit mlflow.end_run() is needed.

In [0]:
model_name = 'loan_model6'

# `run_id` comes from the training cell that logged the model. The model was logged under
# an artifact path equal to `model_name`, so the URI is derived from the run instead of
# pasting a storage path that is only valid for one historical run.
if not run_id:
    raise ValueError(
        f"run_id is empty; run the training cell that logs '{model_name}' before registering it"
    )
model_uri = f'runs:/{run_id}/{model_name}'

# Registration does not need an active run, so the finished run is not reopened.
mlflow.register_model(model_uri=model_uri, name=model_name)

In [0]:
# Load a specific registered version of the model and sanity-check it on the test set.
model_version = 1
model_uri = f"models:/{model_name}/{model_version}"

# The model is a scikit-learn LogisticRegression logged with mlflow.sklearn.log_model,
# so it must be loaded with the sklearn flavor (not mlflow.xgboost).
loaded_model = mlflow.sklearn.load_model(model_uri)
y_pred = loaded_model.predict(X_test)

# Show the first few predictions
y_pred[:4]

In [0]:
# Debug check: display the currently active MLflow run, if any, as the cell output.
mlflow.active_run()