### Getting Started with AWS Sagemaker

In [1]:
!python --version

Python 3.12.9


In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

### Load the Table from the Redshift Workgroup and converting to DataFrame

In [3]:
import boto3
import pandas as pd
import time

# Set these values
workgroup_name = 'default-workgroup'
database_name = 'dev'
iam_role = 'arn:aws:iam::436749945793:role/service-role/AmazonSageMaker-ExecutionRole-20250710T140844'
query = "SELECT * FROM salesconvertion;"  # your table

# Redshift Data API client
client = boto3.client('redshift-data')

# Run the query
print(f"🔍 Executing query on workgroup: {workgroup_name}, database: {database_name}")
print(f"🔍 Query: {query}")

response = client.execute_statement(
    WorkgroupName=workgroup_name,
    Database=database_name,
    Sql=query,
    WithEvent=True
)

query_id = response['Id']
print(f"📤 Query submitted. Query ID: {query_id}")

# Wait for it to finish
status = 'STARTED'
while status in ['STARTED', 'SUBMITTED', 'PICKED']:
    status = client.describe_statement(Id=query_id)['Status']
    print(f"⏳ Query status: {status}")
    time.sleep(1)

if status == 'FINISHED':
    result = client.get_statement_result(Id=query_id)

    # Extract column names
    columns = [col['name'] for col in result['ColumnMetadata']]

    # Extract rows
    records = result['Records']
    data = []
    for record in records:
        row = [list(value.values())[0] if value else None for value in record]
        data.append(row)

    # Convert to DataFrame
    df = pd.DataFrame(data, columns=columns)
    print("✅ Query successful! Here's the data:")
    print(df.head())
else:
    print(f"❌ Query failed with status: {status}")


🔍 Executing query on workgroup: default-workgroup, database: dev
🔍 Query: SELECT * FROM salesconvertion;
📤 Query submitted. Query ID: 3ae37d68-b663-4b0e-b2aa-cfc07bce2e1f
⏳ Query status: SUBMITTED
⏳ Query status: FINISHED
✅ Query successful! Here's the data:
   index               city  converted afreecopyofmasteringtheinterview  \
0   True             Select          0                               No   
1   True             Mumbai          1                              Yes   
2   True             Mumbai          0                               No   
3   True             Mumbai          0                               No   
4   True  Thane & Outskirts          0                              Yes   

   lastactivity               leadorigin magazine donotcall search  \
0  Email Opened                      API       No        No     No   
1  Email Opened  Landing Page Submission       No        No     No   
2   Unreachable  Landing Page Submission       No        No     No   
3  Email O

In [4]:
df.head(1)

Unnamed: 0,index,city,converted,afreecopyofmasteringtheinterview,lastactivity,leadorigin,magazine,donotcall,search,totalvisits,...,asymmetriqueprofilescore,whatmattersmosttoyouinchoosingacourse,donotemail,leadquality,leadsource,newspaper,pageviewspervisit,lastnotableactivity,specialization,howdidyouhearaboutxeducation
0,True,Select,0,No,Email Opened,API,No,No,No,5.0,...,15.0,Better Career Prospects,No,,Organic Search,No,2.5,Email Opened,Select,Select


### Cleaning the Data and Building the Preprocessing Pipeline


In [5]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

# Enable pandas output from transformers
set_config(transform_output="pandas")


# Make sure this is defined at the top of your script
binary_cols = [
    'donotemail', 'donotcall', 'search', 'magazine', 'newspaperarticle',
    'xeducationforums', 'newspaper', 'digitaladvertisement',
    'throughrecommendations', 'receivemoreupdatesaboutourcourses',
    'updatemeonsupplychaincontent', 'getupdatesondmcontent',
    'iagreetopaytheamountthroughcheque', 'afreecopyofmasteringtheinterview'
]
# 1. First define all helper functions and classes

def safe_lower_strip(x):
    if pd.isna(x) or str(x).strip().lower() in ['nan', 'select', '']:
        return 'missing'
    return str(x).strip().lower()

def clean_categorical_values(df):
    return df.applymap(safe_lower_strip)

def clean_ordinal_values(x):
    return x.applymap(clean_ordinal_text)

def clean_ordinal_text(v):
    return str(v).strip().lower().replace('01.', '').replace('02.', '').replace('03.', '') if pd.notnull(v) else 'missing'

def fix_common_bad_values(x):
    return x.replace({'nan': 'missing', 'select': 'missing'}, regex=True)

def convert_to_dataframe_with_binary_cols(x):
    return pd.DataFrame(x, columns=binary_cols)

def normalize_column_names(df):
    return df.rename(columns=lambda c: c.lower().replace(' ', '').strip())

class NamedFunctionTransformer(FunctionTransformer):
    """FunctionTransformer that preserves feature names"""
    def get_feature_names_out(self, input_features=None):
        return input_features

# Convert the each column values to lowercase and handles unseen data and Missing data with Model
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Label encoding for binary columns with casing and unseen value handling"""
    
    def __init__(self):
        self.le_dict = {}
        self.mode_dict = {}
        self.columns = None

    def fit(self, X, y=None):
        if not hasattr(X, 'columns'):
            raise ValueError("CustomLabelEncoder requires DataFrame input with column names")

        self.columns = X.columns.tolist()
        for col in self.columns:
            le = LabelEncoder()
            # Normalize values: lowercase + strip spaces
            X_normalized = X[col].astype(str).str.lower().str.strip()
            self.le_dict[col] = le.fit(X_normalized)
            self.mode_dict[col] = X_normalized.mode()[0]  # Save mode for fallback
        return self

    def transform(self, X):
        if not hasattr(X, 'columns'):
            if self.columns is None:
                raise ValueError("Column names not available for numpy array input")
            X = pd.DataFrame(X, columns=self.columns)
        
        X_encoded = X.copy()
        for col in self.columns:
            le = self.le_dict[col]
            mode_value = self.mode_dict[col]
            
            # Normalize test input
            X_encoded[col] = X_encoded[col].astype(str).str.lower().str.strip()
            
            # Handle unseen values by replacing with mode
            X_encoded[col] = X_encoded[col].apply(
                lambda val: val if val in le.classes_ else mode_value
            )
            
            # Re-transform after handling unknowns
            X_encoded[col] = le.transform(X_encoded[col])
        return X_encoded

    def get_feature_names_out(self, input_features=None):
        return np.array(self.columns)

def get_feature_names(pipeline, numeric_cols, ordinal_cols, binary_cols, categorical_cols):
    col_transformer = pipeline.named_steps['transform']

    ohe = col_transformer.named_transformers_['cat'].named_steps['onehot']
    onehot_feature_names = list(ohe.get_feature_names_out(categorical_cols))

    all_feature_names = numeric_cols + ordinal_cols + binary_cols + onehot_feature_names
    return all_feature_names


# 2. Now define the main pipeline function
def get_full_preprocessing_pipeline(df):
    df = df.copy()
    df.columns = df.columns.str.lower().str.strip().str.replace(' ', '')

    numeric_cols = [
        'totalvisits', 'totaltimespentonwebsite', 'pageviewspervisit',
        'asymmetriqueactivityscore', 'asymmetriqueprofilescore'
    ]

    ordinal_cols = ['asymmetriqueactivityindex', 'asymmetriqueprofileindex']
    ordinal_categories = [['missing','low', 'medium', 'high']] * 2

    binary_cols = [
        'donotemail', 'donotcall', 'search', 'magazine', 'newspaperarticle',
        'xeducationforums', 'newspaper', 'digitaladvertisement',
        'throughrecommendations', 'receivemoreupdatesaboutourcourses',
        'updatemeonsupplychaincontent', 'getupdatesondmcontent',
        'iagreetopaytheamountthroughcheque', 'afreecopyofmasteringtheinterview'
    ]

    id_columns = ['prospectid', 'leadnumber','converted']
    existing_ids = [col for col in id_columns if col in df.columns]
    if existing_ids:
        print(f"Dropping ID columns: {existing_ids}")
        df.drop(columns=existing_ids, inplace=True)



    categorical_cols = [
        'country', 'tags', 'leadsource', 'specialization', 'lastactivity',
        'lastnotableactivity', 'howdidyouhearaboutxeducation', 'city',
        'leadprofile', 'whatisyourcurrentoccupation', 'leadquality',
        'leadorigin', 'whatmattersmosttoyouinchoosingacourse'
    ]

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('log', NamedFunctionTransformer(func=np.log1p, validate=False)),
        ('scaler', MinMaxScaler())
    ])

    ordinal_pipeline = Pipeline([
        ('clean_text', NamedFunctionTransformer(clean_ordinal_values)),
        ('fix_common_bad_values', NamedFunctionTransformer(fix_common_bad_values)),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=ordinal_categories))
    ])

    binary_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('to_df', NamedFunctionTransformer(convert_to_dataframe_with_binary_cols)),
        ('label_encoder', CustomLabelEncoder())
    ])

    categorical_pipeline = Pipeline([
        ('clean_values', NamedFunctionTransformer(clean_categorical_values)),
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ])

    column_transformer = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_cols),
            ('ord', ordinal_pipeline, ordinal_cols),
            ('bin', binary_pipeline, binary_cols),
            ('cat', categorical_pipeline, categorical_cols)
        ],
        remainder='drop',
        verbose_feature_names_out=False
    )

    full_pipeline = Pipeline([
        ('clean_column_names', NamedFunctionTransformer(normalize_column_names)),
        ('transform', column_transformer)
    ])

    return full_pipeline, numeric_cols, ordinal_cols, binary_cols, categorical_cols


### Testing the Pipeline with X_train

In [6]:
from sklearn.model_selection import train_test_split


X=df.drop(columns=['converted'])
y=df['converted']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
# 1. Create pipeline and get column groupings
pipeline, num_cols, ord_cols, bin_cols, cat_cols = get_full_preprocessing_pipeline(X_train)

# 2. Fit pipeline externally
pipeline.fit(X_train)

### Get the Feature names after transformation 

In [7]:
feature_names = get_feature_names(pipeline, num_cols, ord_cols, bin_cols, cat_cols)
print(feature_names, len(feature_names))

['totalvisits', 'totaltimespentonwebsite', 'pageviewspervisit', 'asymmetriqueactivityscore', 'asymmetriqueprofilescore', 'asymmetriqueactivityindex', 'asymmetriqueprofileindex', 'donotemail', 'donotcall', 'search', 'magazine', 'newspaperarticle', 'xeducationforums', 'newspaper', 'digitaladvertisement', 'throughrecommendations', 'receivemoreupdatesaboutourcourses', 'updatemeonsupplychaincontent', 'getupdatesondmcontent', 'iagreetopaytheamountthroughcheque', 'afreecopyofmasteringtheinterview', 'country_india', 'country_missing', 'tags_interested in other courses', 'tags_lost to eins', 'tags_missing', 'tags_ringing', 'tags_will revert after reading the email', 'leadsource_direct traffic', 'leadsource_google', 'leadsource_olark chat', 'leadsource_organic search', 'specialization_business administration', 'specialization_finance management', 'specialization_human resource management', 'specialization_it projects management', 'specialization_marketing management', 'specialization_media and a

### Model Experimentation with Logging the Metrics, Params and Artifacts into the MLFlow 

In [8]:
# ml/model_experimentation.py

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os
import joblib
import tempfile
from pathlib import Path



def run_classification_experiment(model_name, model, param_grid, X_train, y_train, X_test, y_test, pipeline=None):
           # added extra 
    mlflow.set_tracking_uri("arn:aws:sagemaker:ap-south-1:436749945793:mlflow-tracking-server/MLflowTrackingServer")
    print("Tracking working ") 
    mlflow.set_experiment("Sales_conversion_AWS")
    print("Experiment Created")

    with mlflow.start_run(run_name=model_name):
        grid = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='accuracy',
            cv=5,
            verbose=1,
            n_jobs=-1
        )
        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        mlflow.log_params(grid.best_params_)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1
        })

    
        # Save the fitted pipeline
        if pipeline:
            # Define base artifact directory (shared volume)
            artifact_dir = "data/artifacts"
            os.makedirs(artifact_dir, exist_ok=True)  # Ensure the directory exists

            # Define full path for the artifact
            pipeline_filename = "fitted_pipeline.pkl"
            pipeline_path = os.path.join(artifact_dir, pipeline_filename)

            # Save the artifact (e.g., a preprocessing pipeline)
            joblib.dump(pipeline, pipeline_path)

            # Log the artifact using MLflow
            if os.path.exists(pipeline_path):
                mlflow.log_artifact(pipeline_path, artifact_path="preprocessing_pipeline")
                
                print(f"✅ Logged artifact: {pipeline_path}")
            else:
                print(f"❌ Could not find artifact at: {pipeline_path}")
        X_full = pd.concat([X_train, X_test])
        y_full = pd.concat([y_train, y_test])
        best_model.fit(X_full, y_full)

        # Save full model again
        mlflow.sklearn.log_model(best_model, artifact_path=f"{model_name}_final_model")

        print(f"\n✅ {model_name} Logged to MLflow:")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")

def evaluate_classifiers(X_train, X_test, y_train, y_test, pipeline=None):
    dt_params = {
        'max_depth': [3, 5, None],
        'min_samples_split': [2, 4],
        'min_samples_leaf': [1, 2]
    }

    rf_params = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'min_samples_split': [2, 3],
    }

    xgb_params = {
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.05],
        'max_depth': [3, 5]
    }

    run_classification_experiment("DecisionTree", DecisionTreeClassifier(random_state=42), dt_params, X_train, y_train, X_test, y_test, pipeline)
    run_classification_experiment("RandomForest", RandomForestClassifier(random_state=42), rf_params, X_train, y_train, X_test, y_test, pipeline)
    run_classification_experiment("XGBoost", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), xgb_params, X_train, y_train, X_test, y_test, pipeline)


### Above fitted the Pipeline with Train data and transforming the train and test data 

In [9]:
# Transform the Xtrain and Xtest data 
X_train=pipeline.transform(X_train)
X_test = pipeline.transform(X_test)

### Calling the function for Training and Logging in MLFLow 

In [10]:
# Train + Log
evaluate_classifiers(X_train, X_test, y_train, y_test, pipeline)


Tracking working 
Experiment Created
Fitting 5 folds for each of 12 candidates, totalling 60 fits
✅ Logged artifact: data/artifacts/fitted_pipeline.pkl





✅ DecisionTree Logged to MLflow:
Accuracy: 0.6667, Precision: 0.5000, Recall: 1.0000, F1 Score: 0.6667
🏃 View run DecisionTree at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1/runs/f0b33e88a8fa413d8f85960757a0f5a6
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1
Tracking working 
Experiment Created
Fitting 5 folds for each of 8 candidates, totalling 40 fits
✅ Logged artifact: data/artifacts/fitted_pipeline.pkl





✅ RandomForest Logged to MLflow:
Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1 Score: 1.0000
🏃 View run RandomForest at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1/runs/fe9f094d3a8b4c658a200cfd178d3d1e
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1
Tracking working 
Experiment Created
Fitting 5 folds for each of 8 candidates, totalling 40 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

✅ Logged artifact: data/artifacts/fitted_pipeline.pkl





✅ XGBoost Logged to MLflow:
Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1 Score: 1.0000
🏃 View run XGBoost at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1/runs/6a50002c67134679aeb748860bf4ffed
🧪 View experiment at: https://ap-south-1.experiments.sagemaker.aws/#/experiments/1


### Selecting the Best Model from the MLFlow Experiment and Moving to Model Registry and then to production if I latest model is better than previous version of production model 

In [11]:

import warnings
warnings.filterwarnings('ignore')
import mlflow
from mlflow.tracking import MlflowClient

EXPERIMENT_NAME = "Sales_conversion_AWS"
MODEL_NAME = "BestConversionmodelAWS"
METRIC_TO_OPTIMIZE = "f1_score"  # Can be f1_score, precision, recall etc.

def promote_best_model_to_production():
    mlflow.set_tracking_uri("arn:aws:sagemaker:ap-south-1:436749945793:mlflow-tracking-server/MLflowTrackingServer")
    print("Tracking working ") 

    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is None:
        raise ValueError(f"Experiment '{EXPERIMENT_NAME}' does not exist.")
    
    experiment_id = experiment.experiment_id

    print(f"🔍 Searching for best run sorted by {METRIC_TO_OPTIMIZE}...")
    runs = mlflow.search_runs(
        experiment_ids=[experiment_id],
        order_by=[f"metrics.{METRIC_TO_OPTIMIZE} DESC"],
        max_results=1000
    )

    runs = runs[runs[f"metrics.{METRIC_TO_OPTIMIZE}"].notnull()]
    if runs.empty:
        print("❌ No valid runs found.")
        return

    best_run = runs.iloc[0]
    run_id = best_run.run_id
    best_score = best_run[f"metrics.{METRIC_TO_OPTIMIZE}"]

    artifact_path = None
    client = MlflowClient()
    artifacts = client.list_artifacts(run_id)
    for artifact in artifacts:
        if artifact.path.endswith("_model"):
            artifact_path = artifact.path
            break

    if not artifact_path:
        print("❌ No final model artifact found in the best run.")
        return

    print(f"✅ Best run: {run_id} | {METRIC_TO_OPTIMIZE} = {best_score:.4f}")
    print(f"📦 Artifact path: {artifact_path}")

    model_uri = f"runs:/{run_id}/{artifact_path}"
    print("📥 Registering model...")
    model_version = mlflow.register_model(model_uri=model_uri, name=MODEL_NAME)

    import time
    while True:
        model_info = client.get_model_version(name=MODEL_NAME, version=model_version.version)
        if model_info.status == "READY":
            break
        time.sleep(1)

    # Get current production model version (if exists)
    print("🔎 Checking existing Production model...")
    current_production_versions = client.get_latest_versions(name=MODEL_NAME, stages=["Production"])
    
    if current_production_versions:
        current_version = current_production_versions[0]
        current_metrics = mlflow.get_run(current_version.run_id).data.metrics
        current_score = current_metrics.get(METRIC_TO_OPTIMIZE, None)

        print(f"📊 Current Production version: {current_version.version} | Score: {current_score}")
        print(f"📊 New Candidate version: {model_version.version} | Score: {best_score}")

        if current_score is not None and best_score < current_score:
            print("⚠️ New model is not better than current Production model. Skipping promotion.")
            return
        else:
            print("✅ New model is better. Proceeding with promotion.")
    else:
        print("ℹ️ No model in Production yet. Proceeding with first promotion.")

    # Promote the better model to Production
    print(f"🚀 Transitioning model version {model_version.version} to Production...")
    client.transition_model_version_stage(
        name=MODEL_NAME,
        version=model_version.version,
        stage="Production",
        archive_existing_versions=True
    )

    print(f"🎉 Model '{MODEL_NAME}' version {model_version.version} is now in PRODUCTION!")

# Uncomment to run
promote_best_model_to_production()


Tracking working 
🔍 Searching for best run sorted by f1_score...
✅ Best run: 6a50002c67134679aeb748860bf4ffed | f1_score = 1.0000
📦 Artifact path: XGBoost_final_model
📥 Registering model...


Registered model 'BestConversionmodelAWS' already exists. Creating a new version of this model...
2025/07/19 11:44:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: BestConversionmodelAWS, version 10
Created version '10' of model 'BestConversionmodelAWS'.


🔎 Checking existing Production model...
📊 Current Production version: 9 | Score: 1.0
📊 New Candidate version: 10 | Score: 1.0
✅ New model is better. Proceeding with promotion.
🚀 Transitioning model version 10 to Production...
🎉 Model 'BestConversionmodelAWS' version 10 is now in PRODUCTION!


### Testing the Saved Pipeline and Model from Production Registry on a single row custom input. It outputs the Prediction value and along with Shap Explaination of predicting that result

In [None]:
import mlflow
import pandas as pd
import joblib
import os
import shutil
import sys
import shap
import numpy as np
from mlflow.tracking import MlflowClient


MODEL_NAME = "BestConversionmodelAWS"
ARTIFACT_SUBPATH_PIPELINE = "preprocessing_pipeline/fitted_pipeline.pkl"

def load_production_model_and_pipeline():
    mlflow.set_tracking_uri("arn:aws:sagemaker:ap-south-1:436749945793:mlflow-tracking-server/MLflowTrackingServer")
    print("Tracking working ") 
    client = MlflowClient()
    versions = client.search_model_versions(f"name='{MODEL_NAME}'")
    prod_versions = [v for v in versions if v.current_stage == "Production"]

    if not prod_versions:
        raise ValueError("❌ No model found in 'Production' stage.")

    prod_model = prod_versions[0]
    run_id = prod_model.run_id
    artifact_uri = f"runs:/{run_id}/{ARTIFACT_SUBPATH_PIPELINE}"

    print(f"✅ Loading model from: {prod_model.source}")
    model = mlflow.sklearn.load_model(f"models:/{MODEL_NAME}/Production")

    local_temp_path = mlflow.artifacts.download_artifacts(artifact_uri)
    target_path = os.path.join(os.getcwd(), "downloaded_fitted_pipeline.pkl")
    shutil.copy(local_temp_path, target_path)

    pipeline = joblib.load(target_path)
    print(f"✅ Pipeline loaded from: {target_path}")
    return model, pipeline

def predict_sample(model, pipeline):
    raw_input = pd.DataFrame([{
        "Prospect ID": "7927b2df-8bba-4d29-b9a2-b6e0beafe620",
        "Lead Number": 660737,
        'Lead Origin': 'API',
        'Lead Source': 'Olark Chat',
        'Do Not Email': 'No',
        'Do Not Call': 'No',
        'TotalVisits': 0,
        'Total Time Spent on Website': 0,
        'Page Views Per Visit': 0,
        'Last Activity': 'Page Visited on Website',
        'Country': None,
        'Specialization': 'Select',
        'How did you hear about X Education': 'Select',
        'What is your current occupation': 'Unemployed',
        'What matters most to you in choosing a course': 'Better Career Prospects',
        'Search': 'No',
        'Magazine': 'No',
        'Newspaper Article': 'No',
        'X Education Forums': 'No',
        'Newspaper': 'No',
        'Digital Advertisement': 'No',
        'Through Recommendations': 'No',
        'Receive More Updates About Our Courses': 'No',
        'Tags': 'Interested in other courses',
        'Lead Quality': 'Low in Relevance',
        'Update me on Supply Chain Content': 'No',
        'Get updates on DM Content': 'No',
        'Lead Profile': 'Select',
        'City': 'Select',
        'Asymmetrique Activity Index': '02.Medium',
        'Asymmetrique Profile Index': '02.Medium',
        'Asymmetrique Activity Score': 15,
        'Asymmetrique Profile Score': 15,
        'I agree to pay the amount through cheque': 'No',
        'A free copy of Mastering The Interview': 'No',
        'Last Notable Activity': 'Modified'
    }])

    # Transform input
    processed_input = pipeline.transform(raw_input)

    # Get feature names after preprocessing
    try:
        feature_names = pipeline.get_feature_names_out()
    except:
        # Handle older sklearn versions
        from sklearn.compose import ColumnTransformer
        if isinstance(pipeline, ColumnTransformer):
            feature_names = pipeline.get_feature_names()
        else:
            feature_names = [f"f{i}" for i in range(processed_input.shape[1])]

    # Convert to DataFrame with named columns
    processed_df = pd.DataFrame(processed_input, columns=feature_names)

    # Prediction
    prediction = model.predict(processed_input)
    print(f"🧠 Prediction: {prediction[0]}")

    # Feature Importance from RandomForest
    importances = model.feature_importances_
    importance_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)

    print("\n🎯 Top Features by RandomForest Importance:\n")
    print(importance_df.head(10))

    # SHAP Explanation
    explainer = shap.Explainer(model)
    shap_values = explainer(processed_df)
    feature_names = processed_df.columns
    input_values = processed_df.iloc[0].values
    shap_vals = shap_values.values[0].flatten()
    
    # Zip and create list of tuples: (feature, input_value, shap_value)
    shap_result = list(zip(feature_names, input_values, shap_vals))
    
    # Sort by absolute SHAP value in descending order
    shap_result_sorted = sorted(shap_result, key=lambda x: abs(x[2]), reverse=True)
    
    # Display top 10
    print("\n🔝 Top 10 SHAP Features:")
    for feat, val, shap_val in shap_result_sorted[:10]:
        print(f"{feat}: input={val} ➜ SHAP={shap_val:.4f}")
    #for feat, val, shap_val in zip(feature_names, processed_df.iloc[0], shap_values.values[0]):
        #print(f"{feat}: input={val} ➜ SHAP={shap_val[0]:.4f}")


if __name__ == "__main__":
    model, pipeline = load_production_model_and_pipeline()
    predict_sample(model, pipeline)


In [None]:
import pandas as pd
import mlflow
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, DataQualityPreset, TargetDriftPreset
from sklearn.model_selection import train_test_split


mlflow.set_tracking_uri("arn:aws:sagemaker:ap-south-1:436749945793:mlflow-tracking-server/MLflowTrackingServer")
print("Tracking working ")
# === Set experiment name once ===
mlflow.set_experiment("Lead Data Drift Analysis")

# === Function to run and log drift analysis ===
def analyze_data_drift(final_df):
    # Step 1: Split features/target
    X = final_df.drop(columns=["converted"])
    y = final_df["converted"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert y to DataFrame
    y_train_df = pd.DataFrame({"converted": y_train})
    y_test_df = pd.DataFrame({"converted": y_test})

    # Step 2: Run and log reports
    def log_data_drift_run(run_name, reference_df, current_df, report_file):
        report = Report(metrics=[
         DataQualityPreset(),
            DataDriftPreset(),
            TargetDriftPreset()
        ])
        report.run(reference_data=reference_df, current_data=current_df)
        report.save_html(report_file)
        report_dict = report.as_dict()

        with mlflow.start_run(run_name=run_name):
            for metric in report_dict.get("metrics", []):
                if metric.get("metric") == "DataDriftTable":
                    for feature, result in metric["result"].get("drift_by_columns", {}).items():
                        mlflow.log_metric(f"{feature}_drift_score", result.get("drift_score"))
                        mlflow.log_param(f"{feature}_stat_test", result.get("stat_test_name"))
                        mlflow.log_param(f"{feature}_drift_detected", result.get("drift_detected"))
            mlflow.log_artifact(report_file, artifact_path="evidently_report")
            print(f"✅ Logged {run_name} → Report: {report_file}")

    # Run drift checks
    log_data_drift_run(
        run_name="X_train_vs_X_test",
        reference_df=X_train,
        current_df=X_test,
        report_file="report_X_train_vs_X_test.html"
    )

    log_data_drift_run(
        run_name="y_train_vs_y_test",
        reference_df=y_train_df,
        current_df=y_test_df,
        report_file="report_y_train_vs_y_test.html"
    )

    print("✅ Data drift analysis complete.")

analyze_data_drift(df)

In [None]:
import evidently
print(evidently.__version__)
print(evidently.__file__)
