In [2]:
import pandas as pd
from sqlalchemy import create_engine
import urllib.parse
import os
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from ../.env
# The path is relative to the current working directory. This call has to run
# before `load_data_from_postgres` is defined, because that function reads the
# DB_* variables through os.getenv in its default arguments, which are
# evaluated once at definition time.
load_dotenv(dotenv_path=Path("..") / ".env")

def load_data_from_postgres(
    table_name,
    db_user=os.getenv("DB_USER"),
    db_password=os.getenv("DB_PASSWORD"),
    db_host=os.getenv("DB_HOST"),
    db_port=os.getenv("DB_PORT"),
    db_name=os.getenv("DB_NAME")
):
    """
    Load the dataset from a PostgreSQL table and validate its structure.

    Args:
        table_name (str): Name of the table in PostgreSQL.
        db_user (str): Database user. Defaults to the DB_USER environment
            variable as it was when this function was defined.
        db_password (str): Database password (URL-encoded before use).
            Defaults to DB_PASSWORD.
        db_host (str): Database host. Defaults to DB_HOST.
        db_port (str | int): Database port. Defaults to DB_PORT.
        db_name (str): Database name. Defaults to DB_NAME.

    Returns:
        pandas.DataFrame: Loaded dataset.

    Raises:
        ValueError: If a connection setting is missing, if required columns
            are missing, or if the dataset is empty.
    """
    # Fail early with a readable message instead of a TypeError from
    # quote_plus(None) or a connection URL containing the text "None".
    settings = {
        "db_user": db_user,
        "db_password": db_password,
        "db_host": db_host,
        "db_port": db_port,
        "db_name": db_name,
    }
    missing_settings = [name for name, value in settings.items() if value is None or value == ""]
    if missing_settings:
        raise ValueError(
            f"Missing database connection settings: {', '.join(missing_settings)}"
        )

    # Encode password to be URL-safe
    encoded_password = urllib.parse.quote_plus(db_password)

    # Create connection string
    connection_str = f'postgresql+psycopg2://{db_user}:{encoded_password}@{db_host}:{db_port}/{db_name}'
    engine = create_engine(connection_str)

    # Load data; always release the pooled connections, even if the read fails
    try:
        df = pd.read_sql_table(table_name, con=engine)
    finally:
        engine.dispose()

    # Check if dataset is empty
    if df.empty:
        raise ValueError("Dataset is empty")

    # Required columns for Lead Scoring project
    required_columns = [
        'Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source', 'Do Not Email',
        'Do Not Call', 'Converted', 'TotalVisits', 'Total Time Spent on Website',
        'Page Views Per Visit', 'Last Activity', 'Country', 'Specialization',
        'How did you hear about X Education', 'What is your current occupation',
        'What matters most to you in choosing a course', 'Search', 'Magazine',
        'Newspaper Article', 'X Education Forums', 'Newspaper',
        'Digital Advertisement', 'Through Recommendations',
        'Receive More Updates About Our Courses', 'Tags', 'Lead Quality',
        'Update me on Supply Chain Content', 'Get updates on DM Content',
        'Lead Profile', 'City', 'Asymmetrique Activity Index',
        'Asymmetrique Profile Index', 'Asymmetrique Activity Score',
        'Asymmetrique Profile Score', 'I agree to pay the amount through cheque',
        'A free copy of Mastering The Interview', 'Last Notable Activity'
    ]

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

    print("✅ Dataset loaded successfully from PostgreSQL with shape:", df.shape)
    print(df.head())
    return df


# Cell entry point: load the raw leads table into `df`, which later cells read.
if __name__ == "__main__":

    df = load_data_from_postgres(table_name='lead_scoring_data')
    

✅ Dataset loaded successfully from PostgreSQL with shape: (9240, 37)
                            Prospect ID  Lead Number              Lead Origin  \
0  7927b2df-8bba-4d29-b9a2-b6e0beafe620       660737                      API   
1  2a272436-5132-4136-86fa-dcc88c88f482       660728                      API   
2  8cc8c611-a219-4f35-ad23-fdfd2656bd8a       660727  Landing Page Submission   
3  0cc2df48-7cf4-4e39-9de9-19797f9b38cc       660719  Landing Page Submission   
4  3256f628-e534-4826-9d63-4a8b88782852       660681  Landing Page Submission   

      Lead Source Do Not Email Do Not Call  Converted  TotalVisits  \
0      Olark Chat           No          No          0          0.0   
1  Organic Search           No          No          0          5.0   
2  Direct Traffic           No          No          1          2.0   
3  Direct Traffic           No          No          0          1.0   
4          Google           No          No          1          2.0   

   Total Time Spent on 

In [16]:
# Applying the techniques observed in EDA

def clean_data(df):
    """
    Clean the raw lead-scoring frame using the rules found during EDA.

    Steps, in order: drop identifier / constant / low-value columns, drop
    duplicate rows, fill missing categorical values, ordinal-encode the two
    Asymmetrique index columns, median-fill selected numeric columns,
    strip + lower-case every string cell, and finally collapse the labels of
    several categorical columns into coarser groups.

    Args:
        df (pandas.DataFrame): Raw leads data. It is not modified; every step
            works on a new frame.

    Returns:
        pandas.DataFrame: The cleaned frame. String values that are not
        covered by one of the label maps are left stripped and lower-cased.

    Raises:
        KeyError: If one of the categorical columns this function fills or
            maps is absent from ``df``.
    """
    # Prospect ID and Lead Number add no additional information, so drop them
    df = df.drop(columns=['Prospect ID', 'Lead Number'], errors='ignore')

    # Removing as these contain the constant value "False"
    df = df.drop(
        columns=[
            "Magazine", "Receive More Updates About Our Courses",
            "Update me on Supply Chain Content", "Get updates on DM Content",
            "I agree to pay the amount through cheque"
        ],
        errors='ignore'
    )

    # Report duplicate records (counted after the ID columns are gone), then drop them
    print("\nDuplicate Records:", df.duplicated().sum())
    df = df.drop_duplicates()

    # Drop unnecessary columns (returns a new frame instead of mutating in place)
    drop_cols = [
        'Newspaper', 'Newspaper Article',
        'X Education Forums', 'Search', 'Through Recommendations'
    ]
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])

    # Lead Source: only the null fill is needed here. Stripping, lower-casing
    # and the 'google' -> 'Google' mapping are all done by the global
    # normalisation and lead_source_map further down.
    df['Lead Source'] = df['Lead Source'].fillna('Missing')

    # Replace the placeholder 'Select' (and nulls) with 'Missing'
    select_to_missing_cols = [
        'Specialization', 'Lead Profile', 'City', 'How did you hear about X Education'
    ]
    for col in select_to_missing_cols:
        df[col] = df[col].replace('Select', 'Missing').fillna('Missing')

    # Fill remaining high-null categorical columns
    categorical_fill_values = {
        'Country': 'India',
        'What is your current occupation': 'Unemployed',
        'What matters most to you in choosing a course': 'Better Career Prospects',
        'Tags': 'Missing',
        'Lead Quality': 'Missing',
    }
    for col, fill_value in categorical_fill_values.items():
        df[col] = df[col].fillna(fill_value)

    # Ordinal encoding for Asymmetrique Index columns (done before the
    # lower-casing step, so the keys keep their original capitalisation)
    ordinal_map = {"01.High": 3, "02.Medium": 2, "03.Low": 1}
    df["Asymmetrique Profile Index"] = df["Asymmetrique Profile Index"].map(ordinal_map)
    df["Asymmetrique Activity Index"] = df["Asymmetrique Activity Index"].map(ordinal_map)

    # Fill with median
    for col in [
        'Asymmetrique Profile Index', 'Asymmetrique Activity Index',
        'Asymmetrique Profile Score', 'Asymmetrique Activity Score',
        'TotalVisits', 'Page Views Per Visit'
    ]:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    # Done
    null_counts = df.isnull().sum()
    print("Data cleaned and ready. Remaining nulls:")
    print(null_counts[null_counts > 0])

    # Map labels with similar meaning onto a smaller set of labels.
    # Keys are lower-case because every string cell is lower-cased first.
    # NOTE(review): 'missing' has no entry in lead_source_map, so missing
    # lead sources end up as lower-case 'missing' while the other mapped
    # columns use 'Missing' -- confirm whether that is intended.
    lead_source_map = {
        'google': 'Google',
        'google ads': 'Google',
        'organic search': 'Organic Search',
        'olark chat': 'Olark Chat',
        'direct traffic': 'Direct Traffic',
        'reference': 'Reference',
        'welingak website': 'Welingak',
        'facebook': 'Social Media',
        'bing': 'Other',
        'click2call': 'Other',
        'press_release': 'Other',
        'social media': 'Social Media',
        'live chat': 'Olark Chat',
        'youtubechannel': 'Other',
        'testone': 'Other',
        'pay per click ads': 'Other',
        'welearnblog_home': 'Other',
        'welearn': 'Other',
        'blog': 'Other',
        'nc_edm': 'Other'
    }

    specialization_map = {
        'finance management': 'Finance',
        'banking, investment and insurance': 'Finance',
        'human resource management': 'HR',
        'marketing management': 'Marketing',
        'operations management': 'Operations',
        'it projects management': 'IT',
        'business administration': 'Business',
        'supply chain management': 'Operations',
        'e-commerce': 'Business',
        'retail management': 'Marketing',
        'media and advertising': 'Marketing',
        'travel and tourism': 'Other',
        'international business': 'Business',
        'healthcare management': 'Other',
        'hospitality management': 'Other',
        'rural and agribusiness': 'Other',
        'e-business': 'Business',
        'services excellence': 'Other',
        'missing': 'Missing',
        'select': 'Missing'
    }

    tags_map = {
        'will revert after reading the email': 'Reverting',
        'interested in other courses': 'Interested Other',
        'interested  in full time mba': 'Interested Other',
        'graduation in progress': 'Interested Other',
        'not doing further education': 'Not Interested',
        'wrong number given': 'Not Reachable',
        'opp hangup': 'Not Reachable',
        'number not provided': 'Not Reachable',
        'invalid number': 'Not Reachable',
        'still thinking': 'Still Thinking',
        'shall take in the next coming month': 'Still Thinking',
        'want to take admission but has financial problems': 'Still Thinking',
        'lost to eins': 'Lost',
        'lost to others': 'Lost',
        'in touch with eins': 'Lost',
        'diploma holder (not eligible)': 'Not Eligible',
        'university not recognized': 'Not Eligible',
        'recognition issue (dec approval)': 'Not Eligible',
        'already a student': 'Already Student',
        'switched off': 'Not Reachable',
        'busy': 'Not Reachable',
        'ringing': 'Not Reachable',
        'missing': 'Missing',
        '': 'Missing',
    }

    lead_quality_map = {
        'high in relevance': 'High',
        'might be': 'Medium',
        'not sure': 'Medium',
        'low in relevance': 'Low',
        'worst': 'Low',
        'missing': 'Missing'
    }

    lead_profile_map = {
        'potential lead': 'Potential',
        'other leads': 'Other',
        'student of someschool': 'Student',
        'lateral student': 'Student',
        'dual specialization student': 'Student',
        'select': 'Missing',
        'missing': 'Missing'
    }

    heard_map = {
        'online search': 'Online',
        'word of mouth': 'Referral',
        'student of someschool': 'Referral',
        'multiple sources': 'Multiple',
        'advertisements': 'Ads',
        'social media': 'Social',
        'email': 'Direct',
        'sms': 'Direct',
        'other': 'Other',
        'select': 'Missing',
        'missing': 'Missing'
    }

    # Clean cell values only (not column names).
    # DataFrame.applymap is deprecated in favour of DataFrame.map in recent
    # pandas; pick whichever this pandas version provides.
    def _normalise(value):
        return value.strip().lower() if isinstance(value, str) else value

    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    df = elementwise(_normalise)

    # Apply mappings with correct column names
    label_maps = {
        'Lead Source': lead_source_map,
        'Specialization': specialization_map,
        'Tags': tags_map,
        'Lead Quality': lead_quality_map,
        'Lead Profile': lead_profile_map,
        'How did you hear about X Education': heard_map,
    }
    for col, mapping in label_maps.items():
        df[col] = df[col].replace(mapping)

    print("✅ Data cleaning completed")
    print("Shape of the data after cleaning ",df.shape)
    return df


# Clean the raw frame; `df` must already exist in the kernel (it is assigned by
# the data-loading cell above).
if __name__=="__main__":
    df_cleaned = clean_data(df)



Duplicate Records: 1281
Data cleaned and ready. Remaining nulls:
Last Activity    103
dtype: int64
✅ Data cleaning completed
Shape of the data after cleaning  (7959, 25)


In [19]:
import pandas as pd
import numpy as np

def feature_engineering(df):
    """
    Perform feature engineering on the cleaned DataFrame.

    Each feature is added only when the columns it is derived from are
    present, so the function also accepts partial frames.

    Args:
        df (pandas.DataFrame): Cleaned leads data. It is not modified; the
            new columns are added to a copy.

    Returns:
        pandas.DataFrame: Copy of ``df`` with up to five extra columns:
        'Engagement Score', 'Combined Asymmetrique Score', 'Is New Tag',
        'High Interaction' and 'Potential Lead'.
    """
    # Work on a copy so re-running the cell does not keep altering the input
    df = df.copy()

    # ----------- Feature 1: Engagement Score -----------
    # Proxy for user engagement based on time and activity on site
    if {'Total Time Spent on Website', 'Page Views Per Visit', 'TotalVisits'}.issubset(df.columns):
        df['Engagement Score'] = (
            df['Total Time Spent on Website'] * 0.4 +
            df['Page Views Per Visit'] * 0.3 +
            df['TotalVisits'] * 0.3
        )

    # ----------- Feature 2: Combined Asymmetrique Score -----------
    if {'Asymmetrique Activity Score', 'Asymmetrique Profile Score'}.issubset(df.columns):
        df['Combined Asymmetrique Score'] = (
            df['Asymmetrique Activity Score'] + df['Asymmetrique Profile Score']
        )
        # The two source columns are kept on purpose: preprocess_data lists
        # them in its numeric_cols. (An earlier `df.drop(...)` here discarded
        # its result and therefore never removed anything.)

    # ----------- Feature 3: Is New Tag -----------
    # 1 when the tag text contains 'student' (case-insensitive), else 0
    if 'Tags' in df.columns:
        df['Is New Tag'] = df['Tags'].apply(lambda x: 1 if 'student' in str(x).lower() else 0)

    # ----------- Feature 4: Interaction Level based on Activity -----------
    if 'Last Activity' in df.columns:
        # Compare case-insensitively: clean_data lower-cases every string
        # cell, so matching against 'SMS Sent' etc. never succeeded.
        high_activity = {'sms sent', 'email opened', 'email link clicked'}
        df['High Interaction'] = df['Last Activity'].apply(
            lambda x: 1 if isinstance(x, str) and x.strip().lower() in high_activity else 0
        )

    # ----------- Feature 5: Potential Lead -----------
    # 1 when the lead profile text contains 'potential' (case-insensitive), else 0
    if 'Lead Profile' in df.columns:
        df['Potential Lead'] = df['Lead Profile'].apply(lambda x: 1 if 'potential' in str(x).lower() else 0)

    print("✅ Feature engineering complete. Shape after: ", df.shape)
    return df


if __name__ == "__main__":

    df_featured = feature_engineering(df_cleaned)
    # Persist the engineered frame. The raw `df` used to be written here,
    # so the CSV contained none of the cleaning or the new features.
    df_featured.to_csv("featured_data.csv", index=False)


✅ Feature engineering complete. Shape after:  (7959, 30)


In [20]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats.mstats import winsorize

# Function to apply Winsorization
def winsorize_columns(df, columns, limits=(0.01, 0.01)):
    """
    Winsorize the given columns of ``df`` in place and return ``df``.

    Args:
        df (pandas.DataFrame): Frame whose columns are overwritten.
        columns (iterable of str): Names of the columns to winsorize.
        limits (tuple): Lower / upper fractions passed straight to
            ``scipy.stats.mstats.winsorize``.

    Returns:
        pandas.DataFrame: The same ``df`` object that was passed in.

    A column that cannot be processed (for example, it does not exist) is
    reported on stdout and left as it is; processing continues with the rest.
    """
    for name in columns:
        try:
            values = df[name]
            df[name] = winsorize(values, limits=limits)
        except Exception as err:
            # Best effort: report the problem and move on to the next column
            print(f"[!] Could not winsorize column '{name}': {err}")
    return df

# Main preprocessing pipeline function
def _make_dense_onehot_encoder():
    """Build a dense-output OneHotEncoder on both old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 (the old `sparse` keyword is removed in 1.4)
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only knows the `sparse` keyword
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def preprocess_data(df, save_dir="pkl_joblib_files"):
    """
    Split the data, winsorize numeric columns, and fit the preprocessing pipeline.

    The frame is split 80/20 (stratified on the target, random_state=42).
    A ColumnTransformer is fitted on the training part only and then applied
    to both parts. The fitted transformer is saved to
    ``<save_dir>/preprocessor.pkl``.

    Args:
        df (pandas.DataFrame): Cleaned frame containing the 'Converted'
            target and every column named in the three feature lists below.
        save_dir (str): Directory for the saved preprocessor; created if
            it does not exist.

    Returns:
        tuple: ``(X_train_df, X_test_df, y_train, y_test)`` where the two
        frames hold the transformed features and keep the original row index.
    """
    os.makedirs(save_dir, exist_ok=True)

    # ----------------------
    # 1. Define features
    # ----------------------
    # Only the columns listed in these three groups are handed to the
    # ColumnTransformer below; it is built with the default remainder.
    target_col = 'Converted'

    numeric_cols = [
        'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit',
        'Asymmetrique Activity Score', 'Asymmetrique Profile Score'
    ]

    ordinal_cols = [
        'Asymmetrique Activity Index', 'Asymmetrique Profile Index', 'Lead Quality'
    ]
    ordinal_map = [
        [1, 2, 3],                        # Activity Index (Low=1, Medium=2, High=3)
        [1, 2, 3],                        # Profile Index
        ['Low', 'Medium', 'High', 'Missing']  # Lead Quality
    ]

    categorical_cols = [
        'Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Last Activity',
        'Country', 'Specialization', 'How did you hear about X Education',
        'What is your current occupation', 'What matters most to you in choosing a course',
        'Tags', 'Lead Profile', 'City',
        'A free copy of Mastering The Interview', 'Last Notable Activity'
    ]

    # ----------------------
    # 2. Train-Test Split
    # ----------------------
    X = df.drop(columns=[target_col])
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # ----------------------
    # 3. Winsorize outliers
    # ----------------------
    # NOTE(review): each split is winsorized with its own limits and this step
    # is not part of the saved preprocessor -- confirm this is acceptable for
    # inference-time data.
    X_train = winsorize_columns(X_train.copy(), numeric_cols)
    X_test = winsorize_columns(X_test.copy(), numeric_cols)

    # ----------------------
    # 4. Build Transformers
    # ----------------------

    # Numeric Pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('yeojohnson', PowerTransformer(method='yeo-johnson')),
        ('scaler', MinMaxScaler())
    ])

    # Ordinal Pipeline
    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(categories=ordinal_map))
    ])

    # Categorical Pipeline (dense one-hot output; unseen categories are ignored)
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', _make_dense_onehot_encoder())
    ])

    # Combine all
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('ord', ordinal_transformer, ordinal_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    # ----------------------
    # 5. Fit + Transform
    # ----------------------
    # Fit on the training split only, then reuse the fitted transformer on test
    X_train_trans = preprocessor.fit_transform(X_train)
    X_test_trans = preprocessor.transform(X_test)

    # ----------------------
    # 6. Save Preprocessor
    # ----------------------
    joblib.dump(preprocessor, os.path.join(save_dir, "preprocessor.pkl"))

    # ----------------------
    # 7. Get Feature Names
    # ----------------------
    # Output column order follows the transformer order: num, ord, then cat
    cat_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
    feature_names = numeric_cols + ordinal_cols + list(cat_feature_names)

    X_train_df = pd.DataFrame(X_train_trans, columns=feature_names, index=X_train.index)
    X_test_df = pd.DataFrame(X_test_trans, columns=feature_names, index=X_test.index)

    print("✅ Preprocessing complete. Artifacts saved to:", save_dir)
    print("🧾 X_train shape:", X_train_df.shape)
    print("🧾 X_test shape :", X_test_df.shape)
    print("🎯 y_train dist:\n", y_train.value_counts(normalize=True))

    return X_train_df, X_test_df, y_train, y_test


# To test directly
# Relies on `df_featured` assigned by the feature-engineering cell; the four
# names bound here are the ones passed to the training functions in later cells.
if __name__ == "__main__":
    
    X_train, X_test, y_train, y_test = preprocess_data(df_featured)


✅ Preprocessing complete. Artifacts saved to: pkl_joblib_files
🧾 X_train shape: (6367, 145)
🧾 X_test shape : (1592, 145)
🎯 y_train dist:
 Converted
0    0.600126
1    0.399874
Name: proportion, dtype: float64


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


Selecting the best model based on cross-validated F1 score (accuracy is only reported on the test set)

In [None]:
import mlflow
import mlflow.sklearn
import os
import joblib
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

def train_evaluate_and_select_model(X_train, y_train, X_test, y_test, save_dir="pkl_joblib_files", model_name="LeadConversionModel",
                                    tracking_uri="http://localhost:5000", experiment_name="lead_conversion_experiment"):
    """
    Tune four classifiers, keep the one with the best cross-validated F1, and register it.

    Each candidate is grid-searched with 5-fold CV (scoring='f1') inside its
    own MLflow run. The winner is evaluated once on the test set, saved to
    ``<save_dir>/model.pkl``, registered in the MLflow Model Registry under
    ``model_name`` and moved to Staging and then Production.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels, used only for the final report.
        save_dir (str): Directory for the saved model; created if missing.
        model_name (str): Registered model name in MLflow.
        tracking_uri (str): MLflow tracking server URI.
        experiment_name (str): MLflow experiment that receives the runs.

    Returns:
        tuple: ``(best_model, y_pred_test)`` -- the fitted best estimator and
        its predictions for ``X_test``.

    Raises:
        RuntimeError: If no candidate produced a usable cross-validation score.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Set MLflow tracking URI and experiment
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    models = {
        'logistic_regression': {
            'model': LogisticRegression(max_iter=1000),
            'params': {
                'C': [0.1, 1.0, 10.0],
                'penalty': ['l2'],
                'solver': ['lbfgs']
            }
        },
        'random_forest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {
                'n_estimators': [100],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5]
            }
        },
        'xgboost': {
            # `use_label_encoder` was dropped: current XGBoost reports it as unused
            'model': XGBClassifier(random_state=42, eval_metric='logloss'),
            'params': {
                'n_estimators': [100],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5]
            }
        },
        'lightgbm': {
            'model': LGBMClassifier(random_state=42),
            'params': {
                'n_estimators': [100],
                'learning_rate': [0.05, 0.1],
                'max_depth': [-1, 5]
            }
        }
    }

    best_model = None
    # Start below any reachable score so a candidate scoring exactly 0 can still win
    best_score = float("-inf")
    best_name = None
    best_run_id = None

    for name, config in models.items():
        with mlflow.start_run(run_name=f"{name}_run") as run:
            print(f"Training {name}...")

            grid = GridSearchCV(config['model'], config['params'],
                                cv=5, scoring='f1',
                                n_jobs=-1, verbose=0)
            grid.fit(X_train, y_train)

            f1_cv = grid.best_score_
            mlflow.log_params(grid.best_params_)
            mlflow.log_metric("f1_cv", f1_cv)

            mlflow.sklearn.log_model(grid.best_estimator_, artifact_path="model")

            # Selection uses the CV score only; the test set stays untouched here
            if f1_cv > best_score:
                best_score = f1_cv
                best_model = grid.best_estimator_
                best_name = name
                best_run_id = run.info.run_id

    if best_model is None:
        # Happens only when every CV score is NaN; fail with a clear message
        raise RuntimeError("No candidate model produced a valid cross-validated F1 score.")

    # Final evaluation on test data
    y_pred_test = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)

    print(f"\nBest Model: {best_name}")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

    # Save model locally
    model_path = os.path.join(save_dir, "model.pkl")
    joblib.dump(best_model, model_path)
    print(f"\nModel saved to: {model_path}")

    # Register model in MLflow
    model_uri = f"runs:/{best_run_id}/model"
    result = mlflow.register_model(model_uri=model_uri, name=model_name)

    client = mlflow.tracking.MlflowClient()

    # Transition to Staging
    client.transition_model_version_stage(
        name=model_name,
        version=result.version,
        stage="Staging",
        archive_existing_versions=True
    )
    print(f"Model version {result.version} moved to Staging")

    # Transition to Production
    client.transition_model_version_stage(
        name=model_name,
        version=result.version,
        stage="Production",
        archive_existing_versions=True
    )
    print(f"Model version {result.version} moved to Production")

    return best_model, y_pred_test

# Uses the X_train / X_test / y_train / y_test names bound by the preprocessing cell.
if __name__=="__main__":
    best_model, y_pred_test = train_evaluate_and_select_model(X_train, y_train, X_test, y_test)


Training logistic_regression...


2025/07/17 14:23:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run logistic_regression_run at: http://localhost:5000/#/experiments/199496956156630131/runs/feb0b59162c64f2aba1c95870830c09b.
2025/07/17 14:23:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/199496956156630131.


Training random_forest...


2025/07/17 14:23:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_run at: http://localhost:5000/#/experiments/199496956156630131/runs/5775b36cf13d4e1fb22821a5f9749b24.
2025/07/17 14:23:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/199496956156630131.


Training xgboost...


Parameters: { "use_label_encoder" } are not used.

2025/07/17 14:23:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost_run at: http://localhost:5000/#/experiments/199496956156630131/runs/844f22aa58df445da7f171d61c7ccbcb.
2025/07/17 14:23:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/199496956156630131.


Training lightgbm...
[LightGBM] [Info] Number of positive: 2546, number of negative: 3821
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 6367, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.399874 -> initscore=-0.405989
[LightGBM] [Info] Start training from score -0.405989


2025/07/17 14:23:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm_run at: http://localhost:5000/#/experiments/199496956156630131/runs/b9f3bc46114f4b3fa20206b5946f0515.
2025/07/17 14:23:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/199496956156630131.
Registered model 'LeadConversionModel' already exists. Creating a new version of this model...
2025/07/17 14:23:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LeadConversionModel, version 3
Created version '3' of model 'LeadConversionModel'.
``mlflow.tracking.client.MlflowClient.transition_model_version_stage`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/latest/model-registry.html#migrating-from-stages



Best Model: xgboost
Test Accuracy: 0.9271
Test F1 Score: 0.9075

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94       955
           1       0.92      0.89      0.91       637

    accuracy                           0.93      1592
   macro avg       0.93      0.92      0.92      1592
weighted avg       0.93      0.93      0.93      1592


Model saved to: pkl_joblib_files\model.pkl
Model version 3 moved to Staging


``mlflow.tracking.client.MlflowClient.transition_model_version_stage`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/latest/model-registry.html#migrating-from-stages


Model version 3 moved to Production


Selecting best model based on Precision

In [None]:
import os
import joblib
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    precision_score,
    classification_report,
    accuracy_score,
    f1_score
)

def train_and_register_best_classifier(X_train, y_train, X_test, y_test, save_dir="pkl_joblib_files", model_name="LeadConversionClassifier",
                                       tracking_uri="http://localhost:5000", experiment_name="Lead_Conversion_Classification"):
    """
    Tune four classifiers for precision, keep the best one, and register it.

    Each candidate is grid-searched with 5-fold CV (scoring='precision')
    inside its own MLflow run. The model with the highest cross-validated
    precision is selected, evaluated on the test set, saved to
    ``<save_dir>/best_model.pkl``, registered under ``model_name`` and moved
    to Staging and then Production.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels. Test precision is
            logged per candidate but is not used to pick the winner.
        save_dir (str): Directory for the saved model; created if missing.
        model_name (str): Registered model name in MLflow.
        tracking_uri (str): MLflow tracking server URI.
        experiment_name (str): MLflow experiment that receives the runs.

    Returns:
        tuple: ``(best_model, y_final_pred)`` -- the fitted best estimator
        and its predictions for ``X_test``.

    Raises:
        RuntimeError: If no candidate produced a usable cross-validation score.
    """
    os.makedirs(save_dir, exist_ok=True)

    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    models = {
        'logistic_regression': {
            'model': LogisticRegression(max_iter=1000),
            'params': {
                'C': [0.01, 0.1, 1, 10],
                'penalty': ['l2'],
                'solver': ['lbfgs']
            }
        },
        'random_forest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5]
            }
        },
        'xgboost': {
            # `use_label_encoder` was dropped: current XGBoost reports it as unused
            'model': XGBClassifier(eval_metric='logloss', random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.01, 0.05, 0.1]
            }
        },
        'lightgbm': {
            'model': LGBMClassifier(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [-1, 5, 10],
                'learning_rate': [0.01, 0.05, 0.1]
            }
        }
    }

    best_model = None
    # Start below any reachable score so a candidate scoring exactly 0 can still win
    best_precision = float("-inf")
    best_name = None
    best_run_id = None

    for name, config in models.items():
        with mlflow.start_run(run_name=f"{name}_run") as run:
            print(f"🔍 Tuning {name}...")

            grid = GridSearchCV(
                config['model'], config['params'],
                cv=5, scoring='precision', n_jobs=-1
            )
            grid.fit(X_train, y_train)

            cv_precision = grid.best_score_

            # Test-set precision is still logged for reference per candidate
            y_pred = grid.best_estimator_.predict(X_test)
            class1_precision = precision_score(y_test, y_pred, pos_label=1)

            mlflow.log_params(grid.best_params_)
            mlflow.log_metric("precision_cv", cv_precision)
            mlflow.log_metric("test_precision_class1", class1_precision)
            mlflow.sklearn.log_model(grid.best_estimator_, artifact_path="model")

            # Pick the winner on the cross-validated score. Choosing on the
            # test-set score would make the final test metrics below
            # optimistically biased.
            if cv_precision > best_precision:
                best_model = grid.best_estimator_
                best_precision = cv_precision
                best_name = name
                best_run_id = run.info.run_id

    if best_model is None:
        # Happens only when every CV score is NaN; fail with a clear message
        raise RuntimeError("No candidate model produced a valid cross-validated precision score.")

    # Final evaluation
    y_final_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_final_pred)
    f1 = f1_score(y_test, y_final_pred)
    precision = precision_score(y_test, y_final_pred, pos_label=1)

    print(f"✅ Best Model: {best_name}")
    print(f"🔢 Accuracy: {acc:.4f}")
    print(f"🎯 Precision (class 1): {precision:.4f}")
    print(f"📊 F1 Score: {f1:.4f}")
    print("\n📝 Classification Report:\n", classification_report(y_test, y_final_pred))

    # Save model locally
    model_path = os.path.join(save_dir, "best_model.pkl")
    joblib.dump(best_model, model_path)
    print(f"📁 Model saved to: {model_path}")

    # Register best model to MLflow Model Registry
    model_uri = f"runs:/{best_run_id}/model"
    result = mlflow.register_model(model_uri=model_uri, name=model_name)

    client = mlflow.tracking.MlflowClient()

    # Transition to Staging and then Production
    client.transition_model_version_stage(
        name=model_name, version=result.version, stage="Staging", archive_existing_versions=True
    )
    print(f"🚀 Model version {result.version} moved to Staging")

    client.transition_model_version_stage(
        name=model_name, version=result.version, stage="Production", archive_existing_versions=True
    )
    print(f"🏁 Model version {result.version} moved to Production")

    return best_model, y_final_pred


# Example usage (you need to define `X_train`, `y_train`, `X_test`, `y_test` beforehand)
# NOTE: this rebinds `best_model` and `y_pred_test`, replacing the values
# assigned by the earlier call to train_evaluate_and_select_model.
if __name__ == "__main__":
    best_model, y_pred_test = train_and_register_best_classifier(X_train, y_train, X_test, y_test)


🔍 Tuning logistic_regression...


2025/07/17 14:24:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run logistic_regression_run at: http://localhost:5000/#/experiments/410342575703778846/runs/5ea10dd6481942cabe039541e91ce2fe.
2025/07/17 14:24:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/410342575703778846.


🔍 Tuning random_forest...


2025/07/17 14:25:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_run at: http://localhost:5000/#/experiments/410342575703778846/runs/33c09cc911e94c5eaf63ca1a82295599.
2025/07/17 14:25:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/410342575703778846.


🔍 Tuning xgboost...


Parameters: { "use_label_encoder" } are not used.

2025/07/17 14:25:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost_run at: http://localhost:5000/#/experiments/410342575703778846/runs/aa77914f340b403d981f012ffcd700db.
2025/07/17 14:25:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/410342575703778846.


🔍 Tuning lightgbm...
[LightGBM] [Info] Number of positive: 2546, number of negative: 3821
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 6367, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.399874 -> initscore=-0.405989
[LightGBM] [Info] Start training from score -0.405989


2025/07/17 14:25:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm_run at: http://localhost:5000/#/experiments/410342575703778846/runs/30447a9e24a94ebbaf9b4838801c9c22.
2025/07/17 14:25:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/410342575703778846.
Registered model 'LeadConversionClassifier' already exists. Creating a new version of this model...
2025/07/17 14:26:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LeadConversionClassifier, version 2


✅ Best Model: xgboost
🔢 Accuracy: 0.8763
🎯 Precision (class 1): 0.9564
📊 F1 Score: 0.8239

📝 Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.98      0.90       955
           1       0.96      0.72      0.82       637

    accuracy                           0.88      1592
   macro avg       0.90      0.85      0.86      1592
weighted avg       0.89      0.88      0.87      1592

📁 Model saved to: pkl_joblib_files\best_model.pkl


Created version '2' of model 'LeadConversionClassifier'.
``mlflow.tracking.client.MlflowClient.transition_model_version_stage`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/latest/model-registry.html#migrating-from-stages
``mlflow.tracking.client.MlflowClient.transition_model_version_stage`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/latest/model-registry.html#migrating-from-stages


🚀 Model version 2 moved to Staging
🏁 Model version 2 moved to Production


In [None]:
import pandas as pd
import os
import mlflow
from datetime import datetime
from evidently import Report
from evidently.presets import DataDriftPreset
import json
from pathlib import Path
from mlflow.exceptions import MlflowException
from mlflow.tracking import MlflowClient

def split_data(df, target_column="Converted"):
    """
    Partition a DataFrame into train / validation / test features and targets.

    The target column is separated from the features, 40% of the rows are
    held out, and that hold-out is then halved into validation and test
    parts. Both splits use ``random_state=42``.

    Args:
        df (pandas.DataFrame): Dataset containing the features and the target.
        target_column (str): Name of the target column.

    Returns:
        tuple: X_train, X_val, X_test, y_train, y_val, y_test

    Raises:
        ValueError: If ``target_column`` is not a column of ``df``.
    """
    target_present = target_column in df.columns
    if not target_present:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame.")

    # Separate the predictors from the label
    features = df.drop(columns=[target_column])
    labels = df[target_column]

    # Stage 1: carve off the 40% hold-out that will feed validation + test
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=0.4, random_state=42
    )

    # Stage 2: cut the hold-out in half -> validation and test
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


def _drift_metric_name(metric_id):
    """Return the MLflow metric name for a ``ValueDrift(column=...)`` metric id.

    Returns None when ``metric_id`` is not a per-column value-drift id.
    """
    prefix = "ValueDrift(column="
    if not metric_id.startswith(prefix):
        return None

    # Remove only the prefix and the single closing parenthesis, so column
    # names that themselves contain "=" or end with ")" are kept intact.
    column = metric_id[len(prefix):]
    if column.endswith(")"):
        column = column[:-1]

    # MLflow validates metric names (alphanumerics, "_", "-", ".", " " and "/");
    # replace anything else so one unusual column name cannot abort the run.
    safe_column = "".join(
        ch if (ch.isalnum() or ch in "_-. /") else "_" for ch in column
    )
    return f"drift_{safe_column}"


def log_evidently_report(reference_data, current_data, dataset_name="train_vs_test"):
    """
    Run an Evidently drift + summary report and log its results to MLflow.

    The report is computed on the columns shared by both frames, saved as
    HTML and JSON under ``<cwd>/evidently_reports``, logged as MLflow
    artifacts, and selected values from the JSON are logged as MLflow metrics
    (drifted-column count/share, row/column counts, per-column drift values).

    Must be called inside an active MLflow run, since it logs artifacts and
    metrics through the module-level ``mlflow`` API.

    Args:
        reference_data (pandas.DataFrame): Baseline dataset.
        current_data (pandas.DataFrame): Dataset compared against the baseline.
        dataset_name (str): Label used in the output file names and messages.

    Returns:
        None. If the two frames share no columns, a warning is printed and
        nothing is computed or logged.
    """
    # Align columns: use only the intersection to avoid partial-column errors
    common_cols = sorted(set(reference_data.columns).intersection(current_data.columns))
    if not common_cols:
        print(f"⚠️ No common columns between reference and {dataset_name}; skipping Evidently report.")
        return
    ref = reference_data[common_cols]
    cur = current_data[common_cols]

    # This cell's import block only brings in DataDriftPreset; import the
    # summary preset here so the function also works on a fresh kernel.
    from evidently.presets import DataSummaryPreset

    # Run the Evidently report (drift + summary)
    report = Report(metrics=[DataDriftPreset(), DataSummaryPreset()])
    result = report.run(reference_data=ref, current_data=cur)

    # Ensure local save directory exists
    save_dir = Path.cwd() / "evidently_reports"
    save_dir.mkdir(parents=True, exist_ok=True)

    # Save HTML and JSON; the JSON text is kept so it can be parsed below
    # without reading the file back from disk.
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    html_path = save_dir / f"evidently_{dataset_name}_{ts}.html"
    json_path = save_dir / f"evidently_{dataset_name}_{ts}.json"
    result.save_html(str(html_path))
    report_json_text = result.json()
    with open(json_path, "w", encoding="utf-8") as fp:
        fp.write(report_json_text)

    # Log artifacts to MLflow
    mlflow.log_artifact(str(html_path), artifact_path="evidently")
    mlflow.log_artifact(str(json_path), artifact_path="evidently")
    print(f"📄 Logged HTML: {html_path.name}")
    print(f"🗄️  Logged JSON: {json_path.name}")

    # Extract the metrics list from the report JSON
    metrics_list = json.loads(report_json_text).get("metrics", [])

    # Overall drifted columns metrics
    drift_entry = next(
        (m for m in metrics_list if m.get("metric_id", "").startswith("DriftedColumnsCount")),
        None,
    )
    if drift_entry:
        count = drift_entry["value"]["count"]
        share = drift_entry["value"]["share"]
        mlflow.log_metric("drifted_columns_count", float(count))
        mlflow.log_metric("drifted_columns_share", float(share))
        print(f"🔢 drifted_columns_count = {count}")
        print(f"🔢 drifted_columns_share = {share}")
    else:
        print("⚠️ No DriftedColumnsCount entry found.")

    # Row and column counts
    rowcount = next((m["value"] for m in metrics_list if m.get("metric_id") == "RowCount()"), None)
    colcount = next((m["value"] for m in metrics_list if m.get("metric_id") == "ColumnCount()"), None)
    if rowcount is not None:
        mlflow.log_metric("dataset_row_count", float(rowcount))
        print(f"🔢 dataset_row_count = {rowcount}")
    if colcount is not None:
        mlflow.log_metric("dataset_column_count", float(colcount))
        print(f"🔢 dataset_column_count = {colcount}")

    # Per-feature value drift metrics (only scalar values are logged)
    for m in metrics_list:
        metric_name = _drift_metric_name(m.get("metric_id", ""))
        if metric_name is None:
            continue
        val = m.get("value")
        if isinstance(val, (int, float)):
            mlflow.log_metric(metric_name, float(val))
            print(f"🔢 {metric_name} = {val}")

    print("✅ All requested drift & dataset metrics logged to MLflow.")


EXPERIMENT_NAME = "Lead Conversion Prediction Evidently"

def main():
    """
    Run the train-vs-test drift check inside a dedicated MLflow experiment.

    Makes sure the experiment named by ``EXPERIMENT_NAME`` exists and is
    active (creating or restoring it if needed), then opens an MLflow run,
    loads and splits the data, drops all-NaN columns from the train and test
    features, and logs an Evidently report comparing the two.

    ``load_data`` is not defined in this cell; it must already be available
    in the notebook namespace.
    """
    tracking_client = MlflowClient()

    # Step 1: make sure the experiment exists and is not soft-deleted
    experiment = tracking_client.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None and experiment.lifecycle_stage == "deleted":
        tracking_client.restore_experiment(experiment.experiment_id)
        print(f"🔄 Restored deleted experiment '{EXPERIMENT_NAME}' (ID={experiment.experiment_id})")
    elif experiment is not None:
        print(f"ℹ️ Using existing experiment '{EXPERIMENT_NAME}' (ID={experiment.experiment_id})")
    else:
        created_id = tracking_client.create_experiment(EXPERIMENT_NAME)
        print(f"✅ Created new experiment '{EXPERIMENT_NAME}' (ID={created_id})")

    mlflow.set_experiment(EXPERIMENT_NAME)

    # Step 2: everything below is recorded under a single MLflow run
    with mlflow.start_run(run_name="Lead Conversion Prediction Drift Detection"):
        # Only the train and test feature frames are needed for the report
        raw_frame = load_data()
        train_features, _, test_features, _, _, _ = split_data(raw_frame)

        # Work on copies and discard columns that are entirely NaN
        reference_frame = train_features.copy().dropna(axis=1, how='all')
        current_frame = test_features.copy().dropna(axis=1, how='all')

        # Compare train (reference) against test (current)
        log_evidently_report(reference_frame, current_frame, dataset_name="train_vs_test")

# Execute the drift-detection workflow defined above when this cell runs.
main()


ℹ️ Using existing experiment 'Lead Conversion Prediction Evidently' (ID=321335971120077205)
📄 Logged HTML: evidently_train_vs_test_2025-07-17_16-13-25.html
🗄️  Logged JSON: evidently_train_vs_test_2025-07-17_16-13-25.json
🔢 drifted_columns_count = 0.0
🔢 drifted_columns_share = 0.0
🔢 dataset_row_count = 1848.0
🔢 dataset_column_count = 36.0
🔢 drift_Asymmetrique Activity Score = 0.08154543204692331
🔢 drift_Asymmetrique Profile Score = 0.03366386451690208
🔢 drift_Lead Number = 0.03991236408159937
🔢 drift_Page Views Per Visit = 0.04395116701824102
🔢 drift_Total Time Spent on Website = 0.009859456860784861
🔢 drift_TotalVisits = 0.036930827545720725
🔢 drift_A free copy of Mastering The Interview = 0.008688713708195937
🔢 drift_Asymmetrique Activity Index = 0.03248798683593431
🔢 drift_Asymmetrique Profile Index = 0.012893295537662212
🔢 drift_City = 0.026200523954586385
🔢 drift_Country = 0.07000374873909854
🔢 drift_Digital Advertisement = 0.0030139568605141265
🔢 drift_Do Not Call = 0.00790679082

2025/07/17 16:13:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run Preprocessing and Tuning at: http://localhost:5000/#/experiments/321335971120077205/runs/2eb672ca7d1e4f0aa1a14b2c80e7594f.
2025/07/17 16:13:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/321335971120077205.
