### Import Libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import seaborn as sns

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


### Set Random Seed

In [9]:
# Set random seed for reproducibility
np.random.seed(42)

print("✓ Random seed set to 42")

✓ Random seed set to 42


### Create Synthetic Dataset

In [10]:
print("=" * 60)
print("CREATING SYNTHETIC IMBALANCED CLASSIFICATION DATASET")
print("=" * 60)

# Create imbalanced dataset using sklearn's make_classification
X, y = make_classification(
    n_samples=20000,           # Total number of samples
    n_features=5,              # Number of features
    n_informative=4,           # Number of informative features
    n_redundant=1,             # Number of redundant features
    n_classes=2,               # Binary classification
    n_clusters_per_class=2,    # Number of clusters per class
    weights=[0.85, 0.15],      # Class distribution (85% class 0, 15% class 1)
    flip_y=0.01,               # Fraction of samples with flipped labels (noise)
    class_sep=0.8,             # Factor multiplying the hypercube size
    random_state=42
)

print("✓ Synthetic dataset created")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

CREATING SYNTHETIC IMBALANCED CLASSIFICATION DATASET
✓ Synthetic dataset created
Features shape: (20000, 5)
Target shape: (20000,)


In [11]:
# Create DataFrame with meaningful column names
feature_names = ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5']
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

print("✓ DataFrame created")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

✓ DataFrame created
Dataset shape: (20000, 6)
Columns: ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'target']


In [12]:
# Analyze class distribution
print("CLASS DISTRIBUTION ANALYSIS")
print("-" * 40)

class_counts = df['target'].value_counts().sort_index()
class_percentages = df['target'].value_counts(normalize=True).sort_index() * 100

print("Class counts:")
for class_label, count in class_counts.items():
    percentage = class_percentages[class_label]
    print(f"  Class {class_label}: {count:,} ({percentage:.1f}%)")

imbalance_ratio = class_counts[0] / class_counts[1]
print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1")
print(f"Minority class percentage: {class_percentages[1]:.1f}%")

CLASS DISTRIBUTION ANALYSIS
----------------------------------------
Class counts:
  Class 0: 16,921 (84.6%)
  Class 1: 3,079 (15.4%)

Class imbalance ratio: 5.50:1
Minority class percentage: 15.4%


In [13]:
print("\n" + "="*60)
print("SYNTHETIC DATASET CREATION COMPLETED!")
print("="*60)

print(f"📊 Dataset Details:")
print(f"   • Total samples: {len(df):,}")
print(f"   • Features: {len(feature_names)}")
print(f"   • Classes: 2 (Binary classification)")
print(f"   • Class 0: {class_counts[0]:,} samples ({class_percentages[0]:.1f}%)")
print(f"   • Class 1: {class_counts[1]:,} samples ({class_percentages[1]:.1f}%)")
print(f"   • Imbalance ratio: {imbalance_ratio:.2f}:1")

print(f"\n🎯 Dataset is ready for classification experiments!")
print(f"   • Perfect for testing class imbalance handling techniques")
print(f"   • Suitable for ML model comparison")
print(f"   • Can be used with sampling strategies (SMOTE, undersampling, etc.)")


SYNTHETIC DATASET CREATION COMPLETED!
📊 Dataset Details:
   • Total samples: 20,000
   • Features: 5
   • Classes: 2 (Binary classification)
   • Class 0: 16,921 samples (84.6%)
   • Class 1: 3,079 samples (15.4%)
   • Imbalance ratio: 5.50:1

🎯 Dataset is ready for classification experiments!
   • Perfect for testing class imbalance handling techniques
   • Suitable for ML model comparison
   • Can be used with sampling strategies (SMOTE, undersampling, etc.)


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from imblearn.combine import SMOTETomek
import xgboost as xgb
from collections import Counter
import warnings
warnings.filterwarnings('ignore')


In [15]:
# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature columns: {list(X.columns)}")

# Check original class distribution
print(f"\nOriginal class distribution: {Counter(y)}")

Features shape: (20000, 5)
Target shape: (20000,)
Feature columns: ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5']

Original class distribution: Counter({0: 16921, 1: 3079})


In [16]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"✓ Data split completed")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training set class distribution: {Counter(y_train)}")
print(f"Test set class distribution: {Counter(y_test)}")

✓ Data split completed
Training set shape: (16000, 5)
Test set shape: (4000, 5)
Training set class distribution: Counter({0: 13537, 1: 2463})
Test set class distribution: Counter({0: 3384, 1: 616})


In [17]:
# Apply SMOTE-Tomek for class balancing
smotek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smotek.fit_resample(X_train, y_train)

print(f"✓ SMOTE-Tomek applied successfully")
print(f"Original training set: {Counter(y_train)}")
print(f"Balanced training set: {Counter(y_train_balanced)}")
print(f"Balanced training set shape: {X_train_balanced.shape}")

✓ SMOTE-Tomek applied successfully
Original training set: Counter({0: 13537, 1: 2463})
Balanced training set: Counter({0: 13397, 1: 13397})
Balanced training set shape: (26794, 5)


In [18]:
# Define models for training
models_config = {
    'logistic_regression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'name': 'Logistic Regression'
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42, n_estimators=100),
        'name': 'Random Forest'
    },
    'xgboost': {
        'model': xgb.XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False),
        'name': 'XGBoost'
    }
}

print("✓ Model configurations defined")
for key, config in models_config.items():
    print(f"  - {config['name']}")

✓ Model configurations defined
  - Logistic Regression
  - Random Forest
  - XGBoost


In [19]:
print("="*60)
print("TRAINING MODELS WITHOUT CLASS IMBALANCE HANDLING")
print("="*60)

# Dictionary to store models and results
models_no_balance = {}
results_no_balance = {}

for model_key, config in models_config.items():
    print(f"\nTraining {config['name']}...")
    
    # Create and train model
    model = config['model']
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    
    # Store model and results
    models_no_balance[model_key] = model
    results_no_balance[model_key] = {
        'accuracy': accuracy,
        'auc_score': auc_score,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }
    
    print(f"✓ {config['name']} trained successfully")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  AUC Score: {auc_score:.4f}")

TRAINING MODELS WITHOUT CLASS IMBALANCE HANDLING

Training Logistic Regression...
✓ Logistic Regression trained successfully
  Accuracy: 0.8748
  AUC Score: 0.6716

Training Random Forest...
✓ Random Forest trained successfully
  Accuracy: 0.9403
  AUC Score: 0.9405

Training XGBoost...
✓ XGBoost trained successfully
  Accuracy: 0.9370
  AUC Score: 0.9437


In [20]:
print("="*60)
print("TRAINING MODELS WITH SMOTE-TOMEK CLASS BALANCING")
print("="*60)

# Dictionary to store balanced models and results
models_balanced = {}
results_balanced = {}

for model_key, config in models_config.items():
    print(f"\nTraining {config['name']} with SMOTE-Tomek...")
    
    # Create fresh model instance (to avoid any sklearn issues)
    if model_key == 'logistic_regression':
        model = LogisticRegression(random_state=42, max_iter=1000)
    elif model_key == 'random_forest':
        model = RandomForestClassifier(random_state=42, n_estimators=100)
    elif model_key == 'xgboost':
        model = xgb.XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)
    
    # Train on balanced data
    model.fit(X_train_balanced, y_train_balanced)
    
    # Make predictions on same test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    
    # Store model and results
    models_balanced[model_key] = model
    results_balanced[model_key] = {
        'accuracy': accuracy,
        'auc_score': auc_score,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }
    
    print(f"✓ {config['name']} trained with SMOTE-Tomek successfully")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  AUC Score: {auc_score:.4f}")

TRAINING MODELS WITH SMOTE-TOMEK CLASS BALANCING

Training Logistic Regression with SMOTE-Tomek...
✓ Logistic Regression trained with SMOTE-Tomek successfully
  Accuracy: 0.6795
  AUC Score: 0.6806

Training Random Forest with SMOTE-Tomek...
✓ Random Forest trained with SMOTE-Tomek successfully
  Accuracy: 0.9263
  AUC Score: 0.9404

Training XGBoost with SMOTE-Tomek...
✓ XGBoost trained with SMOTE-Tomek successfully
  Accuracy: 0.9183
  AUC Score: 0.9434


### Import MLflow Libraries

In [21]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.tracking
from mlflow.tracking import MlflowClient



In [25]:
# Set up MLflow experiment
experiment_name = "Synthetic_Imbalanced_Dataset"

try:
    # Set MLflow tracking URI (adjust if needed)
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    
    # Set or create experiment
    mlflow.set_experiment(experiment_name)
    
    print(f"✓ MLflow experiment '{experiment_name}' setup successful")
    print(f"✓ Tracking URI: {mlflow.get_tracking_uri()}")
    
except Exception as e:
    print(f"⚠ MLflow setup warning: {e}")
    print("Continuing with default MLflow setup...")
    mlflow.set_experiment(experiment_name)

2025/06/14 11:12:42 INFO mlflow.tracking.fluent: Experiment with name 'Synthetic_Imbalanced_Dataset' does not exist. Creating a new experiment.


✓ MLflow experiment 'Synthetic_Imbalanced_Dataset' setup successful
✓ Tracking URI: http://127.0.0.1:5000


In [26]:
print("="*60)
print("LOGGING MODELS WITHOUT CLASS BALANCING TO MLFLOW")
print("="*60)

for model_key, config in models_config.items():
    model_name = config['name']
    model = models_no_balance[model_key]
    results = results_no_balance[model_key]
    
    print(f"\nLogging {model_name} (No Balancing)...")
    
    # Create run name
    run_name = f"{model_name}_No_Balancing"
    
    try:
        with mlflow.start_run(run_name=run_name):
            
            # Log parameters
            if model_key == 'logistic_regression':
                mlflow.log_params({
                    'model_type': 'Logistic Regression',
                    'C': model.C,
                    'penalty': model.penalty,
                    'solver': model.solver,
                    'max_iter': model.max_iter,
                    'random_state': model.random_state,
                    'class_balancing': 'None'
                })
                
            elif model_key == 'random_forest':
                mlflow.log_params({
                    'model_type': 'Random Forest',
                    'n_estimators': model.n_estimators,
                    'max_depth': model.max_depth,
                    'min_samples_split': model.min_samples_split,
                    'min_samples_leaf': model.min_samples_leaf,
                    'random_state': model.random_state,
                    'class_balancing': 'None'
                })
                
            elif model_key == 'xgboost':
                mlflow.log_params({
                    'model_type': 'XGBoost',
                    'n_estimators': model.n_estimators,
                    'max_depth': model.max_depth,
                    'learning_rate': model.learning_rate,
                    'subsample': model.subsample,
                    'colsample_bytree': model.colsample_bytree,
                    'random_state': model.random_state,
                    'class_balancing': 'None'
                })
            
            # Log training data info
            mlflow.log_params({
                'train_size': len(X_train),
                'test_size': len(X_test),
                'n_features': X_train.shape[1],
                'class_0_train': sum(y_train == 0),
                'class_1_train': sum(y_train == 1),
                'original_imbalance_ratio': sum(y_train == 0) / sum(y_train == 1)
            })
            
            # Log metrics
            mlflow.log_metrics({
                'accuracy': results['accuracy'],
                'auc_score': results['auc_score']
            })
            
            # Calculate and log additional metrics
            y_pred = results['predictions']
            cm = confusion_matrix(y_test, y_pred)
            tn, fp, fn, tp = cm.ravel()
            
            # Calculate precision, recall, f1 for each class
            from sklearn.metrics import precision_recall_fscore_support
            precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
            
            mlflow.log_metrics({
                'precision_class_0': precision[0],
                'recall_class_0': recall[0],
                'f1_class_0': f1[0],
                'precision_class_1': precision[1],
                'recall_class_1': recall[1],
                'f1_class_1': f1[1],
                'true_negatives': int(tn),
                'false_positives': int(fp),
                'false_negatives': int(fn),
                'true_positives': int(tp),
                'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0,
                'sensitivity': tp / (tp + fn) if (tp + fn) > 0 else 0
            })
            
            # Log model
            if model_key == 'xgboost':
                mlflow.xgboost.log_model(model, "model", input_example=X_test.iloc[:5])
            else:
                mlflow.sklearn.log_model(model, "model", input_example=X_test.iloc[:5])
            
            print(f"✓ {model_name} (No Balancing) logged successfully")
            
    except Exception as e:
        print(f"✗ Error logging {model_name} (No Balancing): {e}")



LOGGING MODELS WITHOUT CLASS BALANCING TO MLFLOW

Logging Logistic Regression (No Balancing)...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

✓ Logistic Regression (No Balancing) logged successfully
🏃 View run Logistic Regression_No_Balancing at: http://127.0.0.1:5000/#/experiments/154537210038587388/runs/4c83d3772bf64a7a8a94f52fe0fc60d2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/154537210038587388

Logging Random Forest (No Balancing)...




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



✓ Random Forest (No Balancing) logged successfully
🏃 View run Random Forest_No_Balancing at: http://127.0.0.1:5000/#/experiments/154537210038587388/runs/74818e8528464611af64ec19c3249148
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/154537210038587388

Logging XGBoost (No Balancing)...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

✓ XGBoost (No Balancing) logged successfully
🏃 View run XGBoost_No_Balancing at: http://127.0.0.1:5000/#/experiments/154537210038587388/runs/81d58f88f19d47bfb85177477442747a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/154537210038587388


In [27]:
print("="*60)
print("LOGGING MODELS WITH SMOTE-TOMEK BALANCING TO MLFLOW")
print("="*60)

for model_key, config in models_config.items():
    model_name = config['name']
    model = models_balanced[model_key]
    results = results_balanced[model_key]
    
    print(f"\nLogging {model_name} (With SMOTE-Tomek)...")
    
    # Create run name
    run_name = f"{model_name}_SMOTE_Tomek"
    
    try:
        with mlflow.start_run(run_name=run_name):
            
            # Log parameters
            if model_key == 'logistic_regression':
                mlflow.log_params({
                    'model_type': 'Logistic Regression',
                    'C': model.C,
                    'penalty': model.penalty,
                    'solver': model.solver,
                    'max_iter': model.max_iter,
                    'random_state': model.random_state,
                    'class_balancing': 'SMOTE-Tomek'
                })
                
            elif model_key == 'random_forest':
                mlflow.log_params({
                    'model_type': 'Random Forest',
                    'n_estimators': model.n_estimators,
                    'max_depth': model.max_depth,
                    'min_samples_split': model.min_samples_split,
                    'min_samples_leaf': model.min_samples_leaf,
                    'random_state': model.random_state,
                    'class_balancing': 'SMOTE-Tomek'
                })
                
            elif model_key == 'xgboost':
                mlflow.log_params({
                    'model_type': 'XGBoost',
                    'n_estimators': model.n_estimators,
                    'max_depth': model.max_depth,
                    'learning_rate': model.learning_rate,
                    'subsample': model.subsample,
                    'colsample_bytree': model.colsample_bytree,
                    'random_state': model.random_state,
                    'class_balancing': 'SMOTE-Tomek'
                })
            
            # Log training data info (balanced)
            mlflow.log_params({
                'train_size_original': len(X_train),
                'train_size_balanced': len(X_train_balanced),
                'test_size': len(X_test),
                'n_features': X_train.shape[1],
                'class_0_train_original': sum(y_train == 0),
                'class_1_train_original': sum(y_train == 1),
                'class_0_train_balanced': sum(y_train_balanced == 0),
                'class_1_train_balanced': sum(y_train_balanced == 1),
                'original_imbalance_ratio': sum(y_train == 0) / sum(y_train == 1),
                'balanced_imbalance_ratio': sum(y_train_balanced == 0) / sum(y_train_balanced == 1)
            })
            
            # Log metrics
            mlflow.log_metrics({
                'accuracy': results['accuracy'],
                'auc_score': results['auc_score']
            })
            
            # Calculate and log additional metrics
            y_pred = results['predictions']
            cm = confusion_matrix(y_test, y_pred)
            tn, fp, fn, tp = cm.ravel()
            
            # Calculate precision, recall, f1 for each class
            from sklearn.metrics import precision_recall_fscore_support
            precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
            
            mlflow.log_metrics({
                'precision_class_0': precision[0],
                'recall_class_0': recall[0],
                'f1_class_0': f1[0],
                'precision_class_1': precision[1],
                'recall_class_1': recall[1],
                'f1_class_1': f1[1],
                'true_negatives': int(tn),
                'false_positives': int(fp),
                'false_negatives': int(fn),
                'true_positives': int(tp),
                'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0,
                'sensitivity': tp / (tp + fn) if (tp + fn) > 0 else 0
            })
            
            # Log model
            if model_key == 'xgboost':
                mlflow.xgboost.log_model(model, "model", input_example=X_test.iloc[:5])
            else:
                mlflow.sklearn.log_model(model, "model", input_example=X_test.iloc[:5])
            
            print(f"✓ {model_name} (With SMOTE-Tomek) logged successfully")
            
    except Exception as e:
        print(f"✗ Error logging {model_name} (With SMOTE-Tomek): {e}")

LOGGING MODELS WITH SMOTE-TOMEK BALANCING TO MLFLOW

Logging Logistic Regression (With SMOTE-Tomek)...




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



✓ Logistic Regression (With SMOTE-Tomek) logged successfully
🏃 View run Logistic Regression_SMOTE_Tomek at: http://127.0.0.1:5000/#/experiments/154537210038587388/runs/7ac48869ec4a45de8c47bc4ee972cbbc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/154537210038587388

Logging Random Forest (With SMOTE-Tomek)...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



✓ Random Forest (With SMOTE-Tomek) logged successfully
🏃 View run Random Forest_SMOTE_Tomek at: http://127.0.0.1:5000/#/experiments/154537210038587388/runs/4478c5bdd08a449ea5e6ff2a0041d00a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/154537210038587388

Logging XGBoost (With SMOTE-Tomek)...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

✓ XGBoost (With SMOTE-Tomek) logged successfully
🏃 View run XGBoost_SMOTE_Tomek at: http://127.0.0.1:5000/#/experiments/154537210038587388/runs/f9afcb24b377482e870deffbee8c33a5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/154537210038587388


### Register the model

In [28]:
model_name = 'XGBoost_SMOTE'
run_id = input("Enter run ID to load model: ")
model_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=model_uri,
    name=model_name
)

Enter run ID to load model:  f9afcb24b377482e870deffbee8c33a5


Successfully registered model 'XGBoost_SMOTE'.
2025/06/14 11:13:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost_SMOTE, version 1
Created version '1' of model 'XGBoost_SMOTE'.


In [31]:
model_uri = f"models:/{model_name}@challenger"

load_model = mlflow.xgboost.load_model(model_uri)
y_pred = load_model.predict(X_test)
y_pred[:4]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

array([1, 1, 0, 0])

In [32]:

dev_model_uri = f"models:/{model_name}@challenger"
prod_model = "CreditRiskModel"

client = mlflow.MlflowClient()
client.copy_model_version(src_model_uri=dev_model_uri,
                          dst_name=prod_model)

Successfully registered model 'CreditRiskModel'.
Copied version '1' of model 'XGBoost_SMOTE' to version '1' of model 'CreditRiskModel'.


<ModelVersion: aliases=[], creation_timestamp=1749874485896, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1749874485896, metrics=None, model_id=None, name='CreditRiskModel', params=None, run_id='f9afcb24b377482e870deffbee8c33a5', run_link='', source='models:/XGBoost_SMOTE/1', status='READY', status_message=None, tags={}, user_id='', version='1'>

In [34]:
model_uri = f"models:/{prod_model}@champion"

load_model = mlflow.xgboost.load_model(model_uri)
y_pred = load_model.predict(X_test)
y_pred[:4]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

array([1, 1, 0, 0])