In [2]:
import pandas as pd
import numpy as np
from sys import getsizeof
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd, numpy as np, matplotlib.pyplot as plt, os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
import xgboost as xgb
from xgboost import XGBClassifier
from scipy.stats import ttest_ind
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from itertools import cycle
from sklearn.metrics import roc_curve, auc
import mlflow

In [4]:
cohort_1 = pd.read_csv('cohort_WOE_1.csv')
cohort_2 = pd.read_csv('cohort_WOE_2.csv')
cohort_3 = pd.read_csv('cohort_WOE_3.csv')

features = cohort_1.columns[5:]

# Convert only numeric columns to float32
cohort_1[features] = cohort_1[features].astype('float32')
cohort_2[features] = cohort_2[features].astype('float32')
cohort_3[features] = cohort_3[features].astype('float32')

In [5]:
years = [cohort_1, cohort_2, cohort_3]

### Switch target values from 1 to 2 and from 2 to 1

In [6]:
for y in years:
    # Create a mapping dictionary for the swap
    mapping = {0: 0, 1: 2, 2: 1}

    # Apply the mapping to the target variable
    y['target'] = y['target'].map(mapping)

    # Verify the change
    print(y['target'].value_counts())

target
0    179903
2     40989
1     12203
Name: count, dtype: int64
target
0    190170
2     36035
1     12370
Name: count, dtype: int64
target
0    194181
2     34378
1     12336
Name: count, dtype: int64


In [7]:
data = cohort_1.copy()
data2 = cohort_2.copy()
data3 = cohort_3.copy()

### Creating Final 3-class TARGET variable

#### 0 - Didn't open account
#### 1 - Opened and Defaulted
#### 2 - Opened and Didn't defaulted

In [8]:
print(data['target'].value_counts().sort_index())

target
0    179903
1     12203
2     40989
Name: count, dtype: int64


### Multinomial Logistic regression with balanced class 

______________________________________________________________________________________________________________________


### Creating Final 3-class TARGET variable

#### 0 - Didn't open account
#### 1 - Opened and Defaulted
#### 2 - Opened and Didn't defaulted

In [9]:
print("Distribution of target values:")
print(data2['target'].value_counts().sort_index())

Distribution of target values:
target
0    190170
1     12370
2     36035
Name: count, dtype: int64


In [10]:
print("Distribution of target values:")
print(data3['target'].value_counts().sort_index())

Distribution of target values:
target
0    194181
1     12336
2     34378
Name: count, dtype: int64


MLFLOW

In [11]:
# Set the tracking URI
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Check if the experiment exists, and create it if it doesn't
experiment_name = "Multinomial Models Final"
if not mlflow.get_experiment_by_name(experiment_name):
    mlflow.create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/339341380902033810', creation_time=1745256752573, experiment_id='339341380902033810', last_update_time=1745256752573, lifecycle_stage='active', name='Multinomial Models Final', tags={}>

## Logistic Regression with balanced classes AND OOT TEST SET (Cohort 2)

In [25]:
def build_evaluate_multinomial_model(data, data2, model_features, target_col='target', test_size=0.2, random_state=42):
    """
    Build and evaluate a multinomial classification model with class balancing, integrated with MLflow.

    Parameters:
    - data (pandas.DataFrame): Input DataFrame
    - data2 (pandas.DataFrame): Test DataFrame
    - model_features (list): List of selected feature columns
    - target_col (str): Name of the target column
    - test_size (float): Proportion of data for testing
    - random_state (int): Random seed for reproducibility

    Returns:
    - tuple: (model, X_test, y_test, y_pred_proba, class_labels)
    """
    # Drop unwanted columns from feature set
    X = data[model_features]
    X_test = data2[model_features]
    # Select only numeric features
    X = X.select_dtypes(include=['int32', 'float32'])
    X_test = X_test.select_dtypes(include=['int32', 'float32'])

    # Encode target labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(data[target_col])
    y_encoded2 = le.transform(data2[target_col])
    class_labels = [str(label) for label in le.classes_]

    # Log parameters
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("multi_class", "multinomial")
    mlflow.log_param("class_weight", "balanced")
    mlflow.log_param("max_iter", 100)
    mlflow.log_param("random_state", random_state)

    # Train multinomial logistic regression with class weighting
    model = LogisticRegression(
            multi_class='multinomial',
            class_weight='balanced',  # Handle class imbalance
            max_iter=100,  # Increased from 100 to improve convergence
            random_state=random_state
        )
    model.fit(X, y_encoded)

    mlflow.set_tag("model", "Logistic Regression with balanced classes AND OOT TEST SET (Cohort 2)")


    # Get predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Log metrics

        # Log metrics for each class individually
    classification_report_dict = classification_report(y_encoded2, y_pred, target_names=class_labels, output_dict=True)

    for class_label in class_labels:
        precision = classification_report_dict[class_label]["precision"]
        recall = classification_report_dict[class_label]["recall"]
        f1_score = classification_report_dict[class_label]["f1-score"]

    # Log metrics for the current class
        mlflow.log_metric(f"{class_label}_precision", precision)
        mlflow.log_metric(f"{class_label}_recall", recall)
        mlflow.log_metric(f"{class_label}_f1_score", f1_score)

    # Optionally, log the overall weighted average metrics
    weighted_avg = classification_report_dict["weighted avg"]
    mlflow.log_metric("weighted_precision", weighted_avg["precision"])
    mlflow.log_metric("weighted_recall", weighted_avg["recall"])
    mlflow.log_metric("weighted_f1_score", weighted_avg["f1-score"])

 
    # Plot and log confusion matrix
    cm_path = plot_confusion_matrix(y_encoded2, y_pred, class_labels)
    mlflow.log_artifact(cm_path)
    os.remove(cm_path)

    # Plot and log ROC curves
    roc_path = plot_roc_curves(y_encoded2, y_pred_proba, class_labels)
    mlflow.log_artifact(roc_path)
    os.remove(roc_path)

    # Log the model
    mlflow.sklearn.log_model(model, "logistic_regression_model")

    return model, X_test, y_encoded2, y_pred_proba, class_labels

def plot_confusion_matrix(y_test, y_pred, class_labels):
    """
    Plot confusion matrix with improved visualization and save it as an artifact.
    """
    plt.figure(figsize=(10, 8))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels,
                annot_kws={"size": 12})  # Improved font size
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(fontsize=10)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    cm_plot_path = "confusion_matrix.png"
    plt.savefig(cm_plot_path)
    plt.close()
    return cm_plot_path

def plot_roc_curves(y_test, y_pred_proba, class_labels):
    """
    Plot ROC curves for each class and save it as an artifact.
    """
    n_classes = len(class_labels)
    y_test_bin = np.eye(n_classes)[y_test]
    fpr, tpr, roc_auc = {}, {}, {}

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    plt.figure(figsize=(10, 8))
    colors = cycle(['blue', 'red', 'green'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'Class {class_labels[i]} (AUC = {roc_auc[i]:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curves')
    plt.legend(loc="lower right")
    plt.tight_layout()
    roc_plot_path = "roc_curves.png"
    plt.savefig(roc_plot_path)
    plt.close()
    return roc_plot_path

def log_top_customers_by_class(y_pred_proba, data, class_labels, top_n=10):
        
        top_customers_by_class = {}
        for class_idx, class_label in enumerate(class_labels):
            # Get the probabilities for the current class
            class_probabilities = y_pred_proba[:, class_idx]

            # Get the indices of the top N probabilities
            top_indices = np.argsort(class_probabilities)[-top_n:][::-1]

            # Extract the corresponding rows from the dataset
            top_customers = data.iloc[top_indices].copy()
            top_customers['Probability'] = class_probabilities[top_indices]

            # Keep only the UNIQUE_CONSUMER_KEY column and Probability
            if 'UNIQUE_CONSUMER_KEY' in top_customers.columns:
                top_customers = top_customers[['UNIQUE_CONSUMER_KEY', 'Probability']]
            else:
                raise KeyError("The column 'UNIQUE_CONSUMER_KEY' is not found in the dataset.")

            # Save the table as a CSV file
            csv_filename = f"top_{top_n}_customers_class_{class_label}.csv"
            top_customers.to_csv(csv_filename, index=False)

            top_customers_by_class[class_label] = top_customers


            # Log the CSV file to MLflow
            mlflow.log_artifact(csv_filename)

            # Clean up the file after logging
            os.remove(csv_filename)

            print(f"Logged top {top_n} customers for class {class_label} to MLflow.")

        return top_customers_by_class

def log_X_test_to_mlflow(X_test, artifact_name="X_test.csv"):
    """
    Save X_test as a CSV file and log it as an artifact in MLflow.

    Parameters:
    - X_test: The test dataset to save.
    - artifact_name: The name of the file to save in MLflow.
    """
    # Save X_test to a CSV file
    X_test_path = artifact_name
    X_test.to_csv(X_test_path, index=False)

    # Log the file as an artifact in MLflow
    mlflow.log_artifact(X_test_path)

    # Remove the local file after logging
    os.remove(X_test_path)

## Logistic Regression with balanced classes AND OOT TEST SET (Cohort 3)

In [26]:
#Training on cohort 1 and testing on cohort 3

# Run the analysis



with mlflow.start_run():
    model, X_test, y_test, y_pred_proba, class_labels = build_evaluate_multinomial_model(data, data3, features)

    # Get predictions for confusion matrix
    y_pred = model.predict(X_test)

    # Plot confusion matrix
    plot_confusion_matrix(y_test, y_pred, class_labels)

    # Plot ROC curves
    plot_roc_curves(y_test, y_pred_proba, class_labels)

    log_top_customers_by_class(y_pred_proba, data3, class_labels, top_n=10)

    log_X_test_to_mlflow(X_test)




STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logged top 10 customers for class 0 to MLflow.
Logged top 10 customers for class 1 to MLflow.
Logged top 10 customers for class 2 to MLflow.
🏃 View run defiant-asp-987 at: http://127.0.0.1:5000/#/experiments/339341380902033810/runs/caaa5590dece40a68d762c8f5ca3039e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/339341380902033810


In [37]:
y_pred_proba.shape

(240895, 3)

In [38]:
y_pred_proba #use argmax like if we have [[0.005, 0.05,, 0.93]] so 0.93 is class 2. 

#2WVBcswEYEUua70lsoSlD	

array([[0.00517091, 0.0568034 , 0.93802569],
       [0.00517091, 0.0568034 , 0.93802569],
       [0.00517091, 0.0568034 , 0.93802569],
       ...,
       [0.25531846, 0.01661859, 0.72806295],
       [0.68692887, 0.00742577, 0.30564536],
       [0.62436708, 0.00452956, 0.37110335]], shape=(240895, 3))

In [39]:
# Example: y_pred_proba is a 2D array where each row corresponds to a sample
# and each column corresponds to the probability of a class.
# y_pred_proba = [[0.1, 0.3, 0.6], [0.7, 0.2, 0.1], ...]

# Find the class with the maximum probability for each sample
max_probabilities = np.max(y_pred_proba, axis=1)  # Maximum probability in each row
predicted_classes = np.argmax(y_pred_proba, axis=1)  # Index of the maximum probability (class)

# Combine the results into a list of tuples (class, probability)
results = list(zip(predicted_classes, max_probabilities))

# Print the results
for i, (predicted_class, probability) in enumerate(results):
    print(f"Sample {i}: Predicted Class = {predicted_class}, Probability = {probability:.4f}")

Sample 0: Predicted Class = 2, Probability = 0.9380
Sample 1: Predicted Class = 2, Probability = 0.9380
Sample 2: Predicted Class = 2, Probability = 0.9380
Sample 3: Predicted Class = 2, Probability = 0.9380
Sample 4: Predicted Class = 2, Probability = 0.9380
Sample 5: Predicted Class = 2, Probability = 0.9380
Sample 6: Predicted Class = 2, Probability = 0.7676
Sample 7: Predicted Class = 2, Probability = 0.8608
Sample 8: Predicted Class = 0, Probability = 0.9038
Sample 9: Predicted Class = 0, Probability = 0.8356
Sample 10: Predicted Class = 0, Probability = 0.8415
Sample 11: Predicted Class = 0, Probability = 0.8321
Sample 12: Predicted Class = 0, Probability = 0.8321
Sample 13: Predicted Class = 0, Probability = 0.6701
Sample 14: Predicted Class = 0, Probability = 0.8933
Sample 15: Predicted Class = 0, Probability = 0.7068
Sample 16: Predicted Class = 0, Probability = 0.8455
Sample 17: Predicted Class = 0, Probability = 0.9136
Sample 18: Predicted Class = 0, Probability = 0.8366
Sam

KeyboardInterrupt: 

In [27]:
# Set the experiment name
experiment_name = "Multinomial Models Final"

# Get the experiment ID
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id

# Search for all runs in the experiment
runs = mlflow.search_runs(experiment_ids=[experiment_id])

# Display the run IDs
print(runs[['run_id', 'status', 'start_time']])

                              run_id    status  \
0   caaa5590dece40a68d762c8f5ca3039e  FINISHED   
1   4d1299101456477086742540c3d9eee3  FINISHED   
2   375085968cd846bab1787ff22ba2c035  FINISHED   
3   ad8bb03f1a194576ba778569422e6e0f  FINISHED   
4   e101fb9dde57496cafdf5cc157240e47    FAILED   
5   91367686997e4ecbb659ef4a08fe30c7    FAILED   
6   700d37aa7c2542c19f0fd0405dca15a8    FAILED   
7   830f2dd31a5a495ab86beed14c4cab2a  FINISHED   
8   d47caf8a5a5f49cf9495a5917ea26b29  FINISHED   
9   eb51967bc9854cfa9267388adc369a54  FINISHED   
10  fad4134a0f1348708ce275598ddb9772  FINISHED   
11  ab4c306f409b412fb34343441dd273bc    FAILED   
12  bcb12234fd3d4a39ba6cab53d3fef998  FINISHED   
13  324492bdabd14e7a81222fb34fc88f73  FINISHED   
14  7c85d1f1fb804631aca5086b5e571e66    FAILED   

                         start_time  
0  2025-04-23 02:10:37.740000+00:00  
1  2025-04-23 02:09:02.591000+00:00  
2  2025-04-23 02:07:47.293000+00:00  
3  2025-04-23 02:06:03.333000+00:00  
4  2025-0

In [80]:
# Specify the model URI (adjust the path if needed)
model_uri = "runs:/d47caf8a5a5f49cf9495a5917ea26b29/logistic_regression_model"  # Replace <run_id> with the actual run ID

# Load the model
loaded_model = mlflow.sklearn.load_model(model_uri)

# Use the loaded model for predictions
# Assuming `X_test` is already prepared
y_pred_proba = loaded_model.predict_proba(X_test)

# Find the class with the maximum probability for each sample
max_probabilities = np.max(y_pred_proba, axis=1)  # Maximum probability in each row
predicted_classes = np.argmax(y_pred_proba, axis=1)  # Index of the maximum probability (class)

# Combine the results into a list of tuples (class, probability)
results = list(zip(predicted_classes, max_probabilities))

# Print the results
for i, (predicted_class, probability) in enumerate(results):
    print(f"Sample {i}: Predicted Class = {predicted_class}, Probability = {probability:.4f}")

Sample 0: Predicted Class = 2, Probability = 0.9380
Sample 1: Predicted Class = 2, Probability = 0.9380
Sample 2: Predicted Class = 2, Probability = 0.9380
Sample 3: Predicted Class = 2, Probability = 0.9380
Sample 4: Predicted Class = 2, Probability = 0.9380
Sample 5: Predicted Class = 2, Probability = 0.9380
Sample 6: Predicted Class = 2, Probability = 0.7676
Sample 7: Predicted Class = 2, Probability = 0.8608
Sample 8: Predicted Class = 0, Probability = 0.9038
Sample 9: Predicted Class = 0, Probability = 0.8356
Sample 10: Predicted Class = 0, Probability = 0.8415
Sample 11: Predicted Class = 0, Probability = 0.8321
Sample 12: Predicted Class = 0, Probability = 0.8321
Sample 13: Predicted Class = 0, Probability = 0.6701
Sample 14: Predicted Class = 0, Probability = 0.8933
Sample 15: Predicted Class = 0, Probability = 0.7068
Sample 16: Predicted Class = 0, Probability = 0.8455
Sample 17: Predicted Class = 0, Probability = 0.9136
Sample 18: Predicted Class = 0, Probability = 0.8366
Sam

KeyboardInterrupt: 

In [3]:
# Set the experiment name
experiment_name = "Multinomial Models Final - Group"

# Get the experiment ID
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id

# Search for all runs in the experiment
runs = mlflow.search_runs(experiment_ids=[experiment_id])

# Display the run IDs
print(runs[['run_id', 'status', 'start_time']])

                              run_id    status  \
0   9cfe49c083d7493ab7c0fa68027d5264  FINISHED   
1   890f3f731169421f94d95b90aa995053    FAILED   
2   bb0842d15d694638b3f02e26ada7f6df    FAILED   
3   15e2cbf5a5a243da9a20829b937ca1ae  FINISHED   
4   3b00c846ed6a4bc681a09e00cc5d5e9b    FAILED   
5   eed19412f5bd40629078624d8c09e619    FAILED   
6   1df5c93b97e345fab3a9bb6047ef8230  FINISHED   
7   31c01101b7d9409a85ccbc93e77efc6d  FINISHED   
8   ad4879d6a4cf42b6ac52d7c622aee488    FAILED   
9   c8623bf3518a4583bf475dc0a2a4faa4  FINISHED   
10  631b7b1abe0c4b6597f3c702312ba5fb  FINISHED   

                         start_time  
0  2025-04-21 23:35:36.588000+00:00  
1  2025-04-21 23:34:02.252000+00:00  
2  2025-04-21 23:32:00.331000+00:00  
3  2025-04-21 23:18:50.418000+00:00  
4  2025-04-21 23:18:29.410000+00:00  
5  2025-04-21 23:16:52.586000+00:00  
6  2025-04-21 23:09:01.494000+00:00  
7  2025-04-21 23:02:17.139000+00:00  
8  2025-04-21 23:00:31.794000+00:00  
9  2025-04-21 22:57

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
