# Imports

In [1]:
# General
import os
import gc
import json
from datetime import datetime, timedelta

# Data manipulation and processing
import pandas as pd
import numpy as np
from pandas.tseries.offsets import MonthEnd
from numpy import nanmean

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable
from colorama import Fore, Style, init

# Modeling
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import torch
import shap
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import (
    f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
)




In [2]:
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch random number generators.

    Parameters:
        seed (int): Seed value applied to every generator below.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Setting PYTHONHASHSEED here only reaches child processes; the hash seed
    # of the already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-select convolution algorithms, which can
    # differ between runs and defeats the deterministic flag set above.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

In [3]:
# Change the working directory so the relative CSV paths read in the next cell
# resolve against it.
# NOTE(review): this is a hard-coded, machine-specific absolute path; the
# notebook will fail at this line on any other machine.
os.chdir('c:\\Users\\samema\\DB\\LV_Data2\\')

In [4]:
# Load the five input tables from the current working directory.
# df_test is used later as the 'test' split of every entry in `datasets`.
# NOTE(review): later cells exclude/drop an 'Unnamed: 0' column, which is
# presumably an index column written by an earlier to_csv call -- confirm.
df_full = pd.read_csv('df_full_2024_09_27.csv', )
df_test = pd.read_csv('df_full_test_2024_09_27.csv', )
df_30 = pd.read_csv('df_30_day_2024_09_27.csv', )
df_60 = pd.read_csv('df_60_day_2024_09_27.csv', )
df_90 = pd.read_csv('df_90_day_2024_09_27.csv', )

In [5]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
import warnings
from sklearn.exceptions import DataConversionWarning

class FeatureScaler:
    """Scale the feature columns of a training frame, and optionally a test frame, in place.

    The scaler is fitted on the training frame only; the test frame is
    transformed with the parameters learned from the training frame.
    """

    def __init__(self, train, target, test=None, exclude_columns=None):
        """
        Initializes the FeatureScaler with training data, target column, and optionally test data.
        
        Parameters:
            train (pd.DataFrame): Training data. Modified in place by scale_features.
            target (str): The name of the target column (never scaled).
            test (pd.DataFrame, optional): Test data. Modified in place by scale_features. Default is None.
            exclude_columns (list of str, optional): Columns to exclude from scaling. Default is None.
        """
        self.train = train
        self.test = test
        self.target = target
        self.exclude_columns = exclude_columns if exclude_columns is not None else []
        self.scaler = RobustScaler()

    def scale_features(self):
        """
        Scales the features using the scaler held in self.scaler, excluding the
        target variable and any specified columns. If a test DataFrame is
        provided, it is scaled with the parameters fitted on the training data.

        Errors are reported with print() rather than raised. Both frames are
        written only after every transform has succeeded, so a failure leaves
        the training and test frames unmodified instead of half-scaled.
        """
        features = [col for col in self.train.columns if col != self.target and col not in self.exclude_columns]
        try:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always", DataConversionWarning)

                # Compute every scaled array first; nothing is written back
                # until all transforms have completed without raising.
                scaled_train = self.scaler.fit_transform(self.train[features])
                scaled_test = None
                if self.test is not None:
                    scaled_test = self.scaler.transform(self.test[features])

                self.train[features] = scaled_train
                if scaled_test is not None:
                    self.test[features] = scaled_test

                # Check for DataConversionWarnings
                if any(issubclass(warning.category, DataConversionWarning) for warning in w):
                    print("Data conversion issue encountered during scaling.")

                print("Features scaled successfully.")
        except Exception as e:
            print("An error occurred during scaling:", e)

# Example usage
# df_full, df_test, df_30, df_60, and df_90 are the DataFrames loaded above; 'category' is the target column name

# Scale features for full data with test set, excluding the 'date' column
# NOTE(review): unlike the three calls below, 'Unnamed: 0' is not excluded
# here, so if df_full/df_test contain that column it is scaled as a feature
# (drop_columns removes it later) -- confirm this difference is intended.
feature_scaler_with_test = FeatureScaler(df_full, 'category', df_test, exclude_columns=['date'])
feature_scaler_with_test.scale_features()

# Scale features for 30-day data, excluding the 'date' and 'Unnamed: 0' columns
# NOTE(review): each of the 30/60/90-day frames gets its own freshly fitted
# scaler, while df_test is only ever transformed with the scaler fitted on
# df_full; df_test is later paired with all four training frames -- confirm.
feature_scaler30 = FeatureScaler(df_30, 'category', exclude_columns=['date','Unnamed: 0'])
feature_scaler30.scale_features()

# Scale features for 60-day data, excluding the 'date' and 'Unnamed: 0' columns
feature_scaler60 = FeatureScaler(df_60, 'category', exclude_columns=['date','Unnamed: 0'])
feature_scaler60.scale_features()

# Scale features for 90-day data, excluding the 'date' and 'Unnamed: 0' columns
feature_scaler90 = FeatureScaler(df_90, 'category', exclude_columns=['date','Unnamed: 0'])
feature_scaler90.scale_features()


Features scaled successfully.
Features scaled successfully.
Features scaled successfully.
Features scaled successfully.


In [6]:
# Fold the 'minimal_risk' label into 'low_risk'
def combine_risk_categories(df):
    """Relabel every 'minimal_risk' entry of the 'category' column as 'low_risk'.

    The frame is modified in place and the same object is returned.
    """
    relabelled = df['category'].replace(to_replace='minimal_risk', value='low_risk')
    df['category'] = relabelled
    return df

# Apply the function to each training DataFrame
# NOTE(review): df_test is not passed through combine_risk_categories. If it
# contained 'minimal_risk', that label would not be in the category list given
# to the OrdinalEncoder in the next cell -- confirm df_test never has it.
df_full = combine_risk_categories(df_full)
df_30 = combine_risk_categories(df_30)
df_60 = combine_risk_categories(df_60)
df_90 = combine_risk_categories(df_90)


In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Step 1: Ordinal Encoding
# The position of each label in this list is the integer code it is encoded
# to: 'low_risk' -> 0 ... 'high_risk' -> 4.
risk_categories = ['low_risk', 'low_medium_risk', 'medium_risk', 'medium_high_risk', 'high_risk']
encoder = OrdinalEncoder(categories=[risk_categories])

def encode_category(df):
    """Add an 'encoded_category' column with the ordinal code of 'category'.

    Uses the module-level `encoder`, calling fit_transform on every invocation.
    The frame is modified in place and the same object is returned.
    """
    category_frame = df[['category']]
    codes = encoder.fit_transform(category_frame)
    df['encoded_category'] = codes
    return df

# Add the 'encoded_category' column to every frame. encode_category modifies
# the frame in place and returns the same object.
df_full = encode_category(df_full)
df_test = encode_category(df_test)
df_30 = encode_category(df_30)
df_60 = encode_category(df_60)
df_90 = encode_category(df_90)

# Organize datasets into a dictionary
# Every entry shares the same df_test object as its 'test' split.
datasets = {
    'full_data': {'train': df_full, 'test': df_test},
    '30_day_data': {'train': df_30, 'test': df_test},
    '60_day_data': {'train': df_60, 'test': df_test},
    '90_day_data': {'train': df_90, 'test': df_test}
}


In [8]:
import pandas as pd

# Define the manual category mapping
# Keys are the ordinal codes stored in 'encoded_category'. Code 0.0 maps to
# class 0 and every other code (1.0-4.0) maps to class 1, i.e. the five
# ordinal levels are collapsed into a binary label.
category_mapping = {
    0.0: 0,
    4.0: 1,
    2.0: 1,
    1.0: 1,
    3.0: 1
}

# Derive the 'merged_category' column from 'encoded_category'
def apply_manual_category_mapping(df, category_mapping):
    """Write 'merged_category' by looking up each 'encoded_category' value.

    Values missing from `category_mapping` become NaN. The frame is modified
    in place and the same object is returned.
    """
    source_codes = df['encoded_category']
    df['merged_category'] = source_codes.map(category_mapping)
    return df

# Apply category mappings to all datasets
# apply_manual_category_mapping adds 'merged_category' in place and returns
# the same frame, so this rebuilds `datasets` with the same objects as before;
# df_test is mapped four times, each call recomputing the same column.
datasets = {
    'full_data': {'train': apply_manual_category_mapping(df_full, category_mapping), 'test': apply_manual_category_mapping(df_test, category_mapping)},
    '30_day_data': {'train': apply_manual_category_mapping(df_30, category_mapping), 'test': apply_manual_category_mapping(df_test, category_mapping)},
    '60_day_data': {'train': apply_manual_category_mapping(df_60, category_mapping), 'test': apply_manual_category_mapping(df_test, category_mapping)},
    '90_day_data': {'train': apply_manual_category_mapping(df_90, category_mapping), 'test': apply_manual_category_mapping(df_test, category_mapping)}
}

# Verify the merged categories
# Prints the class counts of 'merged_category' for both splits of each entry.
for key, dataset in datasets.items():
    print(f"\n{key} - Train Merged Categories:")
    print(dataset['train']['merged_category'].value_counts())
    print(f"{key} - Test Merged Categories:")
    print(dataset['test']['merged_category'].value_counts())



full_data - Train Merged Categories:
merged_category
1    7210
0    6716
Name: count, dtype: int64
full_data - Test Merged Categories:
merged_category
1    1030
0     478
Name: count, dtype: int64

30_day_data - Train Merged Categories:
merged_category
1    7210
0    6716
Name: count, dtype: int64
30_day_data - Test Merged Categories:
merged_category
1    1030
0     478
Name: count, dtype: int64

60_day_data - Train Merged Categories:
merged_category
1    7210
0    6716
Name: count, dtype: int64
60_day_data - Test Merged Categories:
merged_category
1    1030
0     478
Name: count, dtype: int64

90_day_data - Train Merged Categories:
merged_category
1    7210
0    6716
Name: count, dtype: int64
90_day_data - Test Merged Categories:
merged_category
1    1030
0     478
Name: count, dtype: int64


In [9]:
# Remove the label/bookkeeping columns from every split
def drop_columns(datasets):
    """Replace each 'train' and 'test' frame with a copy lacking the unwanted columns.

    Columns that are not present are ignored. The `datasets` dict is updated
    in place (its frames are replaced, not mutated) and returned.
    """
    unwanted = ['category', 'date', 'Unnamed: 0', 'encoded_category']
    for bundle in datasets.values():
        for split in ('train', 'test'):
            bundle[split] = bundle[split].drop(columns=unwanted, errors='ignore')
    return datasets



datasets = drop_columns(datasets)

In [10]:
import os
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

# Set the environment variable SLACK_BOT_TOKEN to your actual Slack bot token
# (os.environ[...] raises KeyError when the variable is not set).

# Initialize the Slack client
client = WebClient(token=os.environ['SLACK_BOT_TOKEN'])

# Post a text message to a Slack channel
def send_slack_message(channel_id, message):
    """Send `message` to `channel_id` through the module-level Slack `client`.

    The outcome is reported with print(); a SlackApiError is caught and its
    error code printed instead of being raised.
    """
    payload = {'channel': channel_id, 'text': message}
    try:
        response = client.chat_postMessage(**payload)
        print(f"Message sent to channel {channel_id}: {response['message']['text']}")
    except SlackApiError as e:
        print(f"Error posting message: {e.response['error']}")

# Channel to post to and the text of the connectivity-test message
channel_id = '#hyperparametertuning'
message = 'Hello from your bot! :tada:'

# Send the message
send_slack_message(channel_id, message)

Message sent to channel #hyperparametertuning: Hello from your bot! :tada:


In [31]:
import os
import json
import joblib
import logging
import pandas as pd
import numpy as np
import optuna
from datetime import datetime
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import (
    mean_squared_error, r2_score, accuracy_score, precision_score, recall_score,
    fbeta_score, confusion_matrix
)
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
import shap
from typing import Dict, List

# Set up logging
logging.basicConfig(level=logging.INFO)

def send_slack_message(channel_id: str, message: str):
    """Post `message` to the Slack channel `channel_id`.

    A new WebClient is built on every call from the SLACK_BOT_TOKEN
    environment variable. Success is logged at INFO level; a SlackApiError is
    caught and logged at ERROR level instead of being raised.
    """
    bot_token = os.environ['SLACK_BOT_TOKEN']
    client = WebClient(token=bot_token)
    try:
        response = client.chat_postMessage(
            channel=channel_id,
            text=message,
        )
        logging.info(f"Message sent to channel {channel_id}: {response['message']['text']}")
    except SlackApiError as e:
        logging.error(f"Error posting message: {e.response['error']}")

class XGBoostPredictor:
    """Tune, fit, explain and evaluate XGBoost classifiers over several datasets.

    Training frames are appended to a growing accumulated set in a fixed
    order; hyper-parameters and a decision threshold are searched with Optuna
    on that accumulated set, a model is fitted per dataset, SHAP values are
    computed, and metrics are summarised to CSV files under `model_dir`.
    """

    def __init__(self, datasets: Dict[str, Dict[str, pd.DataFrame]], model_configs: dict, 
                 config_file: str = 'best_config.json', model_dir: str = 'models', slack_channel: str = None):
        """
        Parameters:
            datasets: Mapping of dataset label -> {'train': DataFrame, 'test': DataFrame}.
                The 'train' frames must contain a 'merged_category' column.
            model_configs: Stored on the instance; not read by any method of this class.
            config_file: Path the best parameters are written to as JSON.
                An existing file at this path is deleted on construction.
            model_dir: Directory for saved models, SHAP values and CSV summaries.
            slack_channel: Channel for progress messages; None disables them.
        """
        self.datasets = datasets
        self.model_configs = model_configs
        self.config_file = config_file
        self.model_dir = model_dir
        self.slack_channel = slack_channel
        self.model_results = {}
        self.ensemble_results = {}
        self.best_params = {}
        self.accumulated_X = pd.DataFrame()
        self.accumulated_y = pd.Series(dtype=float)
        self.shap_values_dict = {}

        # exist_ok makes this safe when the directory is already there.
        os.makedirs(model_dir, exist_ok=True)
        # Start clean: a parameter file left over from an earlier run is removed.
        if os.path.exists(config_file):
            os.remove(config_file)

    def compute_metrics(self, y_true: pd.Series, y_pred: pd.Series, threshold: float) -> Dict[str, float]:
        """Binarise both inputs at `threshold` and return classification metrics.

        Note that `y_true` is thresholded as well as `y_pred`; for 0/1 labels
        and a threshold in (0, 1] this leaves the labels unchanged.

        Returns:
            Dict with keys 'Accuracy', 'Precision', 'Recall' and 'F0.5 Score'.
        """
        y_true_binary = (y_true >= threshold).astype(int)
        y_pred_binary = (y_pred >= threshold).astype(int)
        return {
            'Accuracy': accuracy_score(y_true_binary, y_pred_binary),
            'Precision': precision_score(y_true_binary, y_pred_binary, average='binary', zero_division=0),
            'Recall': recall_score(y_true_binary, y_pred_binary, average='binary', zero_division=0),
            'F0.5 Score': fbeta_score(y_true_binary, y_pred_binary, beta=0.5, average='binary', zero_division=0)
        }

    def objective_f0_5(self, trial: "optuna.Trial") -> float:
        """Optuna objective: mean validation F0.5 on the accumulated data.

        Samples XGBoost hyper-parameters and a decision threshold, then for
        each of 5 outer splits runs a 3-fold inner cross-validation on the
        outer training part and averages the inner F0.5 scores.

        Returns:
            The mean over the outer splits of the inner-fold average F0.5.
        """
        logging.info(f"Starting trial {trial.number} for F0.5 optimization")

        xgb_params = {
            'learning_rate': trial.suggest_float('xgb_learning_rate', 1e-4, 1e-1, log=True),
            'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 500),
            'max_depth': trial.suggest_int('xgb_max_depth', 3, 12),
            'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('xgb_reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('xgb_reg_lambda', 1e-8, 1.0, log=True),
            'tree_method': 'hist',
            'device': 'cuda',
            'early_stopping_rounds': 50
        }

        threshold = trial.suggest_float('threshold', 0.1, 0.9)

        outer_kf = KFold(n_splits=5, shuffle=True, random_state=42)
        outer_fold_metrics = []

        # Only the training side of each outer split is used; the outer
        # held-out fold is never scored.
        for train_index, _ in outer_kf.split(self.accumulated_X):
            X_train = self.accumulated_X.iloc[train_index]
            y_train = self.accumulated_y.iloc[train_index]

            inner_kf = KFold(n_splits=3, shuffle=True, random_state=42)
            inner_fold_metrics = []

            for inner_train_index, val_index in inner_kf.split(X_train):
                X_inner_train, X_val = X_train.iloc[inner_train_index], X_train.iloc[val_index]
                y_inner_train, y_val = y_train.iloc[inner_train_index], y_train.iloc[val_index]

                model = XGBClassifier(**xgb_params)
                # The validation fold doubles as the early-stopping eval set.
                model.fit(X_inner_train, y_inner_train, eval_set=[(X_val, y_val)], verbose=False)

                y_val_pred = model.predict_proba(X_val)[:, 1]  # Predict probabilities for the positive class
                metrics = self.compute_metrics(y_val, y_val_pred, threshold)
                inner_fold_metrics.append(metrics)

            avg_fbeta = np.nanmean([fm['F0.5 Score'] for fm in inner_fold_metrics if fm['F0.5 Score'] is not None])
            outer_fold_metrics.append(avg_fbeta)

        trial_fbeta = np.mean(outer_fold_metrics)
        logging.info(f"Trial {trial.number} completed with average F0.5 Score: {trial_fbeta}")

        if trial.number % 100 == 0:  # Slack message every 100 trials to avoid spamming
            message = f"Trial {trial.number} completed with average F0.5 Score: {trial_fbeta}"
            if self.slack_channel:
                send_slack_message(self.slack_channel, message)

        return trial_fbeta

    def train_and_evaluate(self) -> pd.DataFrame:
        """Run tuning, fitting and SHAP computation for every dataset in order.

        Datasets are processed as 30-day, 60-day, 90-day, then full; each
        one's training rows are appended to the accumulated set before tuning.

        Returns:
            The summary DataFrame produced by summarize_metrics().
        """
        ordered_datasets = ['30_day_data', '60_day_data', '90_day_data', 'full_data']

        # Start from empty accumulators so that calling this method a second
        # time does not append the same rows again.
        self.accumulated_X = pd.DataFrame()
        self.accumulated_y = pd.Series(dtype=float)

        # Create study once, not repeatedly
        # NOTE(review): because the study is shared, best_trial after a later
        # dataset can be a trial scored on an earlier, smaller accumulated
        # set -- confirm this is intended.
        study_f0_5 = optuna.create_study(direction='maximize')

        for truncation_label in ordered_datasets:
            dataset = self.datasets[truncation_label]
            X, y = dataset['train'].drop(columns=['merged_category'], errors='ignore'), dataset['train']['merged_category']
            self.accumulated_X = pd.concat([self.accumulated_X, X], ignore_index=True)
            self.accumulated_y = pd.concat([self.accumulated_y, y], ignore_index=True)

            # Parameters are stored under the model key, so the cache check
            # must use that same key (the bare label is never present).
            model_key = f"{truncation_label}_f0_5"
            if model_key not in self.best_params:
                # Optimize on the same study
                study_f0_5.optimize(self.objective_f0_5, n_trials=1000, n_jobs=1)
                self.best_params[model_key] = study_f0_5.best_trial.params
                self.save_best_params()

            self.fit_and_save_model(truncation_label, X, y)
            self.calculate_shap_values_for_model(truncation_label)

        self.save_shap_values()
        return self.summarize_metrics()

    def fit_and_save_model(self, truncation_label: str, X: pd.DataFrame, y: pd.Series):
        """Build a classifier from the stored best parameters and fit it on X, y.

        X and y are the current dataset's training rows, not the accumulated set.
        """
        best_params_f0_5 = self.best_params[f"{truncation_label}_f0_5"]
        model_f0_5 = XGBClassifier(**self._build_model_params(best_params_f0_5))
        self.fit_model_and_save(model_f0_5, X, y, f"{truncation_label}_f0_5")

    def fit_model_and_save(self, model, X: pd.DataFrame, y: pd.Series, model_key: str):
        """Fit `model` over 3 folds, log each fold's metrics, then store and save it.

        The same model object is refitted on every fold, so the model that is
        kept and written to disk is the one fitted on the last fold.

        Raises:
            ValueError: If no best parameters are stored for `model_key`.
        """
        if model_key not in self.best_params:
            raise ValueError(f"Best parameters for model {model_key} not found.")

        # The decision threshold is fixed per model, so read it once.
        threshold = self.best_params[model_key]['threshold']

        kf = KFold(n_splits=3, shuffle=True, random_state=42)
        for fold_number, (train_index, test_index) in enumerate(kf.split(X), start=1):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Split the training data further into training and validation sets
            X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

            # Fit the model with validation set
            model.fit(
                X_train_split, 
                y_train_split, 
                eval_set=[(X_val_split, y_val_split)],
                verbose=False
            )

            # Predict on the test set
            y_pred = model.predict_proba(X_test)[:, 1]  # Predict probabilities for the positive class

            # Fold metrics were previously computed and dropped; log them so
            # the per-fold performance is visible.
            metrics = self.compute_metrics(y_test, y_pred, threshold)
            logging.info(f"{model_key} fold {fold_number} metrics: {metrics}")

        self.model_results[model_key] = model
        self.save_model(model, model_key)

    def calculate_shap_values_for_model(self, truncation_label: str):
        """Compute SHAP values for the stored model on the accumulated features."""
        model = self.model_results[f"{truncation_label}_f0_5"]
        X_test = self.accumulated_X

        # SHAP value calculation (keeping your original approach)
        self.calculate_shap_values(model, X_test, truncation_label, 'f0_5')

    def calculate_shap_values(self, model: "XGBClassifier", X: pd.DataFrame, truncation_label: str, model_type: str):
        """Explain `model` on X and store the values under '<label>_<model_type>'.

        The stored entry holds the SHAP values as nested lists plus the
        feature names, so it can be serialised to JSON.
        """
        explainer = shap.Explainer(model)
        shap_values = explainer(X)
        self.shap_values_dict[f"{truncation_label}_{model_type}"] = {
            'shap_values': shap_values.values.tolist(),
            'feature_names': X.columns.tolist()
        }

    def _build_model_params(self, params: dict) -> dict:
        """Translate Optuna parameter names ('xgb_*') into XGBClassifier kwargs.

        The 'threshold' entry of `params` is not a model argument and is left out.
        """
        return {
            'learning_rate': params['xgb_learning_rate'],
            'n_estimators': params['xgb_n_estimators'],
            'max_depth': params['xgb_max_depth'],
            'subsample': params['xgb_subsample'],
            'colsample_bytree': params['xgb_colsample_bytree'],
            'reg_alpha': params['xgb_reg_alpha'],
            'reg_lambda': params['xgb_reg_lambda'],
            'tree_method': 'hist',
            'device': 'cuda',
            'early_stopping_rounds': 50
        }

    def save_best_params(self):
        """Write all stored best parameters to `config_file` as JSON."""
        with open(self.config_file, 'w') as f:
            json.dump(self.best_params, f)

    def save_model(self, model: "XGBClassifier", model_key: str):
        """Serialise `model` to '<model_dir>/<model_key>.joblib'."""
        joblib.dump(model, f"{self.model_dir}/{model_key}.joblib")
        
    def save_shap_values(self):
        """Write the collected SHAP values to '<model_dir>/shap_values.json'."""
        # Save SHAP values to a JSON file
        shap_values_path = os.path.join(self.model_dir, 'shap_values.json')
        with open(shap_values_path, 'w') as f:
            json.dump(self.shap_values_dict, f)
        logging.info(f"SHAP values saved to {shap_values_path}")

    def summarize_metrics(self):
        """Score every stored model on the accumulated data and save a CSV summary.

        Returns:
            DataFrame with one row per model, also written to
            '<model_dir>/summary_metrics.csv'.
        """
        summary_data = []
        for key, model in self.model_results.items():
            # Takes the text before the first underscore, e.g. '30' for
            # '30_day_data_f0_5' and 'full' for 'full_data_f0_5'.
            truncation_label = key.split('_')[0]
            # Model prediction to compute metrics
            # NOTE(review): the accumulated set includes the rows the model
            # was trained on, so these are not held-out scores.
            y_pred = model.predict_proba(self.accumulated_X)[:, 1]
            metrics = self.compute_metrics(self.accumulated_y, y_pred, self.best_params[key]['threshold'])
            summary_data.append({
                'Model': key,
                'Truncation_Label': truncation_label,
                'Accuracy': metrics.get('Accuracy'),
                'Precision': metrics.get('Precision'),
                'Recall': metrics.get('Recall'),
                'F0.5 Score': metrics.get('F0.5 Score'),
            })
        
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_csv(os.path.join(self.model_dir, 'summary_metrics.csv'), index=False)
        return summary_df

    def evaluate_on_holdout(self, df_test: pd.DataFrame) -> pd.DataFrame:
        """Score every stored model on a hold-out frame and save the results.

        Parameters:
            df_test: Frame containing the feature columns and 'merged_category'.

        Returns:
            DataFrame indexed by model key with one column per metric; an
            empty DataFrame when there is no data or no fitted model. The same
            table is written to '<model_dir>/holdout_results.csv' with the
            model key in a 'Model' column.
        """
        if df_test.empty:
            logging.warning("The test DataFrame is empty.")
            return pd.DataFrame()

        # Prepare test data
        X_test = df_test.drop(columns=['merged_category'], errors='ignore')
        y_test = df_test['merged_category']

        if X_test.empty or y_test.empty:
            logging.warning("X_test or y_test is empty after dropping columns.")
            return pd.DataFrame()

        if not self.model_results:
            logging.warning("No models found in self.model_results.")
            return pd.DataFrame()

        holdout_metrics = {}

        for model_key, model in self.model_results.items():
            # Predict probabilities for positive class
            y_pred = model.predict_proba(X_test)[:, 1]

            # Retrieve threshold for the model
            threshold = self.best_params[model_key]['threshold']

            # Compute metrics
            metrics = self.compute_metrics(y_test, y_pred, threshold)
            holdout_metrics[model_key] = metrics

        # Convert metrics to DataFrame and save
        results_df = pd.DataFrame.from_dict(holdout_metrics, orient='index')
        # The model keys live in the index; writing without it would leave
        # rows in the CSV that cannot be attributed to a model.
        results_df.to_csv(os.path.join(self.model_dir, 'holdout_results.csv'), index=True, index_label='Model')

        if not results_df.empty:
            logging.info("Holdout evaluation completed successfully.")
            if self.slack_channel:
                send_slack_message(self.slack_channel, "Holdout evaluation completed successfully.")

        return results_df


# SHAP functions
def average_shap_values(shap_values_dict, model_type_suffixes=('f0_5',)):
    """Reduce stored SHAP values to one mean-absolute value per feature.

    Parameters:
        shap_values_dict (dict): Maps '<truncation_label>_<model_type>' to a
            dict whose 'shap_values' entry is a nested list of SHAP values.
        model_type_suffixes (tuple of str): Model-type names that may
            themselves contain underscores. A key ending in '_<suffix>' has
            exactly that suffix removed; any other key falls back to dropping
            its last underscore-separated token.

    Returns:
        dict: Maps each truncation label to a list of per-feature arrays, one
        array per input key that reduced to that label.
    """
    average_shap_values_dict = {}
    for truncation_label, shap_info in shap_values_dict.items():
        shap_vals = np.array(shap_info['shap_values'])
        if shap_vals.ndim == 3:
            # Average over the first and last axes, keeping the middle one.
            avg_shap_vals = np.mean(np.abs(shap_vals), axis=(0, 2))
        else:
            avg_shap_vals = np.mean(np.abs(shap_vals), axis=0)
        print(f"SHAP values shape for {truncation_label}: {shap_vals.shape}")
        print(f"Averaged SHAP values shape for {truncation_label}: {avg_shap_vals.shape}")

        # Dropping only the last token would turn '30_day_data_f0_5' into
        # '30_day_data_f0', so known multi-token suffixes are stripped whole.
        for suffix in model_type_suffixes:
            marker = f"_{suffix}"
            if truncation_label.endswith(marker) and len(truncation_label) > len(marker):
                truncation_label_simplified = truncation_label[:-len(marker)]
                break
        else:
            truncation_label_simplified = "_".join(truncation_label.split("_")[:-1])

        average_shap_values_dict.setdefault(truncation_label_simplified, []).append(avg_shap_vals)
    return average_shap_values_dict

def combine_and_average_shap_values(average_shap_values_dict, feature_names):
    """Average the per-model SHAP arrays of each label and stack them in one table.

    Parameters:
        average_shap_values_dict (dict): Maps a truncation label to a list of
            per-feature arrays (as returned by average_shap_values).
        feature_names (list of str): Feature name for each array position.

    Returns:
        pd.DataFrame: Columns 'Feature', 'Mean SHAP Value' and
        'Truncation Label', one row per (label, feature). Labels whose array
        length does not match `feature_names` are reported and skipped. An
        empty DataFrame is returned when no label is usable.
    """
    # Collect the per-label frames and concatenate once at the end, instead of
    # growing a DataFrame with pd.concat on every iteration.
    frames = []
    for truncation_label, avg_shap_vals_list in average_shap_values_dict.items():
        avg_shap_vals_across_models = np.mean(avg_shap_vals_list, axis=0)
        print(f"Processing {truncation_label}:")
        print(f" - Averaged SHAP values shape across models: {avg_shap_vals_across_models.shape}")
        print(f" - Feature names length: {len(feature_names)}")
        if len(avg_shap_vals_across_models) != len(feature_names):
            print(f"Length mismatch for {truncation_label}: {len(avg_shap_vals_across_models)} SHAP values vs {len(feature_names)} features")
            continue
        frames.append(pd.DataFrame({
            'Feature': feature_names,
            'Mean SHAP Value': avg_shap_vals_across_models,
            'Truncation Label': truncation_label
        }))

    # pd.concat raises on an empty list, so keep returning an empty frame.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def plot_combined_shap_values(combined_df):
    """Scatter-plot the mean SHAP value of every feature, one series per truncation label.

    Features are ordered by their mean SHAP value across all labels: the
    descending sort is reversed before plotting, and each label's rows are
    arranged in that order.
    """
    per_feature_mean = combined_df.groupby('Feature')['Mean SHAP Value'].mean()
    # Descending sort, then reversed: the order in which rows are plotted.
    feature_order = per_feature_mean.sort_values(ascending=False).index[::-1].tolist()

    plt.figure(figsize=(14, 10))
    for label in combined_df['Truncation Label'].unique():
        rows_for_label = combined_df[combined_df['Truncation Label'] == label]
        ordered_rows = rows_for_label.set_index('Feature').loc[feature_order].reset_index()
        plt.scatter(
            ordered_rows['Mean SHAP Value'],
            ordered_rows['Feature'],
            label=label,
            alpha=0.7,
            s=100,
        )

    plt.xlabel('Mean SHAP Value')
    plt.ylabel('Feature')
    plt.title('Average SHAP Values for Different Truncation Labels')
    plt.legend(title='Truncation Label')
    plt.tight_layout()
    plt.show()

In [17]:
import os
import json
import joblib
import logging
import pandas as pd
import numpy as np
import optuna
from datetime import datetime
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import (
    mean_squared_error, r2_score, accuracy_score, precision_score, recall_score,
    fbeta_score, roc_auc_score, confusion_matrix
)
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Set up logging
logging.basicConfig(level=logging.INFO)

def send_slack_message(channel_id, message):
    """Post `message` to the Slack channel `channel_id`.

    Builds a WebClient from the SLACK_BOT_TOKEN environment variable on each
    call. A successful post is logged at INFO level; a SlackApiError is caught
    and logged at ERROR level rather than propagated.
    """
    client = WebClient(token=os.environ['SLACK_BOT_TOKEN'])
    post_kwargs = {'channel': channel_id, 'text': message}
    try:
        response = client.chat_postMessage(**post_kwargs)
        logging.info(f"Message sent to channel {channel_id}: {response['message']['text']}")
    except SlackApiError as e:
        logging.error(f"Error posting message: {e.response['error']}")

class XGBoostPredictor:
    """Tune, cross-validate and evaluate XGBoost regressors on cumulatively growing data.

    Truncation labels are processed in a fixed order. For each label its
    training rows are appended to an accumulated training set, one Optuna study
    per objective (F0.5 score and ROC AUC) is run on that accumulated set, and
    the best configuration of each study is cross-validated, saved with joblib
    and explained with SHAP.

    Class attributes:
        N_TRIALS (int): Number of Optuna trials per study.
        EARLY_STOPPING_ROUNDS (int): ``early_stopping_rounds`` given to XGBRegressor.
        DEVICE (str): ``device`` given to XGBRegressor.
    """

    N_TRIALS = 1000
    EARLY_STOPPING_ROUNDS = 50
    DEVICE = 'cuda'

    # Objective identifier (used in dictionary keys / file names) -> name used in log messages.
    _OBJECTIVE_LABELS = {'f0_5': 'F0.5', 'roc_auc': 'ROC AUC'}

    def __init__(self, datasets, model_configs, config_file='best_config.json', model_dir='models', slack_channel=None):
        """Store the configuration and prepare the output locations.

        Parameters:
            datasets (dict): Mapping of truncation label to a dict whose
                'train' entry is a DataFrame containing a 'merged_category'
                target column.
            model_configs (dict): Stored on the instance; not read by any
                method of this class.
            config_file (str): Path of the JSON file the best parameters are
                written to. An existing file is deleted.
            model_dir (str): Directory for models, SHAP values and metric
                files. Created when missing.
            slack_channel (str | None): Channel for progress notifications;
                no messages are sent when falsy.
        """
        self.datasets = datasets
        self.model_configs = model_configs
        self.config_file = config_file
        self.model_dir = model_dir
        self.slack_channel = slack_channel
        self.model_results = {}
        self.ensemble_results = {}
        self.best_params = {}
        self.accumulated_X = pd.DataFrame()
        self.accumulated_y = pd.Series(dtype=float)
        self.shap_values_dict = {}
        os.makedirs(model_dir, exist_ok=True)

        # Ensure a fresh start by removing existing best parameters file if it exists
        if os.path.exists(config_file):
            os.remove(config_file)

    def compute_metrics(self, y_true, y_pred, threshold):
        """Compute regression metrics and thresholded classification metrics.

        Both ``y_true`` and ``y_pred`` are binarised with ``>= threshold``
        before the classification metrics are computed.

        Parameters:
            y_true: Ground-truth target values.
            y_pred: Predicted values.
            threshold (float): Cut-off used for binarisation.

        Returns:
            dict: Metric name -> value. 'ROC AUC' is NaN when the binarised
            ground truth contains a single class (the score is undefined then).
        """
        y_true_binary = (y_true >= threshold).astype(int)
        y_pred_binary = (y_pred >= threshold).astype(int)

        # ROC AUC is undefined with a single class; report NaN so that the
        # nanmean-based aggregation of the callers can skip the fold.
        if np.unique(y_true_binary).size < 2:
            roc_auc = np.nan
        else:
            # NOTE: the score is computed on the thresholded predictions, so it
            # depends on ``threshold``.
            roc_auc = roc_auc_score(y_true_binary, y_pred_binary)

        metrics = {
            'Mean Squared Error': mean_squared_error(y_true, y_pred),
            'R^2 Score': r2_score(y_true, y_pred),
            'Accuracy': accuracy_score(y_true_binary, y_pred_binary),
            'Precision': precision_score(y_true_binary, y_pred_binary, average='binary', zero_division=0),
            'Recall': recall_score(y_true_binary, y_pred_binary, average='binary', zero_division=0),
            # beta=0.5 weights precision higher than recall (beta=1 would be the F1 score).
            'F0.5 Score': fbeta_score(y_true_binary, y_pred_binary, beta=0.5, average='binary', zero_division=0),
            'ROC AUC': roc_auc
        }
        return metrics

    def _suggest_params(self, trial):
        """Sample one hyper-parameter configuration (including the threshold) from ``trial``."""
        return {
            'xgb_learning_rate': trial.suggest_float('xgb_learning_rate', 1e-4, 1e-1, log=True),
            'xgb_n_estimators': trial.suggest_int('xgb_n_estimators', 50, 500),
            'xgb_max_depth': trial.suggest_int('xgb_max_depth', 3, 12),
            'xgb_subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
            'xgb_colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
            'xgb_reg_alpha': trial.suggest_float('xgb_reg_alpha', 1e-8, 1.0, log=True),
            'xgb_reg_lambda': trial.suggest_float('xgb_reg_lambda', 1e-8, 1.0, log=True),
            'threshold': trial.suggest_float('threshold', 0.1, 0.9),
        }

    def _build_model(self, params, early_stopping=True):
        """Create an unfitted XGBRegressor from a parameter dict with 'xgb_'-prefixed keys."""
        model_kwargs = {
            'learning_rate': params['xgb_learning_rate'],
            'n_estimators': params['xgb_n_estimators'],
            'max_depth': params['xgb_max_depth'],
            'subsample': params['xgb_subsample'],
            'colsample_bytree': params['xgb_colsample_bytree'],
            'reg_alpha': params['xgb_reg_alpha'],
            'reg_lambda': params['xgb_reg_lambda'],
            'tree_method': 'hist',
            'device': self.DEVICE,
        }
        if early_stopping:
            # Only valid when fit() is given an eval_set.
            model_kwargs['early_stopping_rounds'] = self.EARLY_STOPPING_ROUNDS
        return XGBRegressor(**model_kwargs)

    def _cross_validated_objective(self, trial, metric_key, label):
        """Shared Optuna objective: mean cross-validated value of ``metric_key``.

        The accumulated training set is split into 5 outer folds; the training
        part of every outer fold is split into 3 inner folds on which a model
        is fitted and scored. Inner scores are averaged with nanmean, outer
        scores with mean.

        Parameters:
            trial: Optuna trial used to sample the parameters.
            metric_key (str): Key of the metric in ``compute_metrics``' result.
            label (str): Objective name used in the start-of-trial log line.

        Returns:
            float: The trial score.
        """
        logging.info(f"Starting trial {trial.number} for {label} optimization")

        params = self._suggest_params(trial)
        threshold = params['threshold']

        outer_kf = KFold(n_splits=5, shuffle=True, random_state=42)
        outer_fold_metrics = []

        for train_index, _ in outer_kf.split(self.accumulated_X):
            X_train = self.accumulated_X.iloc[train_index]
            y_train = self.accumulated_y.iloc[train_index]

            inner_kf = KFold(n_splits=3, shuffle=True, random_state=42)
            inner_fold_scores = []

            for inner_train_index, val_index in inner_kf.split(X_train):
                X_inner_train, X_val = X_train.iloc[inner_train_index], X_train.iloc[val_index]
                y_inner_train, y_val = y_train.iloc[inner_train_index], y_train.iloc[val_index]

                model = self._build_model(params)
                model.fit(X_inner_train, y_inner_train, eval_set=[(X_val, y_val)], verbose=False)

                y_val_pred = model.predict(X_val)
                metrics = self.compute_metrics(y_val, y_val_pred, threshold)
                inner_fold_scores.append(metrics[metric_key])

            outer_fold_metrics.append(np.nanmean(inner_fold_scores))

        trial_score = np.mean(outer_fold_metrics)
        logging.info(f"Trial {trial.number} completed with average {metric_key}: {trial_score}")

        # Notify only every tenth trial to keep the channel readable.
        if trial.number % 10 == 0:
            message = f"Trial {trial.number} completed with average {metric_key}: {trial_score}"
            if self.slack_channel:
                send_slack_message(self.slack_channel, message)

        return trial_score

    def objective_f0_5(self, trial):
        """Optuna objective maximising the cross-validated F0.5 score."""
        return self._cross_validated_objective(trial, 'F0.5 Score', 'F0.5')

    def objective_roc_auc(self, trial):
        """Optuna objective maximising the cross-validated ROC AUC."""
        return self._cross_validated_objective(trial, 'ROC AUC', 'ROC AUC')

    def train_and_evaluate(self):
        """Run tuning, 3-fold evaluation, model saving and SHAP for every truncation label.

        Returns:
            pd.DataFrame: The summary produced by ``summarize_metrics``.
        """
        ordered_datasets = ['30_day_data', '60_day_data', '90_day_data', 'full_data']
        objective_functions = {'f0_5': self.objective_f0_5, 'roc_auc': self.objective_roc_auc}

        for truncation_label in ordered_datasets:
            dataset = self.datasets[truncation_label]
            X = dataset['train'].drop(columns=['merged_category'], errors='ignore')
            y = dataset['train']['merged_category']

            # Training data grows cumulatively from one label to the next.
            self.accumulated_X = pd.concat([self.accumulated_X, X], ignore_index=True)
            self.accumulated_y = pd.concat([self.accumulated_y, y], ignore_index=True)

            # Tune only the objectives that have no stored parameters yet. The
            # parameters are stored under "<label>_<objective>", so that is the
            # key that has to be checked.
            tuned = False
            for objective_name, objective_fn in objective_functions.items():
                params_key = f"{truncation_label}_{objective_name}"
                if params_key in self.best_params:
                    continue
                study = optuna.create_study(direction='maximize')
                study.optimize(objective_fn, n_trials=self.N_TRIALS, n_jobs=1)  # Limit to 1 job to avoid concurrent GPU issues
                self.best_params[params_key] = study.best_trial.params
                tuned = True
            if tuned:
                self.save_best_params()

            for objective_name, label in self._OBJECTIVE_LABELS.items():
                best = self.best_params[f"{truncation_label}_{objective_name}"]
                logging.info(f"Best trial params for {truncation_label} {label}: {best}")

            kf = KFold(n_splits=3, shuffle=True, random_state=42)
            fold_metrics = {objective_name: [] for objective_name in self._OBJECTIVE_LABELS}

            for train_index, test_index in kf.split(self.accumulated_X):
                X_train, X_test = self.accumulated_X.iloc[train_index], self.accumulated_X.iloc[test_index]
                y_train, y_test = self.accumulated_y.iloc[train_index], self.accumulated_y.iloc[test_index]

                # Hold out part of the training fold as the early-stopping validation set.
                X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
                    X_train, y_train, test_size=0.2, random_state=42
                )

                for objective_name in self._OBJECTIVE_LABELS:
                    params = self.best_params[f"{truncation_label}_{objective_name}"]

                    # A fresh model per fold, so every entry of model_results
                    # is its own fitted object instead of an alias of one
                    # instance that is refit again and again.
                    model = self._build_model(params)
                    model.fit(X_train_split, y_train_split, eval_set=[(X_val_split, y_val_split)], verbose=False)
                    y_pred = model.predict(X_test)
                    fold_metrics[objective_name].append(
                        self.compute_metrics(y_test, y_pred, params['threshold'])
                    )

                    # NOTE(review): the key has one-second resolution; folds
                    # finishing within the same second would share a key.
                    model_key = f"{truncation_label}_xgboost_{objective_name}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
                    self.model_results[model_key] = model
                    self.save_model(model, model_key)

                    # Calculate SHAP values for feature importance
                    self.calculate_shap_values(model, X_test, truncation_label, objective_name)

            for objective_name, per_fold in fold_metrics.items():
                avg_metrics = {
                    metric: np.nanmean([fm[metric] for fm in per_fold])
                    for metric in per_fold[0]
                }
                self.ensemble_results[f"{truncation_label}_ensemble_{objective_name}"] = avg_metrics

            if self.slack_channel:
                send_slack_message(self.slack_channel, f"Training completed for {truncation_label}")

        self.save_shap_values()
        return self.summarize_metrics()

    def calculate_shap_values(self, model, X, truncation_label, model_type):
        """Compute SHAP values of ``model`` on ``X`` and store them under "<label>_<model_type>".

        An existing entry with the same key is overwritten.
        """
        explainer = shap.Explainer(model)
        shap_values = explainer(X)
        self.shap_values_dict[f"{truncation_label}_{model_type}"] = {
            'shap_values': shap_values.values.tolist(),  # Convert ndarray to list
            'feature_names': X.columns.tolist()
        }

    def save_shap_values(self):
        """Write all stored SHAP values to 'shap_values.json' inside ``model_dir``."""
        with open(os.path.join(self.model_dir, 'shap_values.json'), 'w') as f:
            json.dump(self.shap_values_dict, f)

    def summarize_metrics(self):
        """Tabulate the averaged fold metrics and write them to 'summary_metrics.csv'.

        Keys of ``ensemble_results`` have the form
        "<truncation label>_ensemble_<objective>"; the part before
        "_ensemble_" becomes 'Truncation_Label' and the objective becomes
        'Model' ("xgboost_<objective>").

        Returns:
            pd.DataFrame: One row per entry of ``ensemble_results``.
        """
        summary_columns = [
            'Model', 'Truncation_Label', 'Mean Squared Error', 'R^2 Score',
            'Accuracy', 'Precision', 'Recall', 'F0.5 Score', 'ROC AUC'
        ]
        summary_data = []
        for key, metrics in self.ensemble_results.items():
            if '_ensemble_' in key:
                truncation_label, _, objective_name = key.partition('_ensemble_')
                model_type = f"xgboost_{objective_name}"
            else:
                # Fallback for keys that do not follow the "_ensemble_" convention.
                truncation_label, _ = key.rsplit('_', 1)
                model_type = truncation_label.split('_')[0]

            row = {'Model': model_type, 'Truncation_Label': truncation_label}
            for metric_name in summary_columns[2:]:
                row[metric_name] = metrics.get(metric_name)
            summary_data.append(row)

        # Explicit columns keep the frame well-formed even without any results.
        summary_df = pd.DataFrame(summary_data, columns=summary_columns)
        summary_df.to_csv(os.path.join(self.model_dir, 'summary_metrics.csv'), index=False)
        return summary_df

    def evaluate_on_holdout(self, df_test):
        """Refit every best configuration on all accumulated data and score it on ``df_test``.

        Parameters:
            df_test (pd.DataFrame): Hold-out data containing 'merged_category'.

        Returns:
            pd.DataFrame: One row of metrics per key of ``best_params`` (the
            key is the index). Empty when there is no test data or no trained
            model. The result is also written to 'holdout_results.csv'.
        """
        if df_test.empty:
            logging.warning("The test DataFrame is empty.")
            return pd.DataFrame()

        X_test = df_test.drop(columns=['merged_category'], errors='ignore')
        y_test = df_test['merged_category']

        if X_test.empty or y_test.empty:
            logging.warning("X_test or y_test is empty after dropping columns.")
            return pd.DataFrame()

        if not self.model_results:
            logging.warning("No models found in self.model_results.")
            return pd.DataFrame()

        holdout_ensemble_results = {}

        for truncation_label, best_params in self.best_params.items():
            # No early stopping here: fit() gets no eval_set.
            model = self._build_model(best_params, early_stopping=False)
            threshold = best_params['threshold']

            model.fit(self.accumulated_X, self.accumulated_y)

            y_pred = model.predict(X_test)

            metrics = self.compute_metrics(y_test, y_pred, threshold)
            holdout_ensemble_results[truncation_label] = metrics

            self.plot_confusion_matrix(y_test, y_pred, threshold)

        results_df = pd.DataFrame.from_dict(holdout_ensemble_results, orient='index')
        # Keep the index: it is the only thing that says which row belongs to which configuration.
        results_df.to_csv(os.path.join(self.model_dir, 'holdout_results.csv'), index_label='Model Key')

        if not results_df.empty:
            logging.info("Holdout evaluation completed successfully.")
            if self.slack_channel:
                send_slack_message(self.slack_channel, "Holdout evaluation completed successfully.")

        return results_df

    def plot_confusion_matrix(self, y_true, y_pred, threshold):
        """Show the confusion matrix of the ``>= threshold`` binarised values as a heatmap."""
        y_true_binary = (y_true >= threshold).astype(int)
        y_pred_binary = (y_pred >= threshold).astype(int)
        # Fix the labels so the matrix is 2x2 even when one class is absent,
        # matching the two tick labels below.
        cm = confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1])
        plt.figure(figsize=(10, 7))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.show()

    def save_best_params(self):
        """Write ``best_params`` as JSON to ``config_file``."""
        with open(self.config_file, 'w') as f:
            json.dump(self.best_params, f)

    def save_model(self, model, model_key):
        """Persist ``model`` with joblib as "<model_key>.joblib" inside ``model_dir``."""
        joblib.dump(model, os.path.join(self.model_dir, f"{model_key}.joblib"))

# SHAP functions
def average_shap_values(shap_values_dict):
    """Reduce stored SHAP values to one mean-absolute value per feature, grouped by truncation label.

    Parameters:
        shap_values_dict (dict): Mapping of "<truncation label>_<model type>"
            to a dict whose 'shap_values' entry is a (nested) list of SHAP
            values. 2-D values are averaged over axis 0, 3-D values over the
            axes 0 and 2.

    Returns:
        dict: Truncation label -> list with one averaged array per model type.
        Keys ending in "_f0_5" or "_roc_auc" are grouped under the label
        without that suffix; any other key is grouped under the key without
        its last "_"-separated token.
    """
    # Both model-type suffixes contain an underscore themselves, so dropping
    # only the last "_" token would leave "..._f0" / "..._roc" and keep the two
    # models of one truncation label in separate groups.
    model_type_suffixes = ('_f0_5', '_roc_auc')

    average_shap_values_dict = {}
    for truncation_label, shap_info in shap_values_dict.items():
        shap_vals = np.array(shap_info['shap_values'])
        abs_shap_vals = np.abs(shap_vals)
        avg_shap_vals = np.mean(abs_shap_vals, axis=(0, 2)) if shap_vals.ndim == 3 else np.mean(abs_shap_vals, axis=0)
        print(f"SHAP values shape for {truncation_label}: {shap_vals.shape}")
        print(f"Averaged SHAP values shape for {truncation_label}: {avg_shap_vals.shape}")

        for suffix in model_type_suffixes:
            if truncation_label.endswith(suffix):
                truncation_label_simplified = truncation_label[:-len(suffix)]
                break
        else:
            truncation_label_simplified = "_".join(truncation_label.split("_")[:-1])

        average_shap_values_dict.setdefault(truncation_label_simplified, []).append(avg_shap_vals)
    return average_shap_values_dict

def combine_and_average_shap_values(average_shap_values_dict, feature_names):
    """Average the per-model SHAP arrays of every truncation label into one long table.

    Parameters:
        average_shap_values_dict (dict): Truncation label -> list of arrays
            holding one averaged SHAP value per feature.
        feature_names (list): Feature names, in the order of the array entries.

    Returns:
        pd.DataFrame: Columns 'Feature', 'Mean SHAP Value' and
        'Truncation Label'. Labels whose arrays do not match the number of
        feature names are reported and skipped; when nothing is left the frame
        is empty but still has the three columns.
    """
    columns = ['Feature', 'Mean SHAP Value', 'Truncation Label']

    # Collect the per-label frames and concatenate once instead of growing a
    # DataFrame inside the loop.
    frames = []
    for truncation_label, avg_shap_vals_list in average_shap_values_dict.items():
        avg_shap_vals_across_models = np.mean(avg_shap_vals_list, axis=0)
        print(f"Processing {truncation_label}:")
        print(f" - Averaged SHAP values shape across models: {avg_shap_vals_across_models.shape}")
        print(f" - Feature names length: {len(feature_names)}")
        if len(avg_shap_vals_across_models) != len(feature_names):
            print(f"Length mismatch for {truncation_label}: {len(avg_shap_vals_across_models)} SHAP values vs {len(feature_names)} features")
            continue
        frames.append(pd.DataFrame({
            'Feature': feature_names,
            'Mean SHAP Value': avg_shap_vals_across_models,
            'Truncation Label': truncation_label
        }))

    if not frames:
        # Keep the schema so that consumers selecting these columns do not fail.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

def plot_combined_shap_values(combined_df):
    """Scatter-plot the mean SHAP value of every feature, one series per truncation label.

    Features are placed on the y-axis, ordered by their mean SHAP value averaged
    over all truncation labels.

    Parameters:
        combined_df (pd.DataFrame): Frame with the columns 'Feature',
            'Mean SHAP Value' and 'Truncation Label'.

    Returns:
        None. The figure is shown with ``plt.show()``. When ``combined_df`` is
        empty or has no 'Feature' column, a message is printed and nothing is
        plotted.
    """
    # An empty / column-less frame would otherwise fail with KeyError: 'Feature'
    # in the groupby below.
    if combined_df.empty or 'Feature' not in combined_df.columns:
        print("No SHAP values to plot.")
        return

    mean_shap_values = combined_df.groupby('Feature')['Mean SHAP Value'].mean().sort_values(ascending=False).reset_index()
    # Reverse the descending order before it is used to order the plotted features.
    mean_shap_values = mean_shap_values[::-1]

    plt.figure(figsize=(14, 10))
    for label in combined_df['Truncation Label'].unique():
        subset = combined_df[combined_df['Truncation Label'] == label]
        # Align every series to the same feature order.
        subset_sorted = subset.set_index('Feature').loc[mean_shap_values['Feature']].reset_index()
        plt.scatter(subset_sorted['Mean SHAP Value'], subset_sorted['Feature'], label=label, alpha=0.7, s=100)

    plt.xlabel('Mean SHAP Value')
    plt.ylabel('Feature')
    plt.title('Average SHAP Values for Different Truncation Labels')
    plt.legend(title='Truncation Label')
    plt.tight_layout()
    plt.show()


: 

In [32]:
def main():
    """Run the full pipeline: tuning/training, hold-out evaluation, metric export and SHAP plot.

    Reads the module-level ``datasets`` mapping. Training and hold-out metrics
    are written as JSON-lines files into the predictor's model directory, the
    combined SHAP table as CSV. Any exception is reported to Slack (best
    effort) and then re-raised.
    """
    # Both names are used by the error handler / cleanup and therefore have to
    # exist before anything inside the try block can fail.
    slack_channel = '#hyperparametertuning'
    model_dir = 'models'

    try:
        # Ensure the model directory exists and is clean
        os.makedirs(model_dir, exist_ok=True)
        for file in os.listdir(model_dir):
            if file.endswith('.joblib'):
                os.remove(os.path.join(model_dir, file))

        model_configs = {
            'xgb': {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 6}
        }

        # Initialize and run the predictor
        predictor = XGBoostPredictor(datasets, model_configs, model_dir=model_dir, slack_channel=slack_channel)
        train_metrics = predictor.train_and_evaluate()
        holdout_metrics = predictor.evaluate_on_holdout(datasets['full_data']['test'])

        # Save training metrics to a JSON file with a unique identifier
        train_metrics_file = os.path.join(predictor.model_dir, f"train_metrics_{datetime.now().strftime('%Y%m%d%H%M%S')}.json")
        train_metrics.to_json(train_metrics_file, orient='records', lines=True)
        print(f"Training metrics saved to {train_metrics_file}")

        # Save holdout metrics to a JSON file with a unique identifier.
        # orient='records' drops the index, which holds the configuration
        # each row belongs to, so it is turned into a regular column first.
        holdout_metrics_file = os.path.join(predictor.model_dir, f"holdout_metrics_{datetime.now().strftime('%Y%m%d%H%M%S')}.json")
        holdout_records = holdout_metrics.rename_axis('Model Key').reset_index()
        holdout_records.to_json(holdout_metrics_file, orient='records', lines=True)
        print(f"Holdout metrics saved to {holdout_metrics_file}")

        # Load SHAP values from the saved JSON file
        with open(os.path.join(predictor.model_dir, 'shap_values.json'), 'r') as f:
            shap_values_dict = json.load(f)

        average_shap_values_dict = average_shap_values(shap_values_dict)
        feature_names = datasets["full_data"]["train"].drop(columns=['merged_category'], errors='ignore').columns.tolist()
        combined_df = combine_and_average_shap_values(average_shap_values_dict, feature_names)
        combined_df.to_csv(os.path.join(predictor.model_dir, 'combined_shap_values.csv'), index=False)

        plot_combined_shap_values(combined_df)

        # Perform ensemble evaluation
        #ensemble_predictor = EnsemblePredictor(model_dir=predictor.model_dir, slack_channel=slack_channel)
        #ensemble_metrics = ensemble_predictor.evaluate_on_holdout(datasets['full_data']['test'])
        #print(ensemble_metrics)

    except Exception as e:
        # Send a Slack message if an error occurs. A failing notification must
        # not replace the original error, so it is only logged.
        try:
            send_slack_message(slack_channel, f"Script encountered an error: {str(e)}")
        except Exception:
            logging.exception("Could not send the Slack error notification")
        raise

# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

[I 2024-10-02 14:02:32,518] A new study created in memory with name: no-name-323936ac-68f6-4ec7-b225-631470000905
INFO:root:Starting trial 0 for F0.5 optimization
INFO:root:Starting trial 0 for F0.5 optimization
INFO:root:Trial 0 completed with average F0.5 Score: 0.6798340377253819
INFO:root:Message sent to channel #hyperparametertuning: Trial 0 completed with average F0.5 Score: 0.6798340377253819
[I 2024-10-02 14:02:42,431] Trial 0 finished with value: 0.6798340377253819 and parameters: {'xgb_learning_rate': 0.04278396659243537, 'xgb_n_estimators': 320, 'xgb_max_depth': 9, 'xgb_subsample': 0.7502537919859544, 'xgb_colsample_bytree': 0.8117109216461992, 'xgb_reg_alpha': 5.319909976751335e-08, 'xgb_reg_lambda': 0.0006701803741664304, 'threshold': 0.46661483157409867}. Best is trial 0 with value: 0.6798340377253819.
INFO:root:Starting trial 1 for F0.5 optimization
INFO:root:Trial 1 completed with average F0.5 Score: 0.5817754555440829
[I 2024-10-02 14:02:51,449] Trial 1 finished with v