### Importing necessary libraries

In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, classification_report, roc_auc_score, roc_curve
import xgboost as xgb
from sklearn.exceptions import FitFailedWarning, ConvergenceWarning
warnings.filterwarnings("ignore")


from scripts.modeling_preparation.snap_modeling_prep import SnapProcessor
from scripts.modeling_preparation.social_modeling_prep import SocialProcessor
from scripts.config import SAVED_EMBEDDINGS_SNAP_DIR, PROCESSED_SNAP_MESSAGING_DIR, PROCESSED_SNAP_EMA_DIR
from scripts.config import SAVED_EMBEDDINGS_SOCIAL_DIR, PROCESSED_SOCIAL_MESSAGING_DIR, PROCESSED_SOCIAL_EMA_DIR

### Instantiating the baseline model

Utilizing the following features:

      1. AvgWordCount
      2. AvgResponseTime
      3. MessageCount

#### SNAP Data

In [2]:
# Build the SNAP modelling frame for romantic-partner conversations.
# Embeddings and sentiment features are switched off here, so only the
# message-level baseline features (plus the message count) are requested.
processor = SnapProcessor(
    relationship="Romantic Partner",
    model_name="distilbert/distilbert-base-uncased",
    embedding_method="mean_pooling",
    save_dir=SAVED_EMBEDDINGS_SNAP_DIR,
)

# return_train_test=False: the split is done manually further down.
snap_df = processor.prepare_modeling_df(
    ema_dir=PROCESSED_SNAP_EMA_DIR,
    messaging_dir=PROCESSED_SNAP_MESSAGING_DIR,
    embeddings=False,
    sentiment_analysis=False,
    message_count=True,
    return_train_test=False,
)

In [None]:
snap_df.head()

### Creating the test split based on future dates

In [None]:
# Order rows by participant, then chronologically within each participant
snap_df_sorted = snap_df.sort_values(['Participant ID', 'Date'])
snap_df_sorted.head()

In [5]:
# Chronological per-participant split: the earliest `split_ratio` share of each
# participant's rows goes to training and the remaining, later rows go to
# testing, so every participant is evaluated on dates after their training data.
split_ratio = 0.8

# Collect the per-participant slices and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on
# every iteration and concatenates with an empty frame on the first one.
train_parts = []
test_parts = []

for participant_id, group in snap_df_sorted.groupby('Participant ID'):
    group = group.sort_values(by='Date')  # Ensure the group is sorted by date

    if len(group) > 1:
        split_point = int(len(group) * split_ratio)
        train_parts.append(group.iloc[:split_point])  # earlier dates
        test_parts.append(group.iloc[split_point:])   # later dates
    else:
        # A single row cannot be split, so it is used for testing only
        test_parts.append(group)

# pd.concat raises on an empty list; fall back to an empty frame that keeps the
# original columns so later column lookups still work.
snap_train_df = pd.concat(train_parts) if train_parts else snap_df_sorted.iloc[:0]
snap_test_df = pd.concat(test_parts) if test_parts else snap_df_sorted.iloc[:0]

In [None]:
snap_train_df['Disagreement'].value_counts()

In [None]:
snap_test_df['Disagreement'].value_counts()

In [None]:
snap_train_df['Participant ID'].nunique()

In [None]:
snap_test_df['Participant ID'].nunique()

In [None]:
snap_train_df.head()

In [None]:
snap_test_df.head()

### Storing approach results

In [12]:
# Initialize a dictionary to store all results for this approach
approach_results = {
    'Internal Validation': {
        'Logistic Regression': {},
        'Decision Tree': {},
        'Random Forest': {},
        'XGBoost': {}
    },
    'External Validation': {
        'Logistic Regression': {},
        'Decision Tree': {},
        'Random Forest': {},
        'XGBoost': {}
    }
}

### Internal validation (held-out later ~20% of each participant's SNAP data)

In [13]:
# Identifier columns and the label are not model inputs
X_train = snap_train_df.drop(columns=['Participant ID', 'Date', 'Disagreement'])
X_test = snap_test_df.drop(columns=['Participant ID', 'Date', 'Disagreement'])

# Binary target for each split
y_train = snap_train_df['Disagreement']
y_test = snap_test_df['Disagreement']

In [None]:
X_train.head()

In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
train_index, test_index = X_train.index, X_test.index
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames with the original labels
X_train = pd.DataFrame(data=X_train_scaled, columns=X_train.columns, index=train_index)
X_test = pd.DataFrame(data=X_test_scaled, columns=X_test.columns, index=test_index)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, average_precision_score, auc

def compute_class_weights(y_train):
    """Return 'balanced' class weights for ``y_train`` keyed by class label.

    Args:
        y_train: Array-like of training labels.

    Returns:
        dict mapping each distinct label in ``y_train`` to its weight, in the
        form accepted by an estimator's ``class_weight`` parameter.
    """
    # Imported here so the helper does not depend on a later notebook cell
    # having been run first.
    from sklearn.utils.class_weight import compute_class_weight

    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    # Key by the actual label rather than its position: the two only coincide
    # when the labels happen to be 0..n-1.
    return dict(zip(classes.tolist(), class_weights))

def fit_and_return_results(estimator, param_grid, seeds):
    """Tune ``estimator`` once, then evaluate the best configuration once per seed.

    Hyperparameters are selected with a 5-fold ``GridSearchCV`` scored on
    balanced accuracy. The best estimator is then refit on the training data
    with each seed as its ``random_state`` (when it has that parameter) and
    scored on the test data. Per-seed metrics, classification reports and ROC
    curves are printed/plotted, followed by an averaged report and a bar chart
    of mean and standard deviation per metric.

    The data is not passed in: the function reads the module-level ``X_train``,
    ``y_train``, ``X_test`` and ``y_test`` that exist when it is called.

    Args:
        estimator: Classifier to tune. It is modified in place: when it has a
            ``class_weight`` parameter, that parameter is set to the balanced
            class weights of ``y_train``.
        param_grid: Search space forwarded unchanged to ``GridSearchCV``.
        seeds: Iterable of integer seeds; one evaluation run is made per seed.

    Returns:
        dict mapping 'Balanced Accuracy', 'F1-Score (Macro)',
        'Recall (Class 1)' and 'AUC ROC Score' to a ``(mean, std)`` tuple
        computed over the runs.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    seeds = list(seeds)
    if not seeds:
        # Fail before the expensive grid search rather than after it.
        raise ValueError("seeds must contain at least one random seed")
    n_runs = len(seeds)

    balanced_accuracies = []
    auc_scores = []
    f1_macro_scores = []
    recall_1_scores = []
    classification_reports = []

    # Compute class weights
    class_weights = compute_class_weights(y_train)

    # If the estimator supports class_weight, set it
    if 'class_weight' in estimator.get_params():
        estimator.set_params(class_weight=class_weights)

    # Perform GridSearchCV for hyperparameter tuning with balanced accuracy
    print("Performing hyperparameter tuning...")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FitFailedWarning)
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, n_jobs=-1, scoring='balanced_accuracy')
        grid_search.fit(X_train, y_train)
        print(grid_search.get_params())

    # Print the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Use the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    # Use the best hyperparameters for evaluation across multiple seeds
    for seed in seeds:
        print(f"\nUsing random seed: {seed}")

        # Set the random state for reproducibility
        if 'random_state' in best_model.get_params():
            best_model.set_params(random_state=seed)

        # Refit the selected configuration with the current seed
        best_model.fit(X_train, y_train)

        # get_params must be called; printing the attribute only shows the
        # bound-method repr.
        print(best_model.get_params())

        # Make predictions
        y_pred = best_model.predict(X_test)

        # Evaluate the model using balanced accuracy
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_accuracies.append(balanced_accuracy)
        print("Balanced Accuracy:", balanced_accuracy)

        # Evaluate the model using f1-score(Macro)
        f1_macro_score = f1_score(y_test, y_pred, average='macro')
        f1_macro_scores.append(f1_macro_score)
        print("F-1 Score(Macro):", f1_macro_score)

        # Evaluate the model using recall(Class 1)
        recall_1_score = recall_score(y_test, y_pred, pos_label=1)
        recall_1_scores.append(recall_1_score)
        print("Recall (Class 1):", recall_1_score)

        # Keep the report as a dict for averaging and print the text version
        report = classification_report(y_test, y_pred, output_dict=True)
        classification_reports.append(report)
        print("Classification Report:")
        print(classification_report(y_test, y_pred))

        # Calculate AUC from the predicted probability of class 1
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_pred_proba)
        auc_scores.append(auc_score)
        print(f"AUC Score: {auc_score}")

        # Plot ROC curve
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        plt.show()

    # Means and standard deviations over the runs (computed once and reused
    # for the printout, the chart and the return value)
    mean_balanced_accuracy = np.mean(balanced_accuracies)
    std_balanced_accuracy = np.std(balanced_accuracies)

    mean_f1_macro = np.mean(f1_macro_scores)
    std_f1_macro = np.std(f1_macro_scores)

    mean_recall_1 = np.mean(recall_1_scores)
    std_recall_1 = np.std(recall_1_scores)

    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)

    print(f"\nAverage Balanced Accuracy: {mean_balanced_accuracy}")
    print(f"\nAverage AUC Score: {mean_auc}")
    print(f"Average F-1 Score (Macro) : {mean_f1_macro}")
    print(f"Average Recall Score(Class 1) : {mean_recall_1}")

    # Average the per-run classification reports entry by entry. Per-label
    # entries are dicts of metrics; 'accuracy' is a bare number.
    avg_classification_report = {}
    for label, first_entry in classification_reports[0].items():
        if isinstance(first_entry, dict):
            avg_classification_report[label] = {
                metric: np.mean([report[label][metric] for report in classification_reports])
                for metric in first_entry
            }
        else:
            avg_classification_report[label] = np.mean([report[label] for report in classification_reports])

    # Display the averaged classification report in the regular format
    avg_report_str = f"Classification Report (Avg. scores across {n_runs} runs)\n"
    avg_report_str += f"{'':<15}{'precision':<15}{'recall':<15}{'f1-score':<15}{'support':<15}\n\n"
    for label, values in avg_classification_report.items():
        if isinstance(values, dict):
            avg_report_str += f"{label:<15}{values['precision']:<15.2f}{values['recall']:<15.2f}{values['f1-score']:<15.2f}{values['support']:<15.0f}\n"
        elif label == 'accuracy':
            avg_report_str += f"\n{label:<45}{values:<15.2f}\n"
        else:
            avg_report_str += f"{label:<15}{values:<15.2f}\n"
    print(avg_report_str)

    # Metrics and their corresponding means and standard deviations
    metric_names = ['Balanced Accuracy', 'F1-Score (Macro)', 'Recall (Class 1)', 'AUC ROC Score']
    means = [mean_balanced_accuracy, mean_f1_macro, mean_recall_1, mean_auc]
    stds = [std_balanced_accuracy, std_f1_macro, std_recall_1, std_auc]

    colors = ['red', 'green', 'blue', 'orange']

    # Plotting the bar chart with flipped axes and different colors
    plt.figure(figsize=(7, 2))
    bars = plt.barh(metric_names, means, xerr=stds, capsize=5, color=colors, height=0.3)
    plt.ylabel('Metrics')
    plt.xlabel('Mean Score')
    plt.title(f'Mean and Standard Deviation of Metrics After {n_runs} Runs')
    plt.xlim(0, 1)  # assuming all metrics are in the range [0, 1]

    # Adjusting subplot to reduce spacing
    plt.subplots_adjust(left=0.2, right=0.8, top=0.9, bottom=0.1)

    # Adding text labels with mean and standard deviation values
    for bar, mean, std, color in zip(bars, means, stds, colors):
        xval = bar.get_width()
        offset = std * 1.2  # Push the label past the error bar
        plt.text(xval + offset + 0.01, bar.get_y() + bar.get_height() / 2, f'{mean:.2f} ± {std:.2f}', va='center', ha='left', color=color)

    plt.show()

    key_metric_results = {
        'Balanced Accuracy': (mean_balanced_accuracy, std_balanced_accuracy),
        'F1-Score (Macro)': (mean_f1_macro, std_f1_macro),
        'Recall (Class 1)': (mean_recall_1, std_recall_1),
        'AUC ROC Score': (mean_auc, std_auc)
    }
    return key_metric_results

# Derive ten evaluation seeds from one fixed base seed so that every run of
# the notebook draws the same list.
base_seed = 42
random.seed(base_seed)
# randrange(1, 1001) draws from the same inclusive range 1..1000
seeds = [random.randrange(1, 1001) for _ in range(10)]
print(f"Random Seeds: {seeds}")

In [17]:
from sklearn.metrics import f1_score, recall_score

In [None]:
from sklearn.utils import compute_class_weight

# 'balanced' weights for the labels present in y_train (displayed for inspection)
class_weights = compute_class_weight(class_weight='balanced', y=y_train, classes=np.unique(y_train))
class_weights

#### 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression's default solver (lbfgs) does not support the l1 penalty,
# so in a flat grid every 'l1' candidate fails to fit and, with the fit-failure
# warnings suppressed, is silently scored as NaN. Splitting the grid keeps the
# original l2 candidates unchanged and pairs l1 with liblinear, which supports
# it, so the l1 candidates are actually evaluated.
param_grid_lr = [
    {'penalty': ['l2'], 'max_iter': [100, 200, 300]},
    {'penalty': ['l1'], 'solver': ['liblinear'], 'max_iter': [100, 200, 300]},
]

lr = LogisticRegression(random_state=42)
approach_results['Internal Validation']['Logistic Regression'] = fit_and_return_results(
    estimator=lr, param_grid=param_grid_lr, seeds=seeds
)

#### 2. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Search space for a single decision tree
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# Tune, evaluate over all seeds and store the summary for internal validation
dtc = DecisionTreeClassifier(random_state=42)
approach_results['Internal Validation']['Decision Tree'] = fit_and_return_results(dtc, param_grid_dt, seeds)

#### 3. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest
param_grid_rf = dict(
    n_estimators=[50, 100, 200, 500],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Tune, evaluate over all seeds and store the summary for internal validation
rf = RandomForestClassifier(random_state=42)
approach_results['Internal Validation']['Random Forest'] = fit_and_return_results(rf, param_grid_rf, seeds)

#### 4. XGBoost

In [None]:
import xgboost as xgb

# Search space for XGBoost; scale_pos_weight is part of the grid because the
# classifier has no class_weight parameter for the helper to set.
param_grid_xgb = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5],
    learning_rate=[0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1],
    scale_pos_weight=[1, 3, 5],
)

# Tune, evaluate over all seeds and store the summary for internal validation
xgb_model = xgb.XGBClassifier(random_state=42)
approach_results['Internal Validation']['XGBoost'] = fit_and_return_results(xgb_model, param_grid_xgb, seeds)

In [23]:
#print(json.dumps(approach_results,indent = 4))

### External Validation

In [25]:
# External validation: train on all SNAP data, test on the SOCIAL data set.
# NOTE(review): unlike the internal-validation cell, sentiment_analysis and
# return_train_test are left at their defaults here -- confirm the defaults
# yield the same feature columns.
snap_processor = SnapProcessor(relationship="Romantic Partner")
snap_df = snap_processor.prepare_modeling_df(
    ema_dir=PROCESSED_SNAP_EMA_DIR,
    messaging_dir=PROCESSED_SNAP_MESSAGING_DIR,
    embeddings=False,
    message_count=True,
)

# Same preparation for the SOCIAL data
social_processor = SocialProcessor(relationship="Romantic Partner")
social_df = social_processor.prepare_modeling_df(
    ema_dir=PROCESSED_SOCIAL_EMA_DIR,
    messaging_dir=PROCESSED_SOCIAL_MESSAGING_DIR,
    embeddings=False,
    message_count=True,
)

In [26]:
# Training data: the full SNAP frame without identifier and label columns
X_train = snap_df.drop(columns=['Participant ID', 'Date', 'Disagreement'])
y_train = snap_df['Disagreement']

# External test data: the SOCIAL frame, prepared the same way
X_test = social_df.drop(columns=['Participant ID', 'Date', 'Disagreement'])
y_test = social_df['Disagreement']

In [None]:
X_train

In [28]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training data only, then apply the
# same transformation to the external test data.
scaler = StandardScaler().fit(X_train)
train_index, test_index = X_train.index, X_test.index
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames with the original labels
X_train = pd.DataFrame(data=X_train_scaled, columns=X_train.columns, index=train_index)
X_test = pd.DataFrame(data=X_test_scaled, columns=X_test.columns, index=test_index)

In [None]:
def compute_class_weights(y_train):
    """Return 'balanced' class weights for ``y_train`` keyed by class label.

    Args:
        y_train: Array-like of training labels.

    Returns:
        dict mapping each distinct label in ``y_train`` to its weight, in the
        form accepted by an estimator's ``class_weight`` parameter.
    """
    from sklearn.utils.class_weight import compute_class_weight

    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    # Key by the actual label rather than its position: the two only coincide
    # when the labels happen to be 0..n-1.
    return dict(zip(classes.tolist(), class_weights))

def fit_and_return_results(estimator, param_grid, seeds):
    """Tune ``estimator`` once, then evaluate the best configuration once per seed.

    Hyperparameters are selected with a 5-fold ``GridSearchCV`` scored on
    balanced accuracy. The best estimator is then refit on the training data
    with each seed as its ``random_state`` (when it has that parameter) and
    scored on the test data. Per-seed metrics, classification reports and ROC
    curves are printed/plotted, followed by an averaged report and a bar chart
    of mean and standard deviation per metric.

    The search runs a single time, before the seed loop. The seed is only
    applied to the already-selected model, never to the estimator being
    searched, so repeating the search per seed would redo the same work
    without changing the selected hyperparameters.

    The data is not passed in: the function reads the module-level ``X_train``,
    ``y_train``, ``X_test`` and ``y_test`` that exist when it is called.

    Args:
        estimator: Classifier to tune. It is modified in place: when it has a
            ``class_weight`` parameter, that parameter is set to the balanced
            class weights of ``y_train``.
        param_grid: Search space forwarded unchanged to ``GridSearchCV``.
        seeds: Iterable of integer seeds; one evaluation run is made per seed.

    Returns:
        dict mapping 'Balanced Accuracy', 'F1-Score (Macro)',
        'Recall (Class 1)' and 'AUC ROC Score' to a ``(mean, std)`` tuple
        computed over the runs.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    seeds = list(seeds)
    if not seeds:
        # Fail before the expensive grid search rather than after it.
        raise ValueError("seeds must contain at least one random seed")
    n_runs = len(seeds)

    balanced_accuracies = []
    auc_scores = []
    f1_macro_scores = []
    recall_1_scores = []
    classification_reports = []

    # Compute class weights
    class_weights = compute_class_weights(y_train)

    # If the estimator supports class_weight, set it
    if 'class_weight' in estimator.get_params():
        estimator.set_params(class_weight=class_weights)

    # Perform GridSearchCV for hyperparameter tuning with balanced accuracy
    print("Performing hyperparameter tuning...")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FitFailedWarning)
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, n_jobs=-1, scoring='balanced_accuracy')
        grid_search.fit(X_train, y_train)

    # Print the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Use the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    for seed in seeds:
        print(f"\nUsing random seed: {seed}")

        # Set the random state for reproducibility in the best model
        if 'random_state' in best_model.get_params():
            best_model.set_params(random_state=seed)

        # Refit the selected configuration with the current seed
        best_model.fit(X_train, y_train)

        # Make predictions
        y_pred = best_model.predict(X_test)

        # Evaluate the model using balanced accuracy
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        balanced_accuracies.append(balanced_accuracy)
        print("Balanced Accuracy:", balanced_accuracy)

        # Evaluate the model using f1-score(Macro)
        f1_macro_score = f1_score(y_test, y_pred, average='macro')
        f1_macro_scores.append(f1_macro_score)
        print("F-1 Score(Macro):", f1_macro_score)

        # Evaluate the model using recall(Class 1)
        recall_1_score = recall_score(y_test, y_pred, pos_label=1)
        recall_1_scores.append(recall_1_score)
        print("Recall (Class 1):", recall_1_score)

        # Keep the report as a dict for averaging and print the text version
        report = classification_report(y_test, y_pred, output_dict=True)
        classification_reports.append(report)
        print("Classification Report:")
        print(classification_report(y_test, y_pred))

        # Calculate AUC from the predicted probability of class 1
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_pred_proba)
        auc_scores.append(auc_score)
        print(f"AUC Score: {auc_score}")

        # Plot ROC curve
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        plt.show()

    # Means and standard deviations over the runs (computed once and reused
    # for the printout, the chart and the return value)
    mean_balanced_accuracy = np.mean(balanced_accuracies)
    std_balanced_accuracy = np.std(balanced_accuracies)

    mean_f1_macro = np.mean(f1_macro_scores)
    std_f1_macro = np.std(f1_macro_scores)

    mean_recall_1 = np.mean(recall_1_scores)
    std_recall_1 = np.std(recall_1_scores)

    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)

    print(f"\nAverage Balanced Accuracy: {mean_balanced_accuracy}")
    print(f"Average AUC Score: {mean_auc}")
    print(f"Average F-1 Score (Macro) : {mean_f1_macro}")
    print(f"Average Recall Score(Class 1) : {mean_recall_1}")

    # Average the per-run classification reports entry by entry. Per-label
    # entries are dicts of metrics; 'accuracy' is a bare number.
    avg_classification_report = {}
    for label, first_entry in classification_reports[0].items():
        if isinstance(first_entry, dict):
            avg_classification_report[label] = {
                metric: np.mean([report[label][metric] for report in classification_reports])
                for metric in first_entry
            }
        else:
            avg_classification_report[label] = np.mean([report[label] for report in classification_reports])

    # Display the averaged classification report in the regular format
    avg_report_str = f"Classification Report (Avg. scores across {n_runs} runs)\n"
    avg_report_str += f"{'':<15}{'precision':<15}{'recall':<15}{'f1-score':<15}{'support':<15}\n\n"
    for label, values in avg_classification_report.items():
        if isinstance(values, dict):
            avg_report_str += f"{label:<15}{values['precision']:<15.2f}{values['recall']:<15.2f}{values['f1-score']:<15.2f}{values['support']:<15.0f}\n"
        elif label == 'accuracy':
            avg_report_str += f"\n{label:<45}{values:<15.2f}\n"
        else:
            avg_report_str += f"{label:<15}{values:<15.2f}\n"
    print(avg_report_str)

    # Metrics and their corresponding means and standard deviations
    metric_names = ['Balanced Accuracy', 'F1-Score (Macro)', 'Recall (Class 1)', 'AUC ROC Score']
    means = [mean_balanced_accuracy, mean_f1_macro, mean_recall_1, mean_auc]
    stds = [std_balanced_accuracy, std_f1_macro, std_recall_1, std_auc]

    colors = ['red', 'green', 'blue', 'orange']

    # Plotting the bar chart with flipped axes and different colors
    plt.figure(figsize=(7, 2))
    bars = plt.barh(metric_names, means, xerr=stds, capsize=5, color=colors, height=0.3)
    plt.ylabel('Metrics')
    plt.xlabel('Mean Score')
    plt.title(f'Mean and Standard Deviation of Metrics After {n_runs} Runs')
    plt.xlim(0, 1)  # assuming all metrics are in the range [0, 1]

    # Adjusting subplot to reduce spacing
    plt.subplots_adjust(left=0.2, right=0.8, top=0.9, bottom=0.1)

    # Adding text labels with mean and standard deviation values
    for bar, mean, std, color in zip(bars, means, stds, colors):
        xval = bar.get_width()
        offset = std * 1.2  # Push the label past the error bar
        plt.text(xval + offset + 0.01, bar.get_y() + bar.get_height() / 2, f'{mean:.2f} ± {std:.2f}', va='center', ha='left', color=color)

    plt.show()

    key_metric_results = {
        'Balanced Accuracy': (mean_balanced_accuracy, std_balanced_accuracy),
        'F1-Score (Macro)': (mean_f1_macro, std_f1_macro),
        'Recall (Class 1)': (mean_recall_1, std_recall_1),
        'AUC ROC Score': (mean_auc, std_auc)
    }
    return key_metric_results


# Derive ten evaluation seeds from one fixed base seed so that every run of
# the notebook draws the same list.
base_seed = 42
random.seed(base_seed)
# randrange(1, 1001) draws from the same inclusive range 1..1000
seeds = [random.randrange(1, 1001) for _ in range(10)]
print(f"Random Seeds: {seeds}")

In [None]:
from sklearn.utils import compute_class_weight

# 'balanced' weights for the labels present in y_train (displayed for inspection)
class_weights = compute_class_weight(class_weight='balanced', y=y_train, classes=np.unique(y_train))
class_weights

#### 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression's default solver (lbfgs) does not support the l1 penalty,
# so in a flat grid every 'l1' candidate fails to fit and, with the fit-failure
# warnings suppressed, is silently scored as NaN. Splitting the grid keeps the
# original l2 candidates unchanged and pairs l1 with liblinear, which supports
# it, so the l1 candidates are actually evaluated.
param_grid_lr = [
    {'penalty': ['l2'], 'max_iter': [100, 200, 300]},
    {'penalty': ['l1'], 'solver': ['liblinear'], 'max_iter': [100, 200, 300]},
]

lr = LogisticRegression(random_state=42)
approach_results['External Validation']['Logistic Regression'] = fit_and_return_results(
    estimator=lr, param_grid=param_grid_lr, seeds=seeds
)

#### 2. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Search space for a single decision tree
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# Tune, evaluate over all seeds and store the summary for external validation
dtc = DecisionTreeClassifier(random_state=42)
approach_results['External Validation']['Decision Tree'] = fit_and_return_results(dtc, param_grid_dt, seeds)

#### 3. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest
param_grid_rf = dict(
    n_estimators=[50, 100, 200, 500],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Tune, evaluate over all seeds and store the summary for external validation
rf = RandomForestClassifier(random_state=42)
approach_results['External Validation']['Random Forest'] = fit_and_return_results(rf, param_grid_rf, seeds)

#### 4. XGBoost

In [None]:
# Search space for XGBoost; scale_pos_weight is part of the grid because the
# classifier has no class_weight parameter for the helper to set.
param_grid_xgb = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5],
    learning_rate=[0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1],
    scale_pos_weight=[1, 3, 5],
)

# Tune, evaluate over all seeds and store the summary for external validation
xgb_model = xgb.XGBClassifier(random_state=42)
approach_results['External Validation']['XGBoost'] = fit_and_return_results(xgb_model, param_grid_xgb, seeds)

In [None]:
print(json.dumps(approach_results, indent=4))