In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import torch

def dict_to_str(d):
    """Flatten a mapping into one underscore-delimited string.

    Every key/value pair is rendered as ``key_value``; the pairs are then
    joined with ``_`` in the mapping's iteration order. The resulting string
    is printed to stdout and also returned to the caller.
    """
    pieces = []
    for key, value in d.items():
        pieces.append(f'{key}_{value}')
    joined = '_'.join(pieces)
    print(joined)
    return joined

def prediction_model_RF_multilabel_basic(X_Embeddings_Autoencoder, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension, data_source="one"):
    """Fit a default RandomForestClassifier on embeddings and report hold-out metrics.

    The data is split 80/20 with ``random_state=42``, a RandomForestClassifier
    with ``random_state=42`` is fitted on the training part, and predictions
    on the test part are scored.

    Parameters
    ----------
    X_Embeddings_Autoencoder : torch.Tensor or array-like
        Features, one row per sample. A 1-D input is treated as a single
        feature column; inputs that are already 2-D are used unchanged.
    y_Cell_Type : array-like
        Target labels, one entry per sample.
    subject_label : str
        Label written to the ``Applications_Condition`` columns.
    subject_outlier, subject_normalization, subject_autoencoder, subject_dimension :
        Values combined (in the order outlier, autoencoder, normalization,
        dimension) into the ``Applications`` label.
    data_source : str, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    dict
        ``"Model"``: the fitted classifier; ``"Predictions"``: predictions for
        the test split; ``"Statistics"``: the classification report as a
        DataFrame with ``Parameters``, ``Applications`` and
        ``Applications_Condition`` columns added; ``"Statistics_DF"``: a
        one-row summary with accuracy and weighted precision/recall/F1.
    """
    # Convert embeddings to a numpy array.
    if isinstance(X_Embeddings_Autoencoder, torch.Tensor):
        # detach()/cpu() keep the conversion working for tensors that still
        # track gradients or live on an accelerator; a bare .numpy() raises
        # for those.
        X = X_Embeddings_Autoencoder.detach().cpu().numpy()
        print("Way A.1")
    else:
        X = np.array(X_Embeddings_Autoencoder)
        print("Way A.2")

    # Only promote scalars / 1-D vectors to a single feature column. Reshaping
    # an (n_samples, n_features) matrix with (-1, 1) would flatten it and
    # break the row alignment with y.
    if X.ndim < 2:
        X = X.reshape(-1, 1)

    y = np.array(y_Cell_Type)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a RandomForestClassifier with default hyperparameters
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions and build the classification report
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Transpose so each report entry becomes a row and each metric a column
    statistics_RF = pd.DataFrame(report).transpose()

    # Tag the report with the run configuration
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    statistics_RF['Parameters'] = "Basic"
    statistics_RF['Applications'] = applications_label
    statistics_RF['Applications_Condition'] = f"{subject_label}_RF"

    # One-row summary of the headline metrics
    statistics_RF_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["RF"],
        "Parameters": ["Basic"],
        "Accuracy": [accuracy_score(y_test, y_pred)],
        "Precision": [precision_score(y_test, y_pred, average='weighted', zero_division=0)],
        "Recall": [recall_score(y_test, y_pred, average='weighted', zero_division=0)],
        "F1": [f1_score(y_test, y_pred, average='weighted', zero_division=0)],
    })

    # Package results into a dictionary
    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_RF,
        "Statistics_DF": statistics_RF_DF
    }

    return result_dict

In [None]:
# Required Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np
import torch

def dict_to_str(d):
    """Render a mapping as ``key_value`` pairs joined by underscores.

    Pairs appear in the mapping's iteration order. The string is echoed to
    stdout and then returned.
    """
    flattened = '_'.join(f'{name}_{val}' for name, val in d.items())
    print(flattened)
    return flattened

def prediction_model_RF_multilabel(X_Embeddings_Autoencoder, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension, debug=True):
    """Fit a SMOTE + one-vs-rest random forest via grid search and report hold-out metrics.

    The data is split 80/20 with ``random_state=42``. A pipeline of SMOTE
    followed by ``OneVsRestClassifier(RandomForestClassifier)`` is tuned with
    5-fold ``GridSearchCV`` (accuracy scoring) on the training part, and the
    best pipeline is scored on the test part.

    Parameters
    ----------
    X_Embeddings_Autoencoder : torch.Tensor or array-like
        Features, one row per sample. A 1-D input is treated as a single
        feature column; inputs that are already 2-D are used unchanged.
    y_Cell_Type : array-like
        Target labels, one entry per sample.
    subject_label : str
        Label written to ``Model_Condition`` / ``Applications_Condition``.
    subject_outlier, subject_normalization, subject_autoencoder, subject_dimension :
        Values embedded in the descriptive model label.
    debug : bool, optional
        When true (the default) the conversion path, the model label and the
        detailed statistics are printed.

    Returns
    -------
    dict
        ``"Model"``: the best fitted pipeline; ``"Predictions"``: predictions
        for the test split; ``"Statistics"``: the classification report as a
        DataFrame with ``Parameters``, ``Model`` and ``Model_Condition``
        columns; ``"Statistics_DF"``: a one-row summary with accuracy and the
        weighted-average precision/recall/F1.
    """
    # Convert embeddings to a numpy array.
    if isinstance(X_Embeddings_Autoencoder, torch.Tensor):
        # detach()/cpu() keep the conversion working for tensors that still
        # track gradients or live on an accelerator; a bare .numpy() raises
        # for those.
        X = X_Embeddings_Autoencoder.detach().cpu().numpy()
        if debug:
            print("Way A.1")
    else:
        X = np.array(X_Embeddings_Autoencoder)
        if debug:
            print("Way A.2")

    # Only promote scalars / 1-D vectors to a single feature column. Reshaping
    # an (n_samples, n_features) matrix with (-1, 1) would flatten it and
    # break the row alignment with y.
    if X.ndim < 2:
        X = X.reshape(-1, 1)

    y = np.array(y_Cell_Type)

    # Step 1: Data Preprocessing - Splitting the dataset for training and testing
    X_RF_train, X_RF_test, y_RF_train, y_RF_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 2: Pipeline Setup - Oversampling followed by the classifier.
    # NOTE(review): SMOTE needs enough samples of every class in each CV
    # training fold to find neighbours; very rare classes will presumably make
    # the fit raise - confirm class sizes against the data.
    pipeline = imblearnPipeline([
        ('smote', SMOTE(random_state=42)),  # Handling class imbalance
        ('rf', OneVsRestClassifier(RandomForestClassifier(random_state=42), n_jobs=-1))  # Model definition
    ])

    # Step 3: Hyperparameter Tuning - A single-point grid; the wider candidate
    # lists are kept in the trailing comments.
    param_grid = {
        'rf__estimator__n_estimators': [100], #[100, 200, 300],
        'rf__estimator__max_depth': [10], #[None, 10, 20, 30],
        'rf__estimator__min_samples_leaf': [2], #[1, 2, 4],
        'rf__estimator__min_samples_split': [2] # [2, 5, 10]
    }

    # Step 4: Model Selection - Executing grid search to find the best model
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_RF_train, y_RF_train)

    # Step 5: Model Evaluation - Assessing model performance on the test split
    best_pipeline = grid_search.best_estimator_
    y_pred = best_pipeline.predict(X_RF_test)
    report = classification_report(y_RF_test, y_pred, output_dict=True, zero_division=0)
    statistics_RF = pd.DataFrame(report).transpose()

    # Step 6: Results Compilation - Tagging the report with the run configuration
    best_params_label = str(grid_search.best_params_)
    model_label = f"Outlier_{subject_outlier}_Normalization_{subject_normalization}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"
    statistics_RF['Parameters'] = best_params_label
    statistics_RF['Model'] = model_label
    statistics_RF['Model_Condition'] = f"{subject_label}_RF"

    # Step 7: Detailed Statistics - Fixing the column order of the detailed report
    statistics_RF_Detailed = statistics_RF[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']]
    if debug:
        print(f"Model Label: {model_label}")
        print(statistics_RF_Detailed)

    # Step 8: Summary Statistics - One-row summary of the headline metrics
    statistics_RF_DF = pd.DataFrame({
        "Applications": [model_label],
        "Applications_Condition": [subject_label],
        "Model": ["RF"],
        "Parameters": [best_params_label],
        "Accuracy": [accuracy_score(y_RF_test, y_pred)],
        "Precision": [statistics_RF_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_RF_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_RF_Detailed.loc["weighted avg", "f1-score"]],
    })

    # Step 9: Packaging Results - Organizing all relevant model details into a dictionary
    result_dict = {
        "Model": best_pipeline,
        "Predictions": y_pred,
        "Statistics": statistics_RF_Detailed,
        "Statistics_DF": statistics_RF_DF
    }

    # Returning the compiled results for further analysis or reporting
    return result_dict