In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch

def prediction_model_DT_multilabel_basic(X_Embeddings_Autoencoder, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension, data_source="one"):
    """Train a default-parameter decision tree on the embeddings and score it.

    The data is split 80/20 (``random_state=42``), a
    ``DecisionTreeClassifier(random_state=42)`` is fitted on the training
    part, and the held-out part is used for all reported metrics.

    Args:
        X_Embeddings_Autoencoder: Feature data, either a ``torch.Tensor`` or
            anything ``np.asarray`` accepts. 1-D input is treated as a single
            feature column; input with two or more dimensions is used as-is
            (rows are samples).
        y_Cell_Type: Target labels, one per row of the feature data.
        subject_label: Label written to the ``Applications_Condition`` columns.
        subject_outlier: Component of the ``Applications`` label.
        subject_normalization: Component of the ``Applications`` label.
        subject_autoencoder: Component of the ``Applications`` label.
        subject_dimension: Component of the ``Applications`` label.
        data_source: Currently unused; kept for interface compatibility.

    Returns:
        dict with keys:
            ``"Model"``: the fitted classifier.
            ``"Predictions"``: predictions for the test split.
            ``"Statistics"``: per-class classification report as a DataFrame,
                with ``Parameters``, ``Applications`` and
                ``Applications_Condition`` columns appended.
            ``"Statistics_DF"``: one-row DataFrame with accuracy and
                weighted precision / recall / F1.
    """
    # Convert the features to a 2-D numpy array.
    if isinstance(X_Embeddings_Autoencoder, torch.Tensor):
        # detach() + cpu() so tensors that track gradients or live on an
        # accelerator can be converted; a plain CPU tensor is unaffected.
        X = X_Embeddings_Autoencoder.detach().cpu().numpy()
        print("Way A.1")
    else:
        X = np.asarray(X_Embeddings_Autoencoder)
        print("Way A.2")

    # Only a 1-D vector needs promoting to a column. Reshaping an (n, d)
    # matrix with (-1, 1) would yield n*d rows and no longer line up with y.
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    y = np.array(y_Cell_Type)  # Assuming Cell_Type is your target array

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initializing and training the DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Making predictions on the test set
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Convert the classification report into a DataFrame for detailed statistics
    statistics_DT = pd.DataFrame(report).transpose()

    # Constructing a model label based on the provided parameters
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    statistics_DT['Parameters'] = "Basic"
    statistics_DT['Applications'] = applications_label
    statistics_DT['Applications_Condition'] = f"{subject_label}_DT"

    # Creating a summary DataFrame for an overview of the model's performance
    statistics_DT_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": "DT",
        "Parameters": ["Basic"],
        "Accuracy": [accuracy_score(y_test, y_pred)],
        "Precision": [precision_score(y_test, y_pred, average='weighted', zero_division=0)],
        "Recall": [recall_score(y_test, y_pred, average='weighted', zero_division=0)],
        "F1": [f1_score(y_test, y_pred, average='weighted', zero_division=0)],
    })

    # Packaging results into a dictionary for easy access and interpretation
    return {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_DT,
        "Statistics_DF": statistics_DT_DF,
    }

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn.multiclass import OneVsRestClassifier
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import torch

def prediction_model_DT_multilabel(X_Embeddings_Autoencoder, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension):
    """Train a scaled, SMOTE-resampled one-vs-rest decision tree via grid search.

    The data is split 80/20 (``random_state=42``). A pipeline of
    ``StandardScaler`` -> ``SMOTE`` -> ``OneVsRestClassifier(DecisionTreeClassifier)``
    is tuned with 5-fold ``GridSearchCV`` (accuracy scoring) on the training
    part, and the best estimator is evaluated on the held-out part.

    Args:
        X_Embeddings_Autoencoder: Feature data, either a ``torch.Tensor`` or
            anything ``np.asarray`` accepts. 1-D input is treated as a single
            feature column; input with two or more dimensions is used as-is
            (rows are samples).
        y_Cell_Type: Target labels, one per row of the feature data.
        subject_label: Label written to the condition columns.
        subject_outlier: Component of the model label.
        subject_normalization: Component of the model label.
        subject_autoencoder: Component of the model label.
        subject_dimension: Component of the model label.

    Returns:
        dict with keys:
            ``"Model"``: the best pipeline found by the grid search.
            ``"Predictions"``: predictions for the test split.
            ``"Statistics"``: per-class classification report as a DataFrame,
                with ``Parameters``, ``Model`` and ``Model_Condition`` columns
                appended.
            ``"Statistics_DF"``: one-row DataFrame with accuracy and the
                weighted-average precision / recall / F1 from the report.
    """
    # Convert the features to a 2-D numpy array.
    if isinstance(X_Embeddings_Autoencoder, torch.Tensor):
        # detach() + cpu() so tensors that track gradients or live on an
        # accelerator can be converted; a plain CPU tensor is unaffected.
        X = X_Embeddings_Autoencoder.detach().cpu().numpy()
        print("Way A.1")
    else:
        X = np.asarray(X_Embeddings_Autoencoder)
        print("Way A.2")

    # Only a 1-D vector needs promoting to a column. Reshaping an (n, d)
    # matrix with (-1, 1) would yield n*d rows and no longer line up with y.
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    y = np.array(y_Cell_Type)  # Assuming Cell_Type is your target array

    # Step 1: Splitting the dataset into training and testing sets
    X_DT_train, X_DT_test, y_DT_train, y_DT_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 2: Pipeline Creation
    # Scaling and resampling live inside the pipeline, so during the grid
    # search they are fitted on the training portion of each CV fold.
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),  # Feature scaling
        ('smote', SMOTE(random_state=42)),  # Oversampling for class imbalance
        ('clf', OneVsRestClassifier(DecisionTreeClassifier(random_state=42))),
    ])

    # Step 3: Hyperparameter Grid Setup
    # Currently pinned to a single combination; the wider grid that was
    # previously listed is max_depth [None, 10, 20, 30],
    # min_samples_leaf [1, 2, 4], min_samples_split [2, 5, 10].
    param_grid = {
        'clf__estimator__max_depth': [10],
        'clf__estimator__min_samples_leaf': [2],
        'clf__estimator__min_samples_split': [5],
    }

    # Step 4: Model Training with Grid Search
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=make_scorer(accuracy_score), n_jobs=-1)
    grid_search.fit(X_DT_train, y_DT_train)

    # Step 5: Best Model Selection
    best_pipeline = grid_search.best_estimator_

    # Step 6: Model Evaluation on the held-out test set
    y_pred = best_pipeline.predict(X_DT_test)
    report = classification_report(y_DT_test, y_pred, output_dict=True, zero_division=0)
    statistics_DT = pd.DataFrame(report).transpose()

    # Step 7: Statistics Compilation
    model_label = f"Outlier_{subject_outlier}_Normalization_{subject_normalization}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"
    best_params_text = str(grid_search.best_params_)
    statistics_DT['Parameters'] = best_params_text
    statistics_DT['Model'] = model_label
    statistics_DT['Model_Condition'] = f"{subject_label}_DT"

    # Step 8: Summary DataFrame Creation
    statistics_DT_DF = pd.DataFrame({
        "Applications": [model_label],
        "Applications_Condition": [subject_label],
        "Model": "DT",
        "Parameters": [best_params_text],
        "Accuracy": [accuracy_score(y_DT_test, y_pred)],
        "Precision": [statistics_DT.loc["weighted avg", "precision"]],
        "Recall": [statistics_DT.loc["weighted avg", "recall"]],
        "F1": [statistics_DT.loc["weighted avg", "f1-score"]],
    })

    # Step 9: Result Packaging
    return {
        "Model": best_pipeline,
        "Predictions": y_pred,
        "Statistics": statistics_DT,
        "Statistics_DF": statistics_DT_DF,
    }