In [None]:
# Banner line printed when this cell runs.
print("Tools of ML")

In [None]:
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, classification_report
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

### SGD | Advanced | Multilabel

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd

def prediction_model_SGD_multilabel(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension):
    """Grid-search, fit and evaluate a one-vs-rest SGD classifier.

    The data is split 80/20 with a fixed seed. A pipeline of
    StandardScaler -> SMOTE -> OneVsRestClassifier(SGDClassifier) is tuned
    with 5-fold GridSearchCV on weighted F1, and the refitted best pipeline
    is scored on the held-out 20%.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Targets aligned with the rows of the feature matrix.
            NOTE(review): presumably a 1-D class-label vector, since it is
            passed through SMOTE -- confirm against the caller.
        subject_label: Tag used to build the ``Model_Condition`` column.
        subject_outlier: Value embedded in the model label.
        subject_autoencoder: Value embedded in the model label.
        subject_dimension: Value embedded in the model label.

    Returns:
        dict with keys ``"Model"`` (best fitted pipeline), ``"Predictions"``
        (test-set predictions), ``"Statistics"`` (per-class report DataFrame
        with label columns) and ``"Statistics_DF"`` (one-row summary).
    """
    # Step 1: Data Splitting
    # Hold out 20% of the samples for the final evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X_Gene_Marker_Embeddings, y_Cell_Type, test_size=0.2, random_state=42
    )

    # Step 2: Pipeline Creation
    # SMOTE lives inside the imblearn pipeline so it is applied to training
    # folds only, never to validation or test data.
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),  # Feature standardisation
        ('smote', SMOTE(random_state=42)),  # Oversampling for class imbalance
        ('sgd', OneVsRestClassifier(SGDClassifier(loss='log_loss', random_state=42)))
    ])

    # Step 3: Hyperparameter Grid Setup
    param_grid = {
        'sgd__estimator__alpha': [0.0001, 0.001, 0.01],  # Regularization strength
        'sgd__estimator__max_iter': [1000, 2000],  # Iteration cap
        'sgd__estimator__tol': [1e-3, 1e-4],  # Stopping tolerance
        'sgd__estimator__penalty': ['l2', 'l1', 'elasticnet']  # Regularization type
    }

    # Step 4: Model Training with Grid Search
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Step 5: Best Model Selection
    best_model = grid_search.best_estimator_
    best_params_text = str(grid_search.best_params_)

    # Step 6: Model Evaluation
    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    statistics_SGD = pd.DataFrame(report).transpose()

    # Step 7: Statistics Compilation
    model_label = f"Outlier_{subject_outlier}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"
    statistics_SGD['Parameters'] = best_params_text
    statistics_SGD['Model'] = model_label
    statistics_SGD['Model_Condition'] = f"{subject_label}_SGD"

    # Step 8: Summary DataFrame Creation
    # zero_division=0 matches the classification_report call above, so classes
    # that are never predicted score 0 without an UndefinedMetricWarning.
    statistics_SGD_DF = pd.DataFrame({
        "Model": [model_label],
        "Accuracy": [accuracy_score(y_test, y_pred)],
        "Precision": [precision_score(y_test, y_pred, average='weighted', zero_division=0)],
        "Recall": [recall_score(y_test, y_pred, average='weighted', zero_division=0)],
        "F1": [f1_score(y_test, y_pred, average='weighted', zero_division=0)],
        "Parameters": [best_params_text],
    })

    # Step 9: Result Packaging
    result_dict = {
        "Model": best_model,
        "Predictions": y_pred,
        "Statistics": statistics_SGD,
        "Statistics_DF": statistics_SGD_DF
    }

    return result_dict


### SVM | Advanced | Multilabel

In [None]:
# Required libraries
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np
import os
import pickle

def prediction_model_SVM_with_GridSearch(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension):
    """Grid-search, fit and evaluate a one-vs-rest SVM classifier.

    The data is split 80/20 with a fixed seed. A pipeline of
    StandardScaler -> MinMaxScaler -> SMOTE -> OneVsRestClassifier(SVC) is
    tuned with a shuffled 5-fold StratifiedKFold GridSearchCV. Several metrics
    are recorded, and the model is refit on the weighted-F1 winner before
    being scored on the held-out 20%.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Targets aligned with the rows of the feature matrix.
            NOTE(review): StratifiedKFold and SMOTE are applied to it, so it
            is presumably a 1-D class-label vector -- confirm with the caller.
        subject_label: Tag used for the ``Model_Condition`` columns.
        subject_outlier: Value embedded in the model labels.
        subject_autoencoder: Value embedded in the model labels.
        subject_dimension: Value embedded in the model labels.

    Returns:
        dict with keys ``"Model"`` (best fitted pipeline), ``"Predictions"``
        (test-set predictions), ``"Statistics"`` (per-class report DataFrame
        with label columns) and ``"Statistics_DF"`` (one-row summary).
    """
    # Step 1: Data Preprocessing
    X_SVM_train, X_SVM_test, y_SVM_train, y_SVM_test = train_test_split(
        X_Gene_Marker_Embeddings, y_Cell_Type, test_size=0.2, random_state=42
    )  # Note: For multi-label data, stratify might not work as expected

    # Step 2: Setting up the Machine Learning Pipeline
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),  # Standardisation
        ('normalizer', MinMaxScaler()),  # Rescaling - optional
        ('smote', SMOTE(random_state=42)),  # Handling class imbalance
        ('svc', OneVsRestClassifier(SVC(random_state=42, class_weight='balanced'), n_jobs=-1))  # SVM classifier
    ])

    # Step 3: Defining Hyperparameters for Grid Search
    param_grid = {
        'svc__estimator__degree': [2, 3, 4],
        'svc__estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svc__estimator__coef0': [0.0, 0.5, 1.0],
        'svc__estimator__C': [0.1, 1, 10, 100, 1000],
        'svc__estimator__gamma': ['scale', 'auto']
    }

    # Step 4: Cross-Validation Setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Step 5: Scoring Metrics
    # zero_division=0 keeps folds where a class is never predicted from
    # emitting an UndefinedMetricWarning per candidate; scores are unchanged.
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
        'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
        'f1_weighted': make_scorer(f1_score, average='weighted', zero_division=0),
        'balanced_accuracy': 'balanced_accuracy'
    }

    # Step 6: Performing Grid Search
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_SVM_train, y_SVM_train)  # Training

    # Step 7: Model Selection
    best_model = grid_search.best_estimator_
    best_params_text = str(grid_search.best_params_)

    # Step 8: Model Evaluation
    y_pred = best_model.predict(X_SVM_test)
    report = classification_report(y_SVM_test, y_pred, output_dict=True, zero_division=0)
    statistics_SVM = pd.DataFrame(report).transpose()

    # Step 9: Results Compilation
    model_label = f"Outlier_{subject_outlier}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"
    statistics_SVM['Parameters'] = best_params_text
    statistics_SVM['Model'] = model_label
    # Tag rows as SVM results (this was "_DT", copied from the decision-tree code).
    statistics_SVM['Model_Condition'] = f"{subject_label}_SVM"

    statistics_SVM_Detailed = statistics_SVM[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']]

    # Displaying the detailed results
    print(f"Model Label: {model_label}")
    print(statistics_SVM_Detailed)

    # Step 10: Summary DataFrame
    # Accuracy is computed directly rather than read from the report's
    # "accuracy" row, which is absent when the report has a "micro avg" row.
    statistics_SVM_DF = pd.DataFrame({
        "Model": [f"{subject_outlier}_{subject_autoencoder}_{subject_dimension}"],
        "Model_Condition": [subject_label],
        "Accuracy": [accuracy_score(y_SVM_test, y_pred)],
        "Precision": [statistics_SVM_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_SVM_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_SVM_Detailed.loc["weighted avg", "f1-score"]],
        "Parameters": [best_params_text],
    })

    # Packaging the results into a dictionary
    result_dict = {
        "Model": best_model,
        "Predictions": y_pred,
        "Statistics": statistics_SVM_Detailed,
        "Statistics_DF": statistics_SVM_DF
    }

    return result_dict


### SVM | Basic | Multilabel

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd

def prediction_model_SVM(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label,
                         subject_outlier=None, subject_autoencoder=None, subject_dimension=None):
    """Fit and evaluate a fixed-hyperparameter polynomial SVM with SMOTE.

    The data is split 80/20 (stratified, fixed seed). A pipeline of
    StandardScaler -> MinMaxScaler -> SMOTE -> SVC(kernel='poly', degree=2)
    is fitted on the training part and scored on the held-out part.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Class labels; also used to stratify the split.
        subject_label: Tag used in the parameter label and ``Model_Condition``.
        subject_outlier: Optional value for the ``Model`` label.
        subject_autoencoder: Optional value for the ``Model`` label.
        subject_dimension: Optional value for the ``Model`` label.

        The three optional values were previously read from undefined names.
        When one is omitted, a module-level variable of the same name is used
        if it exists (the old notebook behaviour); otherwise ``"NA"``.

    Returns:
        dict with keys ``"Model"`` (fitted pipeline), ``"Predictions"``
        (test-set predictions), ``"Statistics"`` (per-class report DataFrame
        with label columns) and ``"Statistics_DF"`` (one-row summary).
    """
    debug = True

    def _checkpoint(message):
        # Progress trace, printed only while debug is enabled.
        if debug:
            print(message)

    def _resolve(name, value):
        # Prefer the explicit argument, then a same-named module global.
        if value is not None:
            return value
        return globals().get(name, "NA")

    subject_outlier = _resolve("subject_outlier", subject_outlier)
    subject_autoencoder = _resolve("subject_autoencoder", subject_autoencoder)
    subject_dimension = _resolve("subject_dimension", subject_dimension)

    _checkpoint("Control Point: 1.0 | SVM Model")

    # CONFIG
    X_SVM_train, X_SVM_test, y_SVM_train, y_SVM_test = train_test_split(
        X_Gene_Marker_Embeddings, y_Cell_Type,
        test_size=0.2, random_state=42,
        stratify=y_Cell_Type,  # Ensure stratification
    )
    _checkpoint("Control Point: 2.0 | SVM Model Setup")

    # Fixed-parameter pipeline; SMOTE is applied during fit only.
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),
        ('normalizer', MinMaxScaler()),
        ('smote', SMOTE(random_state=42)),
        ('svc', SVC(kernel='poly', degree=2, coef0=0.5, C=1, gamma='scale', random_state=42, class_weight='balanced'))
    ])

    _checkpoint("Control Point: 3.0 | SVM Model Training")
    pipeline.fit(X_SVM_train, y_SVM_train)

    _checkpoint("Control Point: 4.0 | SVM Model Prediction")
    y_pred = pipeline.predict(X_SVM_test)

    _checkpoint("Control Point: 5.0 | SVM Model Evaluation")
    # zero_division=0 scores never-predicted classes as 0 without a warning.
    report = classification_report(y_SVM_test, y_pred, output_dict=True, zero_division=0)
    statistics_SVM = pd.DataFrame(report).transpose()

    # Label describing the fixed hyperparameters
    model_label = f"SVM_Kernel_poly_Degree_2_Coef0_0.5_C_1_Gamma_scale_{subject_label}"
    model_name = f"{subject_outlier}_{subject_autoencoder}_{subject_dimension}"
    model_condition = f"{subject_label}"
    statistics_SVM['Parameters'] = model_label
    statistics_SVM['Model'] = model_name
    statistics_SVM['Model_Condition'] = model_condition

    # Displaying the results
    statistics_SVM_Detailed = statistics_SVM[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']]
    print(f"Model Label: {model_label}")
    print(statistics_SVM_Detailed)

    _checkpoint("Control Point: 10.0 | SVM Model")
    _checkpoint("Control Point: 10.1 | SVM Model")

    # Accuracy is computed directly instead of being read from the "support"
    # cell of the report's "accuracy" row.
    statistics_SVM_DF = pd.DataFrame({
        "Model": [model_name],
        "Model_Condition": [model_condition],
        "Accuracy": [accuracy_score(y_SVM_test, y_pred)],
        "Precision": [statistics_SVM_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_SVM_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_SVM_Detailed.loc["weighted avg", "f1-score"]],
        "Parameters": [model_label],
    })
    _checkpoint("Control Point: 10.2 | SVM Model")

    result_dict = {
        "Model": pipeline,
        "Predictions": y_pred,
        "Statistics": statistics_SVM_Detailed,
        "Statistics_DF": statistics_SVM_DF,
    }
    _checkpoint("Control Point: 10.3 | SVM Model")
    return result_dict


### Decision Tree | Advanced | Multilabel

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn.multiclass import OneVsRestClassifier
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd

def prediction_model_DT_multilabel(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension):
    """Grid-search, fit and evaluate a one-vs-rest decision-tree classifier.

    Splits the data 80/20 with a fixed seed, then tunes a
    StandardScaler -> SMOTE -> OneVsRestClassifier(DecisionTreeClassifier)
    pipeline with 5-fold GridSearchCV on accuracy. The refitted best pipeline
    is scored on the held-out part.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Targets aligned with the rows of the feature matrix.
        subject_label: Tag used to build the ``Model_Condition`` column.
        subject_outlier: Value embedded in the model label.
        subject_autoencoder: Value embedded in the model label.
        subject_dimension: Value embedded in the model label.

    Returns:
        dict with keys ``"Model"``, ``"Predictions"``, ``"Statistics"`` and
        ``"Statistics_DF"``.
    """
    # Hold out a fifth of the samples for the final evaluation.
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X_Gene_Marker_Embeddings,
        y_Cell_Type,
        test_size=0.2,
        random_state=42,
    )

    # Preprocessing, oversampling and the one-vs-rest tree in one estimator.
    ovr_tree = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
    stages = [
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('clf', ovr_tree),
    ]

    # Candidate tree settings explored by the search.
    search_space = {
        'clf__estimator__max_depth': [None, 10, 20, 30],
        'clf__estimator__min_samples_leaf': [1, 2, 4],
        'clf__estimator__min_samples_split': [2, 5, 10],
    }

    searcher = GridSearchCV(
        imblearnPipeline(stages),
        search_space,
        cv=5,
        scoring=make_scorer(accuracy_score),
        n_jobs=-1,
    )
    searcher.fit(features_fit, labels_fit)

    # Score the refitted winner on the held-out samples.
    tuned_pipeline = searcher.best_estimator_
    predicted = tuned_pipeline.predict(features_eval)
    per_class = pd.DataFrame(
        classification_report(labels_eval, predicted, output_dict=True, zero_division=0)
    ).transpose()

    # Attach the identifying columns to every row of the report.
    run_label = f"Outlier_{subject_outlier}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"
    per_class['Parameters'] = str(searcher.best_params_)
    per_class['Model'] = run_label
    per_class['Model_Condition'] = f"{subject_label}_DT"

    # One-row overview built from the weighted averages of the report.
    overview = pd.DataFrame({
        "Model": [run_label],
        "Accuracy": [accuracy_score(labels_eval, predicted)],
        "Precision": [per_class.loc["weighted avg", "precision"]],
        "Recall": [per_class.loc["weighted avg", "recall"]],
        "F1": [per_class.loc["weighted avg", "f1-score"]],
        "Parameters": [str(searcher.best_params_)],
    })

    return {
        "Model": tuned_pipeline,
        "Predictions": predicted,
        "Statistics": per_class,
        "Statistics_DF": overview,
    }


### Decision Tree | Basic | Multilabel

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd

def optimized_prediction_model_DecisionTree(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension):
    """Grid-search, fit and evaluate a decision-tree classifier with SMOTE.

    The data is split 80/20 (stratified, fixed seed). A pipeline of
    StandardScaler -> SMOTE -> DecisionTreeClassifier is tuned with 5-fold
    GridSearchCV on accuracy and scored on the held-out part.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Class labels; also used to stratify the split.
        subject_label: Tag appended to the ``DecisionTree_`` model label.
        subject_outlier: Value embedded in ``Model_Condition``.
        subject_autoencoder: Value embedded in ``Model_Condition``.
        subject_dimension: Value embedded in ``Model_Condition``.

    Returns:
        dict with keys ``"Model"`` (best fitted pipeline), ``"Predictions"``
        (test-set predictions), ``"Statistics"`` (per-class report DataFrame
        with label columns) and ``"Statistics_DF"`` (one-row summary).
    """
    debug = True

    def _checkpoint(message):
        # Progress trace, printed only while debug is enabled.
        if debug:
            print(message)

    _checkpoint("Control Point: 1.0 | Decision Tree Model")

    # CONFIG
    X_DT_train, X_DT_test, y_DT_train, y_DT_test = train_test_split(
        X_Gene_Marker_Embeddings, y_Cell_Type,
        test_size=0.2, random_state=42,
        stratify=y_Cell_Type,
    )
    _checkpoint("Control Point: 2.0 | Decision Tree Model Setup")

    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('clf', DecisionTreeClassifier(random_state=42))
    ])

    param_grid = {
        'clf__max_depth': [None, 10, 20, 30],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__min_samples_split': [2, 5, 10]
    }

    _checkpoint("Control Point: 3.0 | Decision Tree Model Training")
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_DT_train, y_DT_train)

    best_pipeline = grid_search.best_estimator_
    best_params_text = str(grid_search.best_params_)

    _checkpoint("Control Point: 4.0 | Decision Tree Model Prediction")
    y_pred = best_pipeline.predict(X_DT_test)

    _checkpoint("Control Point: 5.0 | Decision Tree Model Evaluation")
    # zero_division=0 scores never-predicted classes as 0 without a warning.
    report = classification_report(y_DT_test, y_pred, output_dict=True, zero_division=0)
    statistics_DT = pd.DataFrame(report).transpose()

    model_label = f"DecisionTree_{subject_label}"
    # Store the parameters as text. Assigning the dict itself makes pandas
    # align its keys against the row index (all-NaN column) or fail,
    # depending on the pandas version.
    statistics_DT['Parameters'] = best_params_text
    # Same labelling as the summary frame and the RF / GB basic models.
    statistics_DT['Model'] = model_label
    statistics_DT['Model_Condition'] = f"Outlier_{subject_outlier}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"

    # Displaying the results
    statistics_DT_Detailed = statistics_DT[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']]
    print(f"Model Label: {model_label}")
    print(statistics_DT_Detailed)

    _checkpoint("Control Point: 10.0 | Decision Tree Model")

    statistics_DT_DF = pd.DataFrame({
        "Model": [model_label],
        "Accuracy": [accuracy_score(y_DT_test, y_pred)],
        "Precision": [statistics_DT_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_DT_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_DT_Detailed.loc["weighted avg", "f1-score"]],
        "Parameters": [best_params_text]
    })

    _checkpoint("Control Point: 10.1 | Decision Tree Model")

    result_dict = {
        "Model": best_pipeline,
        "Predictions": y_pred,
        "Statistics": statistics_DT_Detailed,
        "Statistics_DF": statistics_DT_DF
    }

    _checkpoint("Control Point: 10.2 | Decision Tree Model")
    return result_dict


### Random Forest | Advanced | Multilabel

In [None]:
# Required Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd

def prediction_model_RF_multilabel(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension):
    """Grid-search, fit and evaluate a one-vs-rest random-forest classifier.

    Splits the data 80/20 with a fixed seed, then tunes a
    SMOTE -> OneVsRestClassifier(RandomForestClassifier) pipeline with 5-fold
    GridSearchCV on accuracy. The refitted best pipeline is scored on the
    held-out part and the per-class report is printed.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Targets aligned with the rows of the feature matrix.
        subject_label: Tag used to build the ``Model_Condition`` column.
        subject_outlier: Value embedded in the model label.
        subject_autoencoder: Value embedded in the model label.
        subject_dimension: Value embedded in the model label.

    Returns:
        dict with keys ``"Model"``, ``"Predictions"``, ``"Statistics"`` and
        ``"Statistics_DF"``.
    """
    # Hold out a fifth of the samples for the final evaluation.
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X_Gene_Marker_Embeddings,
        y_Cell_Type,
        test_size=0.2,
        random_state=42,
    )

    # Oversampling followed by the one-vs-rest forest.
    ovr_forest = OneVsRestClassifier(RandomForestClassifier(random_state=42), n_jobs=-1)
    stages = [
        ('smote', SMOTE(random_state=42)),
        ('rf', ovr_forest),
    ]

    # Candidate forest settings explored by the search.
    search_space = {
        'rf__estimator__n_estimators': [100, 200, 300],
        'rf__estimator__max_depth': [None, 10, 20, 30],
        'rf__estimator__min_samples_leaf': [1, 2, 4],
        'rf__estimator__min_samples_split': [2, 5, 10],
    }

    searcher = GridSearchCV(
        imblearnPipeline(stages),
        search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    searcher.fit(features_fit, labels_fit)

    # Score the refitted winner on the held-out samples.
    tuned_pipeline = searcher.best_estimator_
    predicted = tuned_pipeline.predict(features_eval)
    per_class = pd.DataFrame(
        classification_report(labels_eval, predicted, output_dict=True, zero_division=0)
    ).transpose()

    # Attach the identifying columns to every row of the report.
    run_label = f"Outlier_{subject_outlier}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"
    per_class['Parameters'] = str(searcher.best_params_)
    per_class['Model'] = run_label
    per_class['Model_Condition'] = f"{subject_label}_RF"

    # Fixed column order for the detailed view, which is also echoed.
    detail_columns = ['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']
    detailed = per_class[detail_columns]
    print(f"Model Label: {run_label}")
    print(detailed)

    # One-row overview built from the weighted averages of the report.
    overview = pd.DataFrame({
        "Model": [run_label],
        "Accuracy": [accuracy_score(labels_eval, predicted)],
        "Precision": [detailed.loc["weighted avg", "precision"]],
        "Recall": [detailed.loc["weighted avg", "recall"]],
        "F1": [detailed.loc["weighted avg", "f1-score"]],
        "Parameters": [str(searcher.best_params_)],
    })

    return {
        "Model": tuned_pipeline,
        "Predictions": predicted,
        "Statistics": detailed,
        "Statistics_DF": overview,
    }


### Random Forest | Basic | Multilabel

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
from scipy.stats import randint as sp_randint

def optimized_prediction_model_RandomForest(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension):
    """Randomized-search, fit and evaluate a random-forest classifier with SMOTE.

    The data is split 80/20 (stratified, fixed seed). A pipeline of
    SMOTE -> RandomForestClassifier is tuned with a 100-candidate, 5-fold
    RandomizedSearchCV on accuracy and scored on the held-out part.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Class labels; also used to stratify the split.
        subject_label: Tag appended to the ``RandomForest_`` model label.
        subject_outlier: Value embedded in ``Model_Condition``.
        subject_autoencoder: Value embedded in ``Model_Condition``.
        subject_dimension: Value embedded in ``Model_Condition``.

    Returns:
        dict with keys ``"Model"`` (best fitted pipeline), ``"Predictions"``
        (test-set predictions), ``"Statistics"`` (per-class report DataFrame
        with label columns) and ``"Statistics_DF"`` (one-row summary).
    """
    debug = True

    def _checkpoint(message):
        # Progress trace, printed only while debug is enabled.
        if debug:
            print(message)

    _checkpoint("Control Point: 1.0 | Random Forest Model")

    X_RF_train, X_RF_test, y_RF_train, y_RF_test = train_test_split(
        X_Gene_Marker_Embeddings, y_Cell_Type, test_size=0.2, random_state=42, stratify=y_Cell_Type
    )

    pipeline = imblearnPipeline([
        ('smote', SMOTE(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # Integer distributions are sampled from [low, high) by scipy's randint.
    param_dist = {
        'rf__n_estimators': sp_randint(100, 400),
        'rf__max_depth': [None] + list(range(5, 26)),
        'rf__min_samples_leaf': sp_randint(1, 11),
        'rf__min_samples_split': sp_randint(2, 11)
    }

    _checkpoint("Control Point: 3.0 | Random Forest Model Training")
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
    random_search.fit(X_RF_train, y_RF_train)

    best_pipeline = random_search.best_estimator_
    best_params_text = str(random_search.best_params_)

    _checkpoint("Control Point: 4.0 | Random Forest Model Prediction")
    y_pred = best_pipeline.predict(X_RF_test)

    _checkpoint("Control Point: 5.0 | Random Forest Model Evaluation")
    # zero_division=0 scores never-predicted classes as 0 without a warning.
    report = classification_report(y_RF_test, y_pred, output_dict=True, zero_division=0)
    statistics_RF = pd.DataFrame(report).transpose()

    model_label = f"RandomForest_{subject_label}"
    statistics_RF['Parameters'] = best_params_text
    statistics_RF['Model'] = model_label
    statistics_RF['Model_Condition'] = f"Outlier_{subject_outlier}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"

    statistics_RF_Detailed = statistics_RF[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']]
    print(f"Model Label: {model_label}")
    print(statistics_RF_Detailed)

    _checkpoint("Control Point: 10.0 | Random Forest Model")

    # The parameter text is used directly. Reading it back with
    # statistics_RF['Parameters'][0] relied on positional fallback for an
    # integer key on a string-labelled index, which pandas has deprecated.
    statistics_RF_DF = pd.DataFrame({
        "Model": [model_label],
        "Accuracy": [accuracy_score(y_RF_test, y_pred)],
        "Precision": [statistics_RF_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_RF_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_RF_Detailed.loc["weighted avg", "f1-score"]],
        "Parameters": [best_params_text],
    })

    result_dict = {
        "Model": best_pipeline,
        "Predictions": y_pred,
        "Statistics": statistics_RF_Detailed,
        "Statistics_DF": statistics_RF_DF
    }
    _checkpoint("Control Point: 10.2 | Random Forest Model")
    return result_dict


### Gradient Boosting Classifier | Advanced | Multilabel 

In [None]:
# Required Libraries
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd

def prediction_model_GB_multilabel(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension):
    """Grid-search, fit and evaluate a one-vs-rest gradient-boosting classifier.

    Splits the data 80/20 (stratified on the targets, fixed seed), then tunes
    a SMOTE -> OneVsRestClassifier(GradientBoostingClassifier) pipeline with
    5-fold GridSearchCV on weighted F1. The refitted best pipeline is scored
    on the held-out part and the per-class report is printed.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Targets; also used to stratify the split.
        subject_label: Tag used to build the ``Model_Condition`` column.
        subject_outlier: Value embedded in the model label.
        subject_autoencoder: Value embedded in the model label.
        subject_dimension: Value embedded in the model label.

    Returns:
        dict with keys ``"Model"``, ``"Predictions"``, ``"Statistics"`` and
        ``"Statistics_DF"``.
    """
    # Stratified hold-out of a fifth of the samples.
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X_Gene_Marker_Embeddings,
        y_Cell_Type,
        test_size=0.2,
        random_state=42,
        stratify=y_Cell_Type,
    )

    # Oversampling followed by the one-vs-rest boosted model.
    ovr_booster = OneVsRestClassifier(GradientBoostingClassifier(random_state=42))
    stages = [
        ('smote', SMOTE(random_state=42)),
        ('gb', ovr_booster),
    ]

    # Candidate boosting settings explored by the search.
    search_space = {
        'gb__estimator__n_estimators': [100, 200],
        'gb__estimator__learning_rate': [0.01, 0.1, 0.2],
        'gb__estimator__max_depth': [3, 5, 7],
        'gb__estimator__min_samples_leaf': [1, 2],
        'gb__estimator__min_samples_split': [2, 4],
    }

    searcher = GridSearchCV(
        imblearnPipeline(stages),
        search_space,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
    )
    searcher.fit(features_fit, labels_fit)

    # Score the refitted winner on the held-out samples.
    tuned_pipeline = searcher.best_estimator_
    predicted = tuned_pipeline.predict(features_eval)
    per_class = pd.DataFrame(
        classification_report(labels_eval, predicted, output_dict=True, zero_division=0)
    ).transpose()

    # Attach the identifying columns to every row of the report.
    run_label = f"Outlier_{subject_outlier}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"
    per_class['Parameters'] = str(searcher.best_params_)
    per_class['Model'] = run_label
    per_class['Model_Condition'] = f"{subject_label}_GB"

    # Fixed column order for the detailed view, which is also echoed.
    detail_columns = ['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']
    detailed = per_class[detail_columns]
    print(f"Model Label: {run_label}")
    print(detailed)

    # Weighted metrics are recomputed from the predictions for the summary.
    weighted_kwargs = {"average": 'weighted', "zero_division": 0}
    overview = pd.DataFrame({
        "Model": [run_label],
        "Accuracy": [accuracy_score(labels_eval, predicted)],
        "Precision": [precision_score(labels_eval, predicted, **weighted_kwargs)],
        "Recall": [recall_score(labels_eval, predicted, **weighted_kwargs)],
        "F1": [f1_score(labels_eval, predicted, **weighted_kwargs)],
        "Parameters": [str(searcher.best_params_)],
    })

    return {
        "Model": tuned_pipeline,
        "Predictions": predicted,
        "Statistics": detailed,
        "Statistics_DF": overview,
    }


### Gradient Boosting Classifier | Basic | Multilabel

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
from scipy.stats import randint as sp_randint

def optimized_prediction_model_GradientBoosting(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label,
                                                subject_outlier=None, subject_autoencoder=None, subject_dimension=None):
    """Randomized-search, fit and evaluate a gradient-boosting classifier with SMOTE.

    The data is split 80/20 (stratified, fixed seed). A pipeline of
    SMOTE -> GradientBoostingClassifier (with early stopping configured via
    ``n_iter_no_change``) is tuned with a 100-candidate, 5-fold
    RandomizedSearchCV on accuracy and scored on the held-out part.

    Args:
        X_Gene_Marker_Embeddings: Feature matrix accepted by scikit-learn.
        y_Cell_Type: Class labels; also used to stratify the split.
        subject_label: Tag appended to the ``GradientBoosting_`` model label.
        subject_outlier: Optional value for ``Model_Condition``.
        subject_autoencoder: Optional value for ``Model_Condition``.
        subject_dimension: Optional value for ``Model_Condition``.

        The three optional values were previously read from undefined names.
        When one is omitted, a module-level variable of the same name is used
        if it exists (the old notebook behaviour); otherwise ``"NA"``.

    Returns:
        dict with keys ``"Model"`` (best fitted pipeline), ``"Predictions"``
        (test-set predictions), ``"Statistics"`` (per-class report DataFrame
        with label columns) and ``"Statistics_DF"`` (one-row summary).
    """
    debug = True

    def _checkpoint(message):
        # Progress trace, printed only while debug is enabled.
        if debug:
            print(message)

    def _resolve(name, value):
        # Prefer the explicit argument, then a same-named module global.
        if value is not None:
            return value
        return globals().get(name, "NA")

    subject_outlier = _resolve("subject_outlier", subject_outlier)
    subject_autoencoder = _resolve("subject_autoencoder", subject_autoencoder)
    subject_dimension = _resolve("subject_dimension", subject_dimension)

    _checkpoint("Control Point: 1.0 | Gradient Boosting Model")

    X_GB_train, X_GB_test, y_GB_train, y_GB_test = train_test_split(
        X_Gene_Marker_Embeddings, y_Cell_Type, test_size=0.2, random_state=42, stratify=y_Cell_Type
    )

    pipeline = imblearnPipeline([
        ('smote', SMOTE(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42, validation_fraction=0.1, n_iter_no_change=10, tol=0.01))
    ])

    # Integer distributions are sampled from [low, high) by scipy's randint.
    param_dist = {
        'gb__n_estimators': sp_randint(50, 300),
        'gb__learning_rate': [0.01, 0.1, 0.2, 0.5],
        'gb__max_depth': sp_randint(3, 10),
        'gb__min_samples_leaf': sp_randint(1, 5),
        'gb__min_samples_split': sp_randint(2, 6)
    }

    _checkpoint("Control Point: 3.0 | Gradient Boosting Model Training")
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
    random_search.fit(X_GB_train, y_GB_train)

    best_pipeline = random_search.best_estimator_
    best_params_text = str(random_search.best_params_)

    _checkpoint("Control Point: 4.0 | Gradient Boosting Model Prediction")
    y_pred = best_pipeline.predict(X_GB_test)

    _checkpoint("Control Point: 5.0 | Gradient Boosting Model Evaluation")
    # zero_division=0 scores never-predicted classes as 0 without a warning.
    report = classification_report(y_GB_test, y_pred, output_dict=True, zero_division=0)
    statistics_GB = pd.DataFrame(report).transpose()

    model_label = f"GradientBoosting_{subject_label}"
    statistics_GB['Parameters'] = best_params_text
    statistics_GB['Model'] = model_label
    statistics_GB['Model_Condition'] = f"Outlier_{subject_outlier}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"

    statistics_GB_Detailed = statistics_GB[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']]
    print(f"Model Label: {model_label}")
    print(statistics_GB_Detailed)

    # The parameter text is used directly. Reading it back with
    # statistics_GB['Parameters'][0] relied on positional fallback for an
    # integer key on a string-labelled index, which pandas has deprecated.
    statistics_GB_DF = pd.DataFrame({
        "Model": [model_label],
        "Accuracy": [accuracy_score(y_GB_test, y_pred)],
        "Precision": [statistics_GB_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_GB_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_GB_Detailed.loc["weighted avg", "f1-score"]],
        "Parameters": [best_params_text],
    })

    result_dict = {
        "Model": best_pipeline,
        "Predictions": y_pred,
        "Statistics": statistics_GB_Detailed,
        "Statistics_DF": statistics_GB_DF
    }
    return result_dict


# End 