In [None]:
# Print the banner string that opens this notebook section.
print("Tools of ML: Complex")

# `Functions`
- 1-) SVM
- 2-) SGD
- 3-) Decision Tree
- 4-) Random Forest
- 5-) Gradient Boosting

## 1-) `SVM`

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np
import os
import pickle

def dict_to_str(d):
    """Render a mapping as ``key: value`` pairs separated by ``', '``."""
    pairs = [f'{key}: {value}' for key, value in d.items()]
    return ', '.join(pairs)

def preprocess_data(X_Gene_Marker_Embeddings):
    """Build a 2-D feature matrix from ``(expression, embedding)`` style items.

    Element ``[0]`` of every item becomes the first column; element ``[1]``
    of every item supplies the remaining columns.
    """
    first_fields = [entry[0] for entry in X_Gene_Marker_Embeddings]
    second_fields = [entry[1] for entry in X_Gene_Marker_Embeddings]
    # One column holding the first fields, placed to the left of the embeddings.
    expression_column = np.array(first_fields).reshape(-1, 1)
    embedding_block = np.array(second_fields)
    return np.hstack((expression_column, embedding_block))

def prediction_model_SVM_with_GridSearch(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension, subject_normalization):
    """Grid-search a SMOTE + one-vs-rest SVC pipeline and summarise its scores.

    The data is split 80/20 with a fixed seed, the pipeline is tuned with
    5-fold stratified cross-validation on the training part (refit on the
    weighted F1 score), and the refitted best pipeline is evaluated on the
    held-out part.

    Args:
        X_Gene_Marker_Embeddings: Sequence of items; ``preprocess_data`` reads
            ``item[0]`` and ``item[1]`` of each one to build the feature matrix.
        y_Cell_Type: Target labels, converted with ``np.array``.
        subject_label: Used for the ``Applications_Condition`` columns.
        subject_outlier: Part of the ``Applications`` label.
        subject_autoencoder: Part of the ``Applications`` label.
        subject_dimension: Part of the ``Applications`` label.
        subject_normalization: Part of the ``Applications`` label.

    Returns:
        dict with the keys ``"Model"`` (best fitted pipeline),
        ``"Predictions"`` (test-set predictions), ``"Statistics"``
        (per-class classification report of the best model),
        ``"Statistics_DF"`` (one-row summary of the best model) and
        ``"All_Results"`` (mean CV scores of every parameter combination).
    """
    # Step 1: Data Preprocessing
    X = preprocess_data(X_Gene_Marker_Embeddings)
    y = np.array(y_Cell_Type)

    # Splitting the data into training and testing sets
    # Note: For multi-label data, stratify might not work as expected
    # NOTE(review): StratifiedKFold and SMOTE below work on a single class
    # label per sample, so y is presumably single-label multiclass - confirm.
    X_SVM_train, X_SVM_test, y_SVM_train, y_SVM_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 2: Setting up the Machine Learning Pipeline
    # Scaling steps are currently disabled:
    #   ('scaler', StandardScaler()), ('normalizer', MinMaxScaler())
    pipeline = imblearnPipeline([
        ('smote', SMOTE(random_state=42)),  # Handling class imbalance
        ('svc', OneVsRestClassifier(SVC(random_state=42, class_weight='balanced'), n_jobs=-1))  # SVM classifier
    ])

    # Step 3: Defining Hyperparameters for Grid Search
    # NOTE(review): 'degree' and 'coef0' are also crossed with the 'rbf'
    # kernel, which does not use them, so those candidates repeat the same
    # effective model. Per-kernel grids would avoid the extra fits but would
    # change the rows reported in "All_Results".
    param_grid = {
        'svc__estimator__degree': [2, 3],  # wider option: [2, 3, 4]
        'svc__estimator__kernel': ['poly', 'rbf'],  # wider option: ['linear', 'poly', 'rbf', 'sigmoid']
        'svc__estimator__coef0': [0.5, 1],  # wider option: [0.0, 0.5, 1.0]
        'svc__estimator__C': [1, 10],  # wider option: [0.1, 1, 10, 100, 1000]
        'svc__estimator__gamma': ['scale']
    }

    # Step 4: Cross-Validation Setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Step 5: Scoring Metrics
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision_macro': make_scorer(precision_score, average='macro'),
        'recall_macro': make_scorer(recall_score, average='macro'),
        'f1_weighted': make_scorer(f1_score, average='weighted'),
        'balanced_accuracy': 'balanced_accuracy'
    }

    # Step 6: Performing Grid Search (the best candidate by weighted F1 is refitted)
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_SVM_train, y_SVM_train)  # Training

    # Step 7: Model Selection
    best_model = grid_search.best_estimator_

    # Step 8: Model Evaluation on the held-out test set
    # zero_division=0 keeps the same scores as the default but without the
    # undefined-metric warnings, matching the other model functions.
    y_pred = best_model.predict(X_SVM_test)
    report = classification_report(y_SVM_test, y_pred, output_dict=True, zero_division=0)
    statistics_SVM = pd.DataFrame(report).transpose()

    # Step 9: Results Compilation for Each Combination
    # cv_results_ stores one entry per candidate in the same order in every
    # array, so the columns are walked in lockstep instead of searching the
    # 'params' list for each row.
    model_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    condition_label = f"{subject_label}_SVM"
    cv_results = grid_search.cv_results_
    all_results = [
        {
            "Applications": model_label,
            "Applications_Condition": condition_label,
            "Model": "SVM",
            "Parameters": dict_to_str(params),
            "Accuracy": mean_accuracy,
            "Precision": mean_precision,
            "Recall": mean_recall,
            "F1": mean_f1,
        }
        for params, mean_accuracy, mean_precision, mean_recall, mean_f1 in zip(
            cv_results['params'],
            cv_results['mean_test_accuracy'],
            cv_results['mean_test_precision_macro'],
            cv_results['mean_test_recall_macro'],
            cv_results['mean_test_f1_weighted'],
        )
    ]

    # Create a DataFrame to store all the results
    statistics_SVM_DF = pd.DataFrame(all_results)

    # Compile the detailed results for the best model
    best_params_text = dict_to_str(grid_search.best_params_)
    statistics_SVM['Parameters'] = best_params_text
    statistics_SVM['Applications'] = model_label
    statistics_SVM['Applications_Condition'] = condition_label
    statistics_SVM_Detailed = statistics_SVM[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Applications', 'Applications_Condition']]

    # Displaying the detailed results
    print(f"Model Label: {model_label}")
    print(statistics_SVM_Detailed)

    # Step 10: Summary DataFrame for the Best Model
    # The report's "accuracy" row repeats the accuracy in every metric column,
    # so reading it from "f1-score" yields the overall accuracy.
    summary_statistics_SVM_DF = pd.DataFrame({
        "Applications": [model_label],
        "Applications_Condition": [subject_label],
        "Model": ["SVM"],
        "Parameters": [best_params_text],
        "Accuracy": [statistics_SVM_Detailed.loc["accuracy", "f1-score"]],
        "Precision": [statistics_SVM_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_SVM_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_SVM_Detailed.loc["weighted avg", "f1-score"]],
    })

    # Packaging the results into a dictionary
    result_dict = {
        "Model": best_model,
        "Predictions": y_pred,
        "Statistics": statistics_SVM_Detailed,
        "Statistics_DF": summary_statistics_SVM_DF,
        "All_Results": statistics_SVM_DF  # Include all combinations' results
    }

    return result_dict  # Returning the result dictionary


## 2-) `SGD`

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np

def dict_to_str(d):
    """Format every ``key``/``value`` pair of ``d`` as ``key: value`` and join them with ``', '``."""
    rendered = []
    for name, setting in d.items():
        rendered.append(f'{name}: {setting}')
    return ', '.join(rendered)

def preprocess_data(X_Gene_Marker_Embeddings):
    """Assemble the feature matrix: first field of each item as column 0, second field as the rest."""
    # The first fields are laid out as a single column.
    expression_col = np.array([sample[0] for sample in X_Gene_Marker_Embeddings]).reshape(-1, 1)
    # The second fields are stacked row by row.
    embedding_rows = np.array([sample[1] for sample in X_Gene_Marker_Embeddings])
    features = np.hstack([expression_col, embedding_rows])
    return features

def prediction_model_SGD_multilabel(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension, subject_normalization):
    """Grid-search a scaled SMOTE + one-vs-rest SGD pipeline and summarise its scores.

    The data is split 80/20 with a fixed seed, the pipeline is tuned with
    5-fold stratified cross-validation on the training part (refit on the
    weighted F1 score), and the refitted best pipeline is evaluated on the
    held-out part.

    Args:
        X_Gene_Marker_Embeddings: Sequence of items; ``preprocess_data`` reads
            ``item[0]`` and ``item[1]`` of each one to build the feature matrix.
        y_Cell_Type: Target labels, converted with ``np.array``.
        subject_label: Used for the ``Applications_Condition`` columns.
        subject_outlier: Part of the ``Applications`` label.
        subject_autoencoder: Part of the ``Applications`` label.
        subject_dimension: Part of the ``Applications`` label.
        subject_normalization: Part of the ``Applications`` label.

    Returns:
        dict with the keys ``"Model"`` (best fitted pipeline),
        ``"Predictions"`` (test-set predictions), ``"Statistics"``
        (per-class classification report of the best model),
        ``"Statistics_DF"`` (one-row summary of the best model) and
        ``"All_Results"`` (mean CV scores of every parameter combination).
    """
    # Step 1: Data Preprocessing
    X = preprocess_data(X_Gene_Marker_Embeddings)
    y = np.array(y_Cell_Type)

    # Splitting the data into training and testing sets
    # Note: For multi-label data, stratify might not work as expected
    # NOTE(review): StratifiedKFold and SMOTE below work on a single class
    # label per sample, so y is presumably single-label multiclass - confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 2: Setting up the Machine Learning Pipeline
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),  # Feature scaling for normalization
        ('smote', SMOTE(random_state=42)),  # Applying SMOTE for class imbalance
        ('sgd', OneVsRestClassifier(SGDClassifier(loss='log_loss', random_state=42)))  # One-vs-rest SGD classifier
    ])

    # Step 3: Hyperparameter Grid Setup
    param_grid = {
        'sgd__estimator__alpha': [0.0001, 0.001, 0.01],  # Regularization strength
        'sgd__estimator__max_iter': [1000, 2000],  # Number of iterations for convergence
        'sgd__estimator__tol': [1e-3, 1e-4],  # Tolerance for stopping criteria
        'sgd__estimator__penalty': ['l2', 'l1', 'elasticnet']  # Type of regularization
    }

    # Step 4: Cross-Validation Setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Step 5: Scoring Metrics
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision_macro': make_scorer(precision_score, average='macro'),
        'recall_macro': make_scorer(recall_score, average='macro'),
        'f1_weighted': make_scorer(f1_score, average='weighted'),
        'balanced_accuracy': 'balanced_accuracy'
    }

    # Step 6: Performing Grid Search (the best candidate by weighted F1 is refitted)
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)  # Training

    # Step 7: Model Selection
    best_model = grid_search.best_estimator_

    # Step 8: Model Evaluation on the held-out test set
    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    statistics_SGD = pd.DataFrame(report).transpose()

    # Step 9: Results Compilation for Each Combination
    # cv_results_ stores one entry per candidate in the same order in every
    # array, so the columns are walked in lockstep instead of searching the
    # 'params' list for each row.
    model_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    condition_label = f"{subject_label}_SGD"
    cv_results = grid_search.cv_results_
    all_results = [
        {
            "Applications": model_label,
            "Applications_Condition": condition_label,
            "Model": "SGD",
            "Parameters": dict_to_str(params),
            "Accuracy": mean_accuracy,
            "Precision": mean_precision,
            "Recall": mean_recall,
            "F1": mean_f1,
        }
        for params, mean_accuracy, mean_precision, mean_recall, mean_f1 in zip(
            cv_results['params'],
            cv_results['mean_test_accuracy'],
            cv_results['mean_test_precision_macro'],
            cv_results['mean_test_recall_macro'],
            cv_results['mean_test_f1_weighted'],
        )
    ]

    # Create a DataFrame to store all the results
    statistics_SGD_DF = pd.DataFrame(all_results)

    # Compile the detailed results for the best model
    best_params_text = dict_to_str(grid_search.best_params_)
    statistics_SGD['Parameters'] = best_params_text
    statistics_SGD['Applications'] = model_label
    statistics_SGD['Applications_Condition'] = condition_label
    statistics_SGD_Detailed = statistics_SGD[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Applications', 'Applications_Condition']]

    # Displaying the detailed results
    print(f"Model Label: {model_label}")
    print(statistics_SGD_Detailed)

    # Step 10: Summary DataFrame for the Best Model
    # The report's "accuracy" row repeats the accuracy in every metric column,
    # so reading it from "f1-score" yields the overall accuracy.
    summary_statistics_SGD_DF = pd.DataFrame({
        "Applications": [model_label],
        "Applications_Condition": [subject_label],
        "Model": ["SGD"],
        "Parameters": [best_params_text],
        "Accuracy": [statistics_SGD_Detailed.loc["accuracy", "f1-score"]],
        "Precision": [statistics_SGD_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_SGD_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_SGD_Detailed.loc["weighted avg", "f1-score"]],
    })

    # Packaging the results into a dictionary
    result_dict = {
        "Model": best_model,
        "Predictions": y_pred,
        "Statistics": statistics_SGD_Detailed,
        "Statistics_DF": summary_statistics_SGD_DF,
        "All_Results": statistics_SGD_DF  # Include all combinations' results
    }

    return result_dict  # Returning the result dictionary


## 3-) `Decision Tree`

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.multiclass import OneVsRestClassifier
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

def dict_to_str(d):
    """Return ``d`` flattened to text: ``key: value`` entries separated by ``', '``."""
    return ', '.join(map(lambda pair: f'{pair[0]}: {pair[1]}', d.items()))

def preprocess_data(X_Gene_Marker_Embeddings):
    """Turn the items into one 2-D array: ``item[0]`` as the leading column, ``item[1]`` as the others."""
    leading = np.array(list(map(lambda record: record[0], X_Gene_Marker_Embeddings)))
    trailing = np.array(list(map(lambda record: record[1], X_Gene_Marker_Embeddings)))
    # Reshape the leading values into a column so they can sit beside the embeddings.
    leading = leading.reshape(-1, 1)
    return np.hstack([leading, trailing])

def prediction_model_DT_multilabel(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension, subject_normalization):
    """Grid-search a scaled SMOTE + one-vs-rest decision-tree pipeline and summarise its scores.

    The data is split 80/20 with a fixed seed, the pipeline is tuned with
    5-fold stratified cross-validation on the training part (refit on the
    weighted F1 score), and the refitted best pipeline is evaluated on the
    held-out part.

    Args:
        X_Gene_Marker_Embeddings: Sequence of items; ``preprocess_data`` reads
            ``item[0]`` and ``item[1]`` of each one to build the feature matrix.
        y_Cell_Type: Target labels, converted with ``np.array``.
        subject_label: Used for the ``Applications_Condition`` columns.
        subject_outlier: Part of the ``Applications`` label.
        subject_autoencoder: Part of the ``Applications`` label.
        subject_dimension: Part of the ``Applications`` label.
        subject_normalization: Part of the ``Applications`` label.

    Returns:
        dict with the keys ``"Model"`` (best fitted pipeline),
        ``"Predictions"`` (test-set predictions), ``"Statistics"``
        (per-class classification report of the best model),
        ``"Statistics_DF"`` (one-row summary of the best model) and
        ``"All_Results"`` (mean CV scores of every parameter combination).
    """
    # Step 1: Data Preprocessing
    X = preprocess_data(X_Gene_Marker_Embeddings)
    y = np.array(y_Cell_Type)

    # Splitting the data into training and testing sets
    # Note: For multi-label data, stratify might not work as expected
    # NOTE(review): StratifiedKFold and SMOTE below work on a single class
    # label per sample, so y is presumably single-label multiclass - confirm.
    X_DT_train, X_DT_test, y_DT_train, y_DT_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 2: Setting up the Machine Learning Pipeline
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),  # Feature scaling for normalization
        ('smote', SMOTE(random_state=42)),  # Applying SMOTE for class imbalance
        ('clf', OneVsRestClassifier(DecisionTreeClassifier(random_state=42)))  # One-vs-rest decision tree
    ])

    # Step 3: Hyperparameter Grid Setup (currently a single candidate)
    param_grid = {
        'clf__estimator__max_depth': [10],  # wider option: [None, 10, 20, 30]
        'clf__estimator__min_samples_leaf': [2],  # wider option: [1, 2, 4]
        'clf__estimator__min_samples_split': [2]  # wider option: [2, 5, 10]
    }

    # Step 4: Cross-Validation Setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Step 5: Scoring Metrics
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision_macro': make_scorer(precision_score, average='macro'),
        'recall_macro': make_scorer(recall_score, average='macro'),
        'f1_weighted': make_scorer(f1_score, average='weighted'),
        'balanced_accuracy': 'balanced_accuracy'
    }

    # Step 6: Performing Grid Search (the best candidate by weighted F1 is refitted)
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_DT_train, y_DT_train)

    # Step 7: Model Selection
    best_pipeline = grid_search.best_estimator_

    # Step 8: Model Evaluation on the held-out test set
    y_pred = best_pipeline.predict(X_DT_test)
    report = classification_report(y_DT_test, y_pred, output_dict=True, zero_division=0)
    statistics_DT = pd.DataFrame(report).transpose()

    # Step 9: Results Compilation for Each Combination
    # cv_results_ stores one entry per candidate in the same order in every
    # array, so the columns are walked in lockstep instead of searching the
    # 'params' list for each row.
    model_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    condition_label = f"{subject_label}_DT"
    cv_results = grid_search.cv_results_
    all_results = [
        {
            "Applications": model_label,
            "Applications_Condition": condition_label,
            "Model": "Decision Tree",
            "Parameters": dict_to_str(params),
            "Accuracy": mean_accuracy,
            "Precision": mean_precision,
            "Recall": mean_recall,
            "F1": mean_f1,
        }
        for params, mean_accuracy, mean_precision, mean_recall, mean_f1 in zip(
            cv_results['params'],
            cv_results['mean_test_accuracy'],
            cv_results['mean_test_precision_macro'],
            cv_results['mean_test_recall_macro'],
            cv_results['mean_test_f1_weighted'],
        )
    ]

    # Create a DataFrame to store all the results
    statistics_DT_DF = pd.DataFrame(all_results)

    # Compile the detailed results for the best model
    best_params_text = dict_to_str(grid_search.best_params_)
    statistics_DT['Parameters'] = best_params_text
    statistics_DT['Applications'] = model_label
    statistics_DT['Applications_Condition'] = condition_label
    statistics_DT_Detailed = statistics_DT[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Applications', 'Applications_Condition']]

    # Displaying the detailed results
    print(f"Model Label: {model_label}")
    print(statistics_DT_Detailed)

    # Step 10: Summary DataFrame for the Best Model
    # The report's "accuracy" row repeats the accuracy in every metric column,
    # so reading it from "f1-score" yields the overall accuracy.
    summary_statistics_DT_DF = pd.DataFrame({
        "Applications": [model_label],
        "Applications_Condition": [subject_label],
        "Model": ["DT"],
        "Parameters": [best_params_text],
        "Accuracy": [statistics_DT_Detailed.loc["accuracy", "f1-score"]],
        "Precision": [statistics_DT_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_DT_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_DT_Detailed.loc["weighted avg", "f1-score"]],
    })

    # Packaging the results into a dictionary
    result_dict = {
        "Model": best_pipeline,
        "Predictions": y_pred,
        "Statistics": statistics_DT_Detailed,
        "Statistics_DF": summary_statistics_DT_DF,
        "All_Results": statistics_DT_DF  # Include all combinations' results
    }

    return result_dict  # Returning the result dictionary


## 4-) `Random Forest`

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, make_scorer
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np

def dict_to_str(d):
    """Join the entries of ``d`` into one ``'key: value, key: value'`` string."""
    separator = ', '
    entries = (f'{label}: {d[label]}' for label in d)
    return separator.join(entries)

def preprocess_data(X_Gene_Marker_Embeddings):
    """Stack each item's first field (as one column) next to its second field (as the remaining columns)."""
    scalar_part = np.array([pair[0] for pair in X_Gene_Marker_Embeddings])
    vector_part = np.array([pair[1] for pair in X_Gene_Marker_Embeddings])
    # Column vector first, embedding columns after it.
    blocks = (scalar_part.reshape(-1, 1), vector_part)
    return np.hstack(blocks)

def prediction_model_RF_multilabel(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_autoencoder, subject_dimension, subject_normalization):
    """Grid-search a SMOTE + one-vs-rest random-forest pipeline and summarise its scores.

    The data is split 80/20 with a fixed seed, the pipeline is tuned with
    5-fold stratified cross-validation on the training part (refit on the
    weighted F1 score), and the refitted best pipeline is evaluated on the
    held-out part.

    Args:
        X_Gene_Marker_Embeddings: Sequence of items; ``preprocess_data`` reads
            ``item[0]`` and ``item[1]`` of each one to build the feature matrix.
        y_Cell_Type: Target labels, converted with ``np.array``.
        subject_label: Used for the ``Applications_Condition`` columns.
        subject_outlier: Part of the ``Applications`` label.
        subject_autoencoder: Part of the ``Applications`` label.
        subject_dimension: Part of the ``Applications`` label.
        subject_normalization: Part of the ``Applications`` label.

    Returns:
        dict with the keys ``"Model"`` (best fitted pipeline),
        ``"Predictions"`` (test-set predictions), ``"Statistics"``
        (per-class classification report of the best model),
        ``"Statistics_DF"`` (one-row summary of the best model) and
        ``"All_Results"`` (mean CV scores of every parameter combination).
    """
    # Step 1: Data Preprocessing
    X = preprocess_data(X_Gene_Marker_Embeddings)
    y = np.array(y_Cell_Type)

    # Splitting the data into training and testing sets
    # Note: For multi-label data, stratify might not work as expected
    # NOTE(review): StratifiedKFold and SMOTE below work on a single class
    # label per sample, so y is presumably single-label multiclass - confirm.
    X_RF_train, X_RF_test, y_RF_train, y_RF_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 2: Setting up the Machine Learning Pipeline
    pipeline = imblearnPipeline([
        ('smote', SMOTE(random_state=42)),  # Handling class imbalance
        ('rf', OneVsRestClassifier(RandomForestClassifier(random_state=42), n_jobs=-1))  # Model definition
    ])

    # Step 3: Hyperparameter Tuning (currently a single candidate)
    param_grid = {
        'rf__estimator__n_estimators': [100],  # wider option: [100, 200, 300]
        'rf__estimator__max_depth': [10],  # wider option: [None, 10, 20, 30]
        'rf__estimator__min_samples_leaf': [2],  # wider option: [1, 2, 4]
        'rf__estimator__min_samples_split': [2]  # wider option: [2, 5, 10]
    }

    # Step 4: Cross-Validation Setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Step 5: Scoring Metrics
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision_macro': make_scorer(precision_score, average='macro'),
        'recall_macro': make_scorer(recall_score, average='macro'),
        'f1_weighted': make_scorer(f1_score, average='weighted'),
        'balanced_accuracy': 'balanced_accuracy'
    }

    # Step 6: Performing Grid Search (the best candidate by weighted F1 is refitted)
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_RF_train, y_RF_train)

    # Step 7: Model Selection
    best_pipeline = grid_search.best_estimator_

    # Step 8: Model Evaluation on the held-out test set
    y_pred = best_pipeline.predict(X_RF_test)
    report = classification_report(y_RF_test, y_pred, output_dict=True, zero_division=0)
    statistics_RF = pd.DataFrame(report).transpose()

    # Step 9: Results Compilation for Each Combination
    # cv_results_ stores one entry per candidate in the same order in every
    # array, so the columns are walked in lockstep instead of searching the
    # 'params' list for each row.
    model_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    condition_label = f"{subject_label}_RF"
    cv_results = grid_search.cv_results_
    all_results = [
        {
            "Applications": model_label,
            "Applications_Condition": condition_label,
            "Model": "Random Forest",
            "Parameters": dict_to_str(params),
            "Accuracy": mean_accuracy,
            "Precision": mean_precision,
            "Recall": mean_recall,
            "F1": mean_f1,
        }
        for params, mean_accuracy, mean_precision, mean_recall, mean_f1 in zip(
            cv_results['params'],
            cv_results['mean_test_accuracy'],
            cv_results['mean_test_precision_macro'],
            cv_results['mean_test_recall_macro'],
            cv_results['mean_test_f1_weighted'],
        )
    ]

    # Create a DataFrame to store all the results
    statistics_RF_DF = pd.DataFrame(all_results)

    # Compile the detailed results for the best model
    best_params_text = dict_to_str(grid_search.best_params_)
    statistics_RF['Parameters'] = best_params_text
    statistics_RF['Applications'] = model_label
    statistics_RF['Applications_Condition'] = condition_label
    statistics_RF_Detailed = statistics_RF[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Applications', 'Applications_Condition']]

    # Displaying the detailed results
    print(f"Model Label: {model_label}")
    print(statistics_RF_Detailed)

    # Step 10: Summary DataFrame for the Best Model
    # The report's "accuracy" row repeats the accuracy in every metric column,
    # so reading it from "f1-score" yields the overall accuracy.
    summary_statistics_RF_DF = pd.DataFrame({
        "Applications": [model_label],
        "Applications_Condition": [subject_label],
        "Model": ["RF"],
        "Parameters": [best_params_text],
        "Accuracy": [statistics_RF_Detailed.loc["accuracy", "f1-score"]],
        "Precision": [statistics_RF_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_RF_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_RF_Detailed.loc["weighted avg", "f1-score"]],
    })

    # Packaging the results into a dictionary
    result_dict = {
        "Model": best_pipeline,
        "Predictions": y_pred,
        "Statistics": statistics_RF_Detailed,
        "Statistics_DF": summary_statistics_RF_DF,
        "All_Results": statistics_RF_DF  # Include all combinations' results
    }

    return result_dict  # Returning the result dictionary


# `End`