In [1]:
# Create a str from keys and values of dict
def dict_to_str(d):
    """Flatten a dict into one underscore-separated string.

    Every key/value pair is rendered as ``{key}_{value}`` and the pairs are
    joined with ``_`` in the dict's iteration order. The resulting string is
    printed and then returned.
    """
    pieces = []
    for key, value in d.items():
        pieces.append(f'{key}_{value}')
    joined = '_'.join(pieces)
    print(joined)
    return joined


# `Functions`
- 1-) SVM
- 2-) SGD
- 3-) Decision Tree
- 4-) Random Forest
- 5-) Gradient Boosting

## 0-) `Neural Network`

In [None]:
# TODO: placeholder cell -- no neural-network model is defined in this section yet.
None

## 1-) `SVM`

In [2]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import torch

def prediction_model_SVM_multilabel_basic(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension,
                                         data_source = "one"):
    """Evaluate a polynomial-kernel SVC with shuffled 5-fold cross-validation.

    NOTE(review): despite "multilabel" in the name, the estimator is fitted on
    a single target array, so this looks like a multiclass task -- confirm.

    Parameters
    ----------
    X_Gene_Marker_Embeddings :
        When ``data_source == "one"``: the feature matrix (torch tensor, numpy
        array, list/tuple of rows, or DataFrame). Otherwise: an iterable of
        items where ``item[0]`` becomes the first feature column and
        ``item[1]`` supplies the remaining columns (presumably an expression
        value and an embedding vector -- verify against the caller).
    y_Cell_Type :
        Target labels, one per row of the feature matrix.
    subject_label, subject_outlier, subject_normalization,
    subject_autoencoder, subject_dimension :
        Free-form tags that are only copied into the result tables.
    data_source : str, default "one"
        Selects how the features are assembled (see above).

    Returns
    -------
    dict
        ``"Model"``         estimator fitted on the LAST fold's training split.
        ``"Predictions"``   predictions for the LAST fold's test split.
        ``"Statistics"``    classification report of the LAST fold as a DataFrame,
                            with Parameters/Applications/Applications_Condition columns.
        ``"Statistics_DF"`` one-row DataFrame with fold-averaged accuracy and
                            weighted precision/recall/F1.
        ``"All_Reports"``   list with the classification-report dict of every fold.

    Raises
    ------
    ValueError
        If the feature matrix and the label array have different lengths.
    """
    if data_source == "one":
        X = X_Gene_Marker_Embeddings
        if isinstance(X, torch.Tensor):
            # Plain .numpy() raises for tensors that track gradients or live on
            # an accelerator; detach()/cpu() make the conversion safe.
            X = X.detach().cpu().numpy()
        elif isinstance(X, pd.DataFrame):
            # Fold indices are positional; DataFrame[...] would read them as column labels.
            X = X.to_numpy()
        elif isinstance(X, (list, tuple)):
            # Python sequences cannot be indexed with an index array.
            X = np.asarray(X)
    else:
        # Separate expression values and embeddings, and combine into a single array
        expressions = np.array([item[0] for item in X_Gene_Marker_Embeddings])
        embeddings  = np.array([item[1] for item in X_Gene_Marker_Embeddings])
        X = np.hstack([expressions.reshape(-1, 1), embeddings])

    y = np.array(y_Cell_Type)

    # Without this check a longer y would be indexed silently with X's fold
    # indices, pairing samples with the wrong labels.
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"Feature matrix has {X.shape[0]} rows but {y.shape[0]} labels were given."
        )

    # Define the SVM with specified parameters
    svm_params = {
        'kernel': 'poly',
        'degree': 2,
        'coef0': 0.5,
        'C': 1,
        'gamma': 'scale'
    }

    # Labels are identical for every fold, so build them once
    # (dict_to_str also prints, so this avoids repeated output).
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    parameters_label = dict_to_str(svm_params)

    # Initialize K-Fold cross-validation
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    all_reports = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"\nTraining on fold {fold}...")

        # Split data into training and testing sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # A fresh estimator per fold so no state leaks between folds
        model = SVC(**svm_params, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        all_reports.append(
            classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        )
        scores.append({
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
            "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
            "F1": f1_score(y_test, y_pred, average='weighted', zero_division=0)
        })

    # Detailed per-class statistics reflect the final fold only (unchanged contract).
    statistics_SVM = pd.DataFrame(all_reports[-1]).transpose()
    statistics_SVM['Parameters'] = parameters_label
    statistics_SVM['Applications'] = applications_label
    statistics_SVM['Applications_Condition'] = f"{subject_label}_SVM"

    # Average every metric over the folds
    average_scores = {metric: np.mean([score[metric] for score in scores]) for metric in scores[0]}
    print(f"\nAverage Test Scores across {n_splits} folds: {average_scores}")

    # Creating a summary DataFrame
    statistics_SVM_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["SVM"],
        "Parameters": [parameters_label],
        "Accuracy": [average_scores['Accuracy']],
        "Precision": [average_scores['Precision']],
        "Recall": [average_scores['Recall']],
        "F1": [average_scores['F1']],
    })

    # Packaging results
    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_SVM,
        "Statistics_DF": statistics_SVM_DF,
        "All_Reports": all_reports  # Include all reports for detailed analysis
    }

    return result_dict


## 2-) `SGD`

In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import torch

def prediction_model_SGD_multilabel_basic(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension,
                                         data_source = "one"):
    """Evaluate an SGDClassifier with shuffled 5-fold cross-validation.

    NOTE(review): despite "multilabel" in the name, the estimator is fitted on
    a single target array, so this looks like a multiclass task -- confirm.

    Parameters
    ----------
    X_Gene_Marker_Embeddings :
        When ``data_source == "one"``: the feature matrix (torch tensor, numpy
        array, list/tuple of rows, or DataFrame). Otherwise: an iterable of
        items where ``item[0]`` becomes the first feature column and
        ``item[1]`` supplies the remaining columns (presumably an expression
        value and an embedding vector -- verify against the caller).
    y_Cell_Type :
        Target labels, one per row of the feature matrix.
    subject_label, subject_outlier, subject_normalization,
    subject_autoencoder, subject_dimension :
        Free-form tags that are only copied into the result tables.
    data_source : str, default "one"
        Selects how the features are assembled (see above).

    Returns
    -------
    dict
        ``"Model"``         estimator fitted on the LAST fold's training split.
        ``"Predictions"``   predictions for the LAST fold's test split.
        ``"Statistics"``    classification report of the LAST fold as a DataFrame,
                            with Parameters/Applications/Applications_Condition columns.
        ``"Statistics_DF"`` one-row DataFrame with fold-averaged accuracy and
                            weighted precision/recall/F1.
        ``"All_Reports"``   list with the classification-report dict of every fold.

    Raises
    ------
    ValueError
        If the feature matrix and the label array have different lengths.
    """
    if data_source == "one":
        X = X_Gene_Marker_Embeddings
        if isinstance(X, torch.Tensor):
            # Plain .numpy() raises for tensors that track gradients or live on
            # an accelerator; detach()/cpu() make the conversion safe.
            X = X.detach().cpu().numpy()
        elif isinstance(X, pd.DataFrame):
            # Fold indices are positional; DataFrame[...] would read them as column labels.
            X = X.to_numpy()
        elif isinstance(X, (list, tuple)):
            # Python sequences cannot be indexed with an index array.
            X = np.asarray(X)
    else:
        # Separate expression values and embeddings, and combine into a single array
        expressions = np.array([item[0] for item in X_Gene_Marker_Embeddings])
        embeddings  = np.array([item[1] for item in X_Gene_Marker_Embeddings])
        X = np.hstack([expressions.reshape(-1, 1), embeddings])

    y = np.array(y_Cell_Type)

    # Without this check a longer y would be indexed silently with X's fold
    # indices, pairing samples with the wrong labels.
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"Feature matrix has {X.shape[0]} rows but {y.shape[0]} labels were given."
        )

    # Define the SGDClassifier with specified hyperparameters
    sgd_params = {'alpha': 0.01, 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}

    # Labels are identical for every fold, so build them once
    # (dict_to_str also prints, so this avoids repeated output).
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    parameters_label = dict_to_str(sgd_params)

    # Initialize K-Fold cross-validation
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    all_reports = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"\nTraining on fold {fold}...")

        # Split data into training and testing sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # A fresh estimator per fold so no state leaks between folds
        model = SGDClassifier(**sgd_params, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        all_reports.append(
            classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        )
        scores.append({
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
            "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
            "F1": f1_score(y_test, y_pred, average='weighted', zero_division=0)
        })

    # Detailed per-class statistics reflect the final fold only (unchanged contract).
    statistics_SGD = pd.DataFrame(all_reports[-1]).transpose()
    statistics_SGD['Parameters'] = parameters_label
    statistics_SGD['Applications'] = applications_label
    statistics_SGD['Applications_Condition'] = f"{subject_label}_SGD"

    # Average every metric over the folds
    average_scores = {metric: np.mean([score[metric] for score in scores]) for metric in scores[0]}
    print(f"\nAverage Test Scores across {n_splits} folds: {average_scores}")

    # Creating a summary DataFrame
    statistics_SGD_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["SGD"],
        "Parameters": [parameters_label],
        "Accuracy": [average_scores['Accuracy']],
        "Precision": [average_scores['Precision']],
        "Recall": [average_scores['Recall']],
        "F1": [average_scores['F1']],
    })

    # Packaging results
    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_SGD,
        "Statistics_DF": statistics_SGD_DF,
        "All_Reports": all_reports  # Include all reports for detailed analysis
    }

    return result_dict

# Utility function to convert dictionary to string (for completeness)
def dict_to_str(d):
    """Flatten a dict into one underscore-separated string.

    Every key/value pair is rendered as ``{key}_{value}`` and the pairs are
    joined with ``_`` in the dict's iteration order. The resulting string is
    printed and then returned.

    NOTE(review): this notebook defines ``dict_to_str`` in several cells with
    the same logic; this redefinition simply shadows the earlier one.
    """
    pieces = []
    for key, value in d.items():
        pieces.append(f'{key}_{value}')
    joined = '_'.join(pieces)
    print(joined)
    return joined


## 3-) `Decision Tree`

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import torch

def prediction_model_DT_multilabel_basic(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension,
                                         data_source = "one"):
    """Evaluate a DecisionTreeClassifier with shuffled 5-fold cross-validation.

    The tree is built with only ``random_state=42`` set; the result tables
    record its parameters as ``"Basic"``.

    NOTE(review): despite "multilabel" in the name, the estimator is fitted on
    a single target array, so this looks like a multiclass task -- confirm.

    Parameters
    ----------
    X_Gene_Marker_Embeddings :
        When ``data_source == "one"``: the feature matrix (torch tensor, numpy
        array, list/tuple of rows, or DataFrame). Otherwise: an iterable of
        items where ``item[0]`` becomes the first feature column and
        ``item[1]`` supplies the remaining columns (presumably an expression
        value and an embedding vector -- verify against the caller).
    y_Cell_Type :
        Target labels, one per row of the feature matrix.
    subject_label, subject_outlier, subject_normalization,
    subject_autoencoder, subject_dimension :
        Free-form tags that are only copied into the result tables.
    data_source : str, default "one"
        Selects how the features are assembled (see above).

    Returns
    -------
    dict
        ``"Model"``         estimator fitted on the LAST fold's training split.
        ``"Predictions"``   predictions for the LAST fold's test split.
        ``"Statistics"``    classification report of the LAST fold as a DataFrame,
                            with Parameters/Applications/Applications_Condition columns.
        ``"Statistics_DF"`` one-row DataFrame with fold-averaged accuracy and
                            weighted precision/recall/F1.
        ``"All_Reports"``   list with the classification-report dict of every fold.

    Raises
    ------
    ValueError
        If the feature matrix and the label array have different lengths.
    """
    if data_source == "one":
        X = X_Gene_Marker_Embeddings
        if isinstance(X, torch.Tensor):
            # Plain .numpy() raises for tensors that track gradients or live on
            # an accelerator; detach()/cpu() make the conversion safe.
            X = X.detach().cpu().numpy()
        elif isinstance(X, pd.DataFrame):
            # Fold indices are positional; DataFrame[...] would read them as column labels.
            X = X.to_numpy()
        elif isinstance(X, (list, tuple)):
            # Python sequences cannot be indexed with an index array.
            X = np.asarray(X)
    else:
        # Separate expression values and embeddings, and combine into a single array
        expressions = np.array([item[0] for item in X_Gene_Marker_Embeddings])
        embeddings  = np.array([item[1] for item in X_Gene_Marker_Embeddings])
        X = np.hstack([expressions.reshape(-1, 1), embeddings])

    y = np.array(y_Cell_Type)

    # Without this check a longer y would be indexed silently with X's fold
    # indices, pairing samples with the wrong labels.
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"Feature matrix has {X.shape[0]} rows but {y.shape[0]} labels were given."
        )

    # The label is identical for every fold, so build it once.
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"

    # Initialize K-Fold cross-validation
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    all_reports = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"\nTraining on fold {fold}...")

        # Split data into training and testing sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # A fresh estimator per fold so no state leaks between folds
        model = DecisionTreeClassifier(random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        all_reports.append(
            classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        )
        scores.append({
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
            "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
            "F1": f1_score(y_test, y_pred, average='weighted', zero_division=0)
        })

    # Detailed per-class statistics reflect the final fold only (unchanged contract).
    statistics_DT = pd.DataFrame(all_reports[-1]).transpose()
    statistics_DT['Parameters'] = "Basic"
    statistics_DT['Applications'] = applications_label
    statistics_DT['Applications_Condition'] = f"{subject_label}_DT"

    # Average every metric over the folds
    average_scores = {metric: np.mean([score[metric] for score in scores]) for metric in scores[0]}
    print(f"\nAverage Test Scores across {n_splits} folds: {average_scores}")

    # Creating a summary DataFrame
    statistics_DT_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["DT"],
        "Parameters": ["Basic"],
        "Accuracy": [average_scores['Accuracy']],
        "Precision": [average_scores['Precision']],
        "Recall": [average_scores['Recall']],
        "F1": [average_scores['F1']],
    })

    # Packaging results
    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_DT,
        "Statistics_DF": statistics_DT_DF,
        "All_Reports": all_reports  # Include all reports for detailed analysis
    }

    return result_dict

# Utility function to convert dictionary to string (for completeness)
def dict_to_str(d):
    """Flatten a dict into one underscore-separated string.

    Every key/value pair is rendered as ``{key}_{value}`` and the pairs are
    joined with ``_`` in the dict's iteration order. The resulting string is
    printed and then returned.

    NOTE(review): this notebook defines ``dict_to_str`` in several cells with
    the same logic; this redefinition simply shadows the earlier one.
    """
    pieces = []
    for key, value in d.items():
        pieces.append(f'{key}_{value}')
    joined = '_'.join(pieces)
    print(joined)
    return joined


## 4-) `Random Forest`

In [4]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import torch

def prediction_model_RF_multilabel_basic(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension, 
                                         data_source = "one"):
    """Evaluate a RandomForestClassifier with shuffled 5-fold cross-validation.

    The forest is built with only ``random_state=42`` set; the result tables
    record its parameters as ``"Basic"``.

    NOTE(review): despite "multilabel" in the name, the estimator is fitted on
    a single target array, so this looks like a multiclass task -- confirm.

    Parameters
    ----------
    X_Gene_Marker_Embeddings :
        When ``data_source == "one"``: the feature matrix (torch tensor, numpy
        array, list/tuple of rows, or DataFrame). Otherwise: an iterable of
        items where ``item[0]`` becomes the first feature column and
        ``item[1]`` supplies the remaining columns (presumably an expression
        value and an embedding vector -- verify against the caller).
    y_Cell_Type :
        Target labels, one per row of the feature matrix.
    subject_label, subject_outlier, subject_normalization,
    subject_autoencoder, subject_dimension :
        Free-form tags that are only copied into the result tables.
    data_source : str, default "one"
        Selects how the features are assembled (see above).

    Returns
    -------
    dict
        ``"Model"``         estimator fitted on the LAST fold's training split.
        ``"Predictions"``   predictions for the LAST fold's test split.
        ``"Statistics"``    classification report of the LAST fold as a DataFrame,
                            with Parameters/Applications/Applications_Condition columns.
        ``"Statistics_DF"`` one-row DataFrame with fold-averaged accuracy and
                            weighted precision/recall/F1.
        ``"All_Reports"``   list with the classification-report dict of every fold.

    Raises
    ------
    ValueError
        If the feature matrix and the label array have different lengths.
    """
    if data_source == "one":
        X = X_Gene_Marker_Embeddings
        if isinstance(X, torch.Tensor):
            # Plain .numpy() raises for tensors that track gradients or live on
            # an accelerator; detach()/cpu() make the conversion safe.
            X = X.detach().cpu().numpy()
        elif isinstance(X, pd.DataFrame):
            # Fold indices are positional; DataFrame[...] would read them as column labels.
            X = X.to_numpy()
        elif isinstance(X, (list, tuple)):
            # Python sequences cannot be indexed with an index array.
            X = np.asarray(X)
    else:
        # Separate expression values and embeddings, and combine into a single array
        expressions = np.array([item[0] for item in X_Gene_Marker_Embeddings])
        embeddings  = np.array([item[1] for item in X_Gene_Marker_Embeddings])
        X = np.hstack([expressions.reshape(-1, 1), embeddings])

    y = np.array(y_Cell_Type)

    # Without this check a longer y would be indexed silently with X's fold
    # indices, pairing samples with the wrong labels.
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"Feature matrix has {X.shape[0]} rows but {y.shape[0]} labels were given."
        )

    # The label is identical for every fold, so build it once.
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"

    # Initialize K-Fold cross-validation
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    all_reports = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"\nTraining on fold {fold}...")

        # Split data into training and testing sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # A fresh estimator per fold so no state leaks between folds
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        all_reports.append(
            classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        )
        scores.append({
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
            "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
            "F1": f1_score(y_test, y_pred, average='weighted', zero_division=0)
        })

    # Detailed per-class statistics reflect the final fold only (unchanged contract).
    statistics_RF = pd.DataFrame(all_reports[-1]).transpose()
    statistics_RF['Parameters'] = "Basic"
    statistics_RF['Applications'] = applications_label
    statistics_RF['Applications_Condition'] = f"{subject_label}_RF"

    # Average every metric over the folds
    average_scores = {metric: np.mean([score[metric] for score in scores]) for metric in scores[0]}
    print(f"\nAverage Test Scores across {n_splits} folds: {average_scores}")

    # Creating a summary DataFrame
    statistics_RF_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["RF"],
        "Parameters": ["Basic"],
        "Accuracy": [average_scores['Accuracy']],
        "Precision": [average_scores['Precision']],
        "Recall": [average_scores['Recall']],
        "F1": [average_scores['F1']],
    })

    # Packaging results
    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_RF,
        "Statistics_DF": statistics_RF_DF,
        "All_Reports": all_reports  # Include all reports for detailed analysis
    }

    return result_dict

# Utility function to convert dictionary to string (for completeness)
def dict_to_str(d):
    """Flatten a dict into one underscore-separated string.

    Every key/value pair is rendered as ``{key}_{value}`` and the pairs are
    joined with ``_`` in the dict's iteration order. The resulting string is
    printed and then returned.

    NOTE(review): this notebook defines ``dict_to_str`` in several cells with
    the same logic; this redefinition simply shadows the earlier one.
    """
    pieces = []
    for key, value in d.items():
        pieces.append(f'{key}_{value}')
    joined = '_'.join(pieces)
    print(joined)
    return joined


## 5-) `Gradient Boosting`

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch

def prediction_model_GB_multilabel_basic(X_Gene_Marker_Embeddings, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension, 
                                         data_source = "one"):
    """Evaluate a GradientBoostingClassifier on a single 80/20 hold-out split.

    Unlike the K-fold evaluators in this notebook, this function trains once
    on a ``train_test_split`` (``test_size=0.2``, ``random_state=42``), so the
    reported metrics come from one split and no ``"All_Reports"`` entry is
    returned.

    NOTE(review): despite "multilabel" in the name, the estimator is fitted on
    a single target array, so this looks like a multiclass task -- confirm.

    Parameters
    ----------
    X_Gene_Marker_Embeddings :
        When ``data_source == "one"``: the feature matrix (a torch tensor is
        converted to numpy; anything else is passed on unchanged). Otherwise:
        an iterable of items where ``item[0]`` becomes the first feature
        column and ``item[1]`` supplies the remaining columns (presumably an
        expression value and an embedding vector -- verify against the caller).
    y_Cell_Type :
        Target labels, one per row of the feature matrix.
    subject_label, subject_outlier, subject_normalization,
    subject_autoencoder, subject_dimension :
        Free-form tags that are only copied into the result tables.
    data_source : str, default "one"
        Selects how the features are assembled (see above).

    Returns
    -------
    dict
        ``"Model"``         the fitted estimator.
        ``"Predictions"``   predictions for the hold-out split.
        ``"Statistics"``    classification report as a DataFrame, with
                            Parameters/Applications/Applications_Condition columns.
        ``"Statistics_DF"`` one-row DataFrame with accuracy and weighted
                            precision/recall/F1 on the hold-out split.
    """
    if data_source == "one":
        if isinstance(X_Gene_Marker_Embeddings, torch.Tensor):
            # Plain .numpy() raises for tensors that track gradients or live on
            # an accelerator; detach()/cpu() make the conversion safe.
            X = X_Gene_Marker_Embeddings.detach().cpu().numpy()
        else:
            X = X_Gene_Marker_Embeddings
    else:
        # Separate expression values and embeddings, and combine into a single array
        expressions = np.array([item[0] for item in X_Gene_Marker_Embeddings])
        embeddings  = np.array([item[1] for item in X_Gene_Marker_Embeddings])
        X = np.hstack([expressions.reshape(-1, 1), embeddings])

    y = np.array(y_Cell_Type)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Gradient Boosting model parameters
    gb_params = {
        'learning_rate': 0.2,
        'max_depth': 7,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'n_estimators': 100
    }

    # Build the labels once (dict_to_str also prints, so this avoids duplicate output).
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    parameters_label = dict_to_str(gb_params)

    # Initializing and training the GradientBoostingClassifier
    model = GradientBoostingClassifier(**gb_params, random_state=42)
    model.fit(X_train, y_train)

    # Making predictions on the test set
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Convert the classification report into a DataFrame for detailed statistics
    statistics_GB = pd.DataFrame(report).transpose()
    statistics_GB['Parameters']             = parameters_label
    statistics_GB['Applications']           = applications_label
    statistics_GB['Applications_Condition'] = f"{subject_label}_GB"

    # Creating a summary DataFrame for an overview of the model's performance
    statistics_GB_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["GB"],  # list, matching the summary tables of the sibling evaluators
        "Parameters": [parameters_label],
        "Accuracy": [accuracy_score(y_test, y_pred)],
        "Precision": [precision_score(y_test, y_pred, average='weighted', zero_division=0)],
        "Recall": [recall_score(y_test, y_pred, average='weighted', zero_division=0)],
        "F1": [f1_score(y_test, y_pred, average='weighted', zero_division=0)],
    })

    # Packaging results into a dictionary for easy access and interpretation
    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_GB,
        "Statistics_DF": statistics_GB_DF
    }

    return result_dict


# `End`