In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import torch

def dict_to_str(d):
    """Flatten a mapping into one underscore-joined ``key_value`` string.

    Every item is rendered as ``<key>_<value>`` and the pieces are joined with
    ``_`` in the mapping's iteration order. The resulting string is echoed to
    stdout and then returned.
    """
    pieces = []
    for key, value in d.items():
        pieces.append(f'{key}_{value}')
    joined = '_'.join(pieces)
    print(joined)
    return joined

def prediction_model_SGD_multilabel_basic(X_Embeddings_Autoencoder, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension, data_source="one"):
    """Train an SGDClassifier with fixed hyperparameters and report test metrics.

    The embeddings are converted to a NumPy feature matrix, split 80/20 into
    train and test sets (``random_state=42``), and an ``SGDClassifier`` with
    ``alpha=0.01, max_iter=1000, penalty='l2', tol=0.0001`` is fitted on the
    training part and evaluated on the held-out part.

    Args:
        X_Embeddings_Autoencoder: Features as a ``torch.Tensor`` or any
            array-like. One-dimensional input is treated as a single feature
            column; input that is already two-dimensional is used as is.
        y_Cell_Type: Target labels, one per sample.
        subject_label: Label stored in the ``Applications_Condition`` columns.
        subject_outlier: Component of the ``Applications`` label.
        subject_normalization: Component of the ``Applications`` label.
        subject_autoencoder: Component of the ``Applications`` label.
        subject_dimension: Component of the ``Applications`` label.
        data_source: Unused; retained so existing callers keep working.

    Returns:
        dict with keys ``"Model"`` (the fitted classifier), ``"Predictions"``
        (predictions for the test split), ``"Statistics"`` (the per-class
        classification report as a DataFrame with label columns added) and
        ``"Statistics_DF"`` (a one-row DataFrame with accuracy and weighted
        precision / recall / F1).
    """
    if isinstance(X_Embeddings_Autoencoder, torch.Tensor):
        # detach() and cpu() keep the conversion working for tensors that
        # track gradients or live on an accelerator; a bare .numpy() raises
        # for both.
        X = X_Embeddings_Autoencoder.detach().cpu().numpy()
        print("Way A.1")
    else:
        X = np.asarray(X_Embeddings_Autoencoder)
        print("Way A.2")

    # Only a 1-D vector needs to become a column. Unconditionally reshaping to
    # (-1, 1) would flatten an (n_samples, n_features) matrix into a single
    # column whose length no longer matches y.
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    y = np.array(y_Cell_Type)

    # Hold out 20% of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fixed hyperparameters; no search is performed in this basic variant.
    sgd_params = {'alpha': 0.01, 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
    model = SGDClassifier(**sgd_params, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # One row per class plus the aggregate rows of the report.
    statistics_SGD = pd.DataFrame(report).transpose()

    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    # Build the parameter string once; dict_to_str also prints it, so calling
    # it for every use would repeat the same output.
    params_label = dict_to_str(sgd_params)

    statistics_SGD['Parameters'] = params_label
    statistics_SGD['Applications'] = applications_label
    statistics_SGD['Applications_Condition'] = f"{subject_label}_SGD"

    # One-row summary of the overall scores.
    statistics_SGD_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": "SGD",
        "Parameters": [params_label],
        "Accuracy": [accuracy_score(y_test, y_pred)],
        "Precision": [precision_score(y_test, y_pred, average='weighted', zero_division=0)],
        "Recall": [recall_score(y_test, y_pred, average='weighted', zero_division=0)],
        "F1": [f1_score(y_test, y_pred, average='weighted', zero_division=0)],
    })

    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_SGD,
        "Statistics_DF": statistics_SGD_DF
    }

    return result_dict


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np

def dict_to_str(d):
    """Render a mapping as ``key_value`` pairs joined by underscores.

    Items appear in the mapping's iteration order. The string is printed to
    stdout and also returned to the caller.
    """
    text = '_'.join(f'{name}_{setting}' for name, setting in d.items())
    print(text)
    return text

def prediction_model_SGD_multilabel(X_Embeddings_Autoencoder, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension):
    """Grid-search a scaled, SMOTE-balanced one-vs-rest SGD classifier and report metrics.

    The data is split 80/20 (``random_state=42``). A pipeline of
    ``StandardScaler`` -> ``SMOTE`` -> ``OneVsRestClassifier(SGDClassifier)``
    is tuned with 5-fold ``GridSearchCV`` scored by ``f1_weighted`` on the
    training part, and the best estimator is evaluated on the held-out part.

    NOTE(review): the function name says "multilabel", but the targets are
    passed through as a single array; presumably there is one class label per
    sample -- confirm against the callers.

    Args:
        X_Embeddings_Autoencoder: Feature data. A ``pd.Series`` or ``list`` is
            converted to a NumPy array, and any 1-D array is treated as a
            single feature column. Other inputs (e.g. a 2-D array) are passed
            on unchanged.
        y_Cell_Type: Target labels, one per sample.
        subject_label: Label stored in the condition columns.
        subject_outlier: Component of the model label.
        subject_normalization: Component of the model label.
        subject_autoencoder: Component of the model label.
        subject_dimension: Component of the model label.

    Returns:
        dict with keys ``"Model"`` (best fitted pipeline), ``"Predictions"``
        (predictions for the test split), ``"Statistics"`` (the per-class
        classification report as a DataFrame with label columns added) and
        ``"Statistics_DF"`` (a one-row DataFrame with accuracy and weighted
        precision / recall / F1).
    """
    # Series and lists are turned into arrays first.
    if isinstance(X_Embeddings_Autoencoder, (pd.Series, list)):
        X_Embeddings_Autoencoder = np.asarray(X_Embeddings_Autoencoder)

    # Only a 1-D vector is reshaped into a column. A nested list that is
    # already (n_samples, n_features) must keep its shape; reshaping it to
    # (-1, 1) would produce more rows than there are labels.
    if isinstance(X_Embeddings_Autoencoder, np.ndarray) and X_Embeddings_Autoencoder.ndim == 1:
        X_Embeddings_Autoencoder = X_Embeddings_Autoencoder.reshape(-1, 1)

    y_Cell_Type = np.array(y_Cell_Type)

    # Step 1: hold out 20% of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X_Embeddings_Autoencoder, y_Cell_Type, test_size=0.2, random_state=42
    )

    # Step 2: preprocessing + model pipeline. The imblearn pipeline is used so
    # that SMOTE resampling is applied when fitting only.
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('sgd', OneVsRestClassifier(SGDClassifier(loss='log_loss', random_state=42)))
    ])

    # Step 3: hyperparameter grid. It is currently pinned to one configuration;
    # wider candidates considered earlier were alpha in [0.0001, 0.001, 0.01],
    # max_iter in [1000, 2000], tol in [1e-3, 1e-4] and
    # penalty in ['l2', 'l1', 'elasticnet'].
    param_grid = {
        'sgd__estimator__alpha': [0.001],
        'sgd__estimator__max_iter': [1000],
        'sgd__estimator__tol': [1e-3],
        'sgd__estimator__penalty': ['elasticnet']
    }

    # Step 4: cross-validated search on the training split.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Step 5: best pipeline found by the search.
    best_model = grid_search.best_estimator_

    # Step 6: evaluate on the held-out split.
    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    statistics_SGD = pd.DataFrame(report).transpose()

    # Step 7: attach identifying labels to the per-class statistics.
    model_label = f"Outlier_{subject_outlier}_Normalization_{subject_normalization}_Autoencoder_{subject_autoencoder}_Dimension_{subject_dimension}"
    best_params_label = str(grid_search.best_params_)
    statistics_SGD['Parameters'] = best_params_label
    statistics_SGD['Model'] = model_label
    statistics_SGD['Model_Condition'] = f"{subject_label}_SGD"

    # Step 8: one-row summary. zero_division=0 matches the classification
    # report above: classes that are never predicted score 0 without raising
    # an UndefinedMetricWarning.
    statistics_SGD_DF = pd.DataFrame({
        "Applications": [model_label],
        "Applications_Condition": [subject_label],
        "Model": "SGD",
        "Parameters": [best_params_label],
        "Accuracy": [accuracy_score(y_test, y_pred)],
        "Precision": [precision_score(y_test, y_pred, average='weighted', zero_division=0)],
        "Recall": [recall_score(y_test, y_pred, average='weighted', zero_division=0)],
        "F1": [f1_score(y_test, y_pred, average='weighted', zero_division=0)],
    })

    # Step 9: package the model, predictions and metrics for the caller.
    result_dict = {
        "Model": best_model,
        "Predictions": y_pred,
        "Statistics": statistics_SGD,
        "Statistics_DF": statistics_SGD_DF
    }

    return result_dict
