In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch

def prediction_model_SVM_multilabel_basic(X_Embeddings_Autoencoder, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension, data_source="one"):
    """Train a fixed-parameter polynomial SVM on embeddings and report test metrics.

    The data is split 80/20 (``random_state=42``), an ``SVC`` with a degree-2
    polynomial kernel is fitted on the training part, and the held-out part is
    scored.

    Args:
        X_Embeddings_Autoencoder: Feature matrix. May be a ``torch.Tensor``,
            a pandas Series/DataFrame (stacked row-wise via ``.values``), or
            anything ``np.asarray`` can convert (ndarray, nested list).
        y_Cell_Type: Target labels, one per row of the feature matrix.
        subject_label: Label stored in the "Applications_Condition" columns.
        subject_outlier: First component of the "Applications" label.
        subject_normalization: Third component of the "Applications" label.
        subject_autoencoder: Second component of the "Applications" label.
        subject_dimension: Fourth component of the "Applications" label.
        data_source: Currently unused; kept so existing callers keep working.

    Returns:
        dict with the keys:
            "Model": the fitted ``SVC``.
            "Predictions": predictions for the test split.
            "Statistics": per-class ``classification_report`` as a DataFrame,
                with "Parameters", "Applications" and "Applications_Condition"
                columns appended.
            "Statistics_DF": one-row DataFrame with accuracy and weighted
                precision / recall / F1.
    """
    # Bring the features into a plain numpy array, whatever container they arrive in.
    if isinstance(X_Embeddings_Autoencoder, torch.Tensor):
        # .numpy() raises on tensors that require grad or live on an
        # accelerator, so detach and move to host memory first.
        X = X_Embeddings_Autoencoder.detach().cpu().numpy()
        print("Way A.1")
    elif isinstance(X_Embeddings_Autoencoder, (pd.Series, pd.DataFrame)):
        X = np.stack(X_Embeddings_Autoencoder.values)
        print("Way A.2")
    else:
        # ndarray / list input has no `.values`; convert it directly.
        X = np.asarray(X_Embeddings_Autoencoder)

    y = np.array(y_Cell_Type)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the SVM with specified parameters
    svm_params = {
        'kernel': 'poly',
        'degree': 2,
        'coef0': 0.5,
        'C': 1,
        'gamma': 'scale'
    }

    # Initializing and training the SVM
    model = SVC(**svm_params, random_state=42)
    model.fit(X_train, y_train)

    # Making predictions
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Convert the classification report into a DataFrame (one row per class / average)
    statistics_SVM = pd.DataFrame(report).transpose()

    # Build the labels once so the detailed and summary tables cannot drift apart.
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    parameters_label = dict_to_str(svm_params)

    statistics_SVM['Parameters'] = parameters_label
    statistics_SVM['Applications'] = applications_label
    statistics_SVM['Applications_Condition'] = f"{subject_label}_SVM"

    # Creating a summary DataFrame
    statistics_SVM_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["SVM"],
        "Parameters": [parameters_label],
        "Accuracy": [accuracy_score(y_test, y_pred)],
        "Precision": [precision_score(y_test, y_pred, average='weighted', zero_division=0)],
        "Recall": [recall_score(y_test, y_pred, average='weighted', zero_division=0)],
        "F1": [f1_score(y_test, y_pred, average='weighted', zero_division=0)],
    })

    # Packaging results
    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_SVM,
        "Statistics_DF": statistics_SVM_DF
    }

    return result_dict

# Function to convert dictionary to string
def dict_to_str(d):
    """Flatten a mapping into a single ``key_value_key_value`` string.

    Each key and value is formatted with its default string form, the pairs
    are joined with underscores in the mapping's iteration order, and the
    resulting string is echoed to stdout before being returned.
    """
    pairs = (f'{name}_{setting}' for name, setting in d.items())
    flattened = '_'.join(pairs)
    print(flattened)
    return flattened


In [None]:
# Required libraries
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np
import os
import pickle

def prediction_model_SVM_with_GridSearch(X_Embeddings_Autoencoder, y_Cell_Type, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension):
    """Tune a one-vs-rest SVM pipeline with grid search and report test metrics.

    The data is split 80/20 (``random_state=42``). A pipeline of
    ``StandardScaler`` -> ``MinMaxScaler`` -> ``SMOTE`` -> one-vs-rest ``SVC``
    is tuned with 5-fold stratified cross-validation on the training part,
    refitting on the configuration with the best weighted F1. The refitted
    pipeline is then scored on the held-out part.

    Args:
        X_Embeddings_Autoencoder: Feature matrix. A DataFrame is converted
            with ``to_numpy()``; a Series is converted from its list of
            elements; any other input is passed to scikit-learn unchanged.
        y_Cell_Type: Target labels; a Series is converted to a numpy array.
        subject_label: Label stored in the condition columns.
        subject_outlier: First component of the model label.
        subject_normalization: Third component of the model label.
        subject_autoencoder: Second component of the model label.
        subject_dimension: Fourth component of the model label.

    Returns:
        dict with the keys:
            "Model": the best pipeline found by the grid search.
            "Predictions": predictions for the test split.
            "Statistics": per-class ``classification_report`` as a DataFrame,
                with "Parameters", "Model" and "Model_Condition" columns.
            "Statistics_DF": one-row DataFrame with accuracy and weighted
                precision / recall / F1.
    """
    # Convert pandas containers to numpy arrays.
    if isinstance(X_Embeddings_Autoencoder, pd.DataFrame):
        # DataFrame has no .tolist(); to_numpy() yields the 2-D value matrix.
        X_Embeddings_Autoencoder = X_Embeddings_Autoencoder.to_numpy()
    elif isinstance(X_Embeddings_Autoencoder, pd.Series):
        X_Embeddings_Autoencoder = np.array(X_Embeddings_Autoencoder.tolist())
    if isinstance(y_Cell_Type, pd.Series):
        y_Cell_Type = y_Cell_Type.to_numpy()

    # Step 1: Data Preprocessing
    # Splitting the data into training and testing sets
    X_SVM_train, X_SVM_test, y_SVM_train, y_SVM_test = train_test_split(
        X_Embeddings_Autoencoder, y_Cell_Type, test_size=0.2, random_state=42
    )

    # Step 2: Setting up the Machine Learning Pipeline
    # The imblearn pipeline applies SMOTE only while fitting, so resampling
    # never touches validation folds or the test split.
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),  # Standardization
        ('normalizer', MinMaxScaler()),  # Rescaling to [0, 1] - optional
        ('smote', SMOTE(random_state=42)),  # Handling class imbalance
        ('svc', OneVsRestClassifier(SVC(random_state=42, class_weight='balanced'), n_jobs=-1))  # SVM classifier
    ])

    # Step 3: Defining Hyperparameters for Grid Search
    # NOTE: per the SVC docs `degree` only affects the 'poly' kernel and
    # `coef0` only 'poly'/'sigmoid', so this full cross product also refits
    # equivalent models for the other kernels.
    param_grid = {
        'svc__estimator__degree': [2, 3, 4],
        'svc__estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svc__estimator__coef0': [0.0, 0.5, 1.0],
        'svc__estimator__C': [0.1, 1, 10, 100, 1000],
        'svc__estimator__gamma': ['scale', 'auto'],
    }

    # Step 4: Cross-Validation Setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Step 5: Scoring Metrics
    # zero_division=0 yields the same scores as the default but without a
    # warning for every fold in which some class is never predicted.
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
        'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
        'f1_weighted': make_scorer(f1_score, average='weighted', zero_division=0),
        'balanced_accuracy': 'balanced_accuracy'
    }

    # Step 6: Performing Grid Search
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_SVM_train, y_SVM_train)  # Training

    # Step 7: Model Selection
    best_model = grid_search.best_estimator_
    best_params_label = str(grid_search.best_params_)

    # Step 8: Model Evaluation
    # Evaluating the best model on the test set
    y_pred = best_model.predict(X_SVM_test)
    report = classification_report(y_SVM_test, y_pred, output_dict=True, zero_division=0)
    statistics_SVM = pd.DataFrame(report).transpose()

    # Step 9: Results Compilation
    model_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    statistics_SVM['Parameters'] = best_params_label
    statistics_SVM['Model'] = model_label
    statistics_SVM['Model_Condition'] = f"{subject_label}_SVM"

    statistics_SVM_Detailed = statistics_SVM[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Model', 'Model_Condition']]

    # Displaying the detailed results
    print(f"Model Label: {model_label}")
    print(statistics_SVM_Detailed)

    # Step 10: Summary DataFrame
    # Accuracy is computed directly instead of being read from the report
    # table, where it only appears through scalar broadcasting.
    statistics_SVM_DF = pd.DataFrame({
        "Applications": [model_label],
        "Applications_Condition": [subject_label],
        "Model": ["SVM"],
        "Parameters": [best_params_label],
        "Accuracy": [accuracy_score(y_SVM_test, y_pred)],
        "Precision": [statistics_SVM.loc["weighted avg", "precision"]],
        "Recall": [statistics_SVM.loc["weighted avg", "recall"]],
        "F1": [statistics_SVM.loc["weighted avg", "f1-score"]],
    })

    # Packaging the results into a dictionary
    result_dict = {
        "Model": best_model,
        "Predictions": y_pred,
        "Statistics": statistics_SVM_Detailed,
        "Statistics_DF": statistics_SVM_DF
    }

    return result_dict
