In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

def dict_to_str(d):
    """Render a mapping as ``key=value`` pairs joined by ``", "``.

    Pairs appear in the mapping's iteration order; an empty mapping yields
    an empty string.
    """
    pairs = []
    for key, value in d.items():
        pairs.append("{}={}".format(key, value))
    return ', '.join(pairs)

def _take_rows(data, index):
    """Return the rows of *data* at the positional indices in *index*."""
    # pandas objects interpret ``data[index]`` as a column/label lookup, so
    # positional row selection has to go through ``.iloc``.
    if hasattr(data, "iloc"):
        return data.iloc[index]
    return data[index]


def prediction_model_SVM_multilabel_expression_onehot(X, y, subject_label, subject_outlier, subject_normalization, subject_autoencoder, subject_dimension, n_splits=5):
    """Cross-validate a fixed-parameter polynomial SVC and summarise its scores.

    A shuffled ``KFold`` (seed 42) splits the data; on every fold a fresh
    ``SVC`` is trained and evaluated, and the per-fold accuracy and weighted
    precision/recall/F1 are averaged into a one-row summary.

    Args:
        X: Feature matrix with one row per sample. NumPy arrays, pandas
            DataFrames and plain lists/tuples of rows are accepted.
        y: Targets aligned with the rows of ``X``; converted to a NumPy array.
            NOTE(review): despite "onehot" in the function name, ``SVC.fit``
            expects 1-D class labels -- confirm callers decode one-hot
            targets before calling.
        subject_label: Label written to the ``Applications_Condition`` columns.
        subject_outlier: First component of the ``Applications`` label.
        subject_normalization: Third component of the ``Applications`` label.
        subject_autoencoder: Second component of the ``Applications`` label.
        subject_dimension: Fourth component of the ``Applications`` label.
        n_splits: Number of cross-validation folds.

    Returns:
        dict with the keys
        ``"Model"`` (estimator fitted on the last fold),
        ``"Predictions"`` (predictions for the last fold's test split),
        ``"Statistics"`` (classification report of the last fold as a
        DataFrame, plus ``Parameters``/``Applications``/
        ``Applications_Condition`` columns),
        ``"Statistics_DF"`` (one-row DataFrame of fold-averaged scores) and
        ``"All_Reports"`` (list of per-fold classification-report dicts).
    """
    y = np.array(y)
    # Plain Python sequences cannot be indexed with the index arrays that
    # KFold yields; DataFrames are handled by _take_rows instead.
    if isinstance(X, (list, tuple)):
        X = np.asarray(X)

    # Fixed SVM hyper-parameters used on every fold.
    svm_params = {
        'kernel': 'poly',
        'degree': 2,
        'coef0': 0.5,
        'C': 1,
        'gamma': 'scale'
    }

    # Labels are identical for every fold, so build them once.
    params_label = dict_to_str(svm_params)
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    all_reports = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"\nTraining on fold {fold}...")

        X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_test = y[train_index], y[test_index]

        # A fresh estimator per fold so no state leaks between folds.
        model = SVC(**svm_params, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        all_reports.append(
            classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        )
        scores.append({
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
            "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
            "F1": f1_score(y_test, y_pred, average='weighted', zero_division=0)
        })

    # Average every metric across the folds.
    average_scores = {metric: np.mean([score[metric] for score in scores]) for metric in scores[0]}
    print(f"\nAverage Test Scores across {n_splits} folds: {average_scores}")

    # Detailed per-class table; only the last fold's report is exposed here,
    # the full set is available under "All_Reports".
    statistics_SVM = pd.DataFrame(all_reports[-1]).transpose()
    statistics_SVM['Parameters'] = params_label
    statistics_SVM['Applications'] = applications_label
    statistics_SVM['Applications_Condition'] = f"{subject_label}_SVM"

    # One-row summary of the fold-averaged scores.
    statistics_SVM_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["SVM"],
        "Parameters": [params_label],
        "Accuracy": [average_scores['Accuracy']],
        "Precision": [average_scores['Precision']],
        "Recall": [average_scores['Recall']],
        "F1": [average_scores['F1']],
    })

    result_dict = {
        "Model": model,
        "Predictions": y_pred,
        "Statistics": statistics_SVM,
        "Statistics_DF": statistics_SVM_DF,
        "All_Reports": all_reports  # Include all reports for detailed analysis
    }

    return result_dict


In [None]:

# Define SVM function with GridSearchCV and Pipeline
def prediction_model_SVM_with_GridSearch(X, y, subject_label, subject_outlier, subject_autoencoder, subject_dimension, subject_normalization):
    """Grid-search a scaled, SMOTE-resampled one-vs-rest SVC and evaluate it.

    The data is split 80/20 (seed 42). A pipeline of ``StandardScaler`` ->
    ``SMOTE`` -> ``OneVsRestClassifier(SVC)`` is tuned with 5-fold stratified
    cross-validation on the training part, refit on the best weighted-F1
    combination, and then scored on the held-out 20%.

    Args:
        X: Feature matrix with one row per sample.
        y: Class labels aligned with ``X``; converted to a NumPy array.
        subject_label: Label written to the ``Applications_Condition`` columns.
        subject_outlier: First component of the ``Applications`` label.
        subject_autoencoder: Second component of the ``Applications`` label.
        subject_dimension: Fourth component of the ``Applications`` label.
        subject_normalization: Third component of the ``Applications`` label.

    Returns:
        dict with the keys
        ``"Model"`` (best refit pipeline),
        ``"Predictions"`` (its predictions on the hold-out split),
        ``"Statistics"`` (hold-out classification report as a DataFrame with
        ``Parameters``/``Applications``/``Applications_Condition`` columns),
        ``"Statistics_DF"`` (one-row hold-out summary of the best model) and
        ``"All_Results"`` (one row of mean cross-validation scores per
        parameter combination).
    """
    # Step 1: Data preprocessing
    y = np.array(y)

    # The same label is used in every result table, so build it once.
    applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
    condition_label = f"{subject_label}_SVM"

    # Hold out 20% of the samples for the final evaluation.
    X_SVM_train, X_SVM_test, y_SVM_train, y_SVM_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 2: Pipeline -- scaling and resampling happen inside the pipeline so
    # they are fitted on the training folds only during cross-validation.
    pipeline = imblearnPipeline([
        ('scaler', StandardScaler()),  # Feature scaling
        ('smote', SMOTE(random_state=42)),  # Oversampling for class imbalance
        ('svc', OneVsRestClassifier(SVC(random_state=42, class_weight='balanced'), n_jobs=-1))  # SVM classifier
    ])

    # Step 3: Hyper-parameter grid
    # NOTE(review): per the SVC documentation ``degree`` and ``coef0`` do not
    # affect the 'rbf' kernel, so the rbf rows of this grid repeat the same
    # fit -- consider a list of per-kernel grids if run time matters.
    param_grid = {
        'svc__estimator__degree': [2, 3],
        'svc__estimator__kernel': ['poly', 'rbf'],
        'svc__estimator__coef0': [0.5, 1],
        'svc__estimator__C': [1, 10],
        'svc__estimator__gamma': ['scale']
    }

    # Step 4: Cross-validation setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Step 5: Scoring metrics. ``zero_division=0`` matches the other metric
    # calls in this file and avoids an UndefinedMetricWarning on folds where
    # a class is never predicted; the resulting scores are unchanged.
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
        'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
        'f1_weighted': make_scorer(f1_score, average='weighted', zero_division=0),
        'balanced_accuracy': 'balanced_accuracy'
    }

    # Step 6: Grid search, refitting on the best weighted-F1 combination
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1_weighted', verbose=1, n_jobs=-1)
    grid_search.fit(X_SVM_train, y_SVM_train)

    # Step 7: Model selection
    best_model = grid_search.best_estimator_
    best_params_label = dict_to_str(grid_search.best_params_)

    # Step 8: Evaluation on the hold-out split
    y_pred = best_model.predict(X_SVM_test)
    report = classification_report(y_SVM_test, y_pred, output_dict=True, zero_division=0)
    statistics_SVM = pd.DataFrame(report).transpose()

    # Step 9: One row of mean CV scores per parameter combination. The
    # cv_results_ entries are parallel sequences, so walk them in lockstep
    # instead of searching the params list for every row.
    cv_results = grid_search.cv_results_
    all_results = [
        {
            "Applications": applications_label,
            "Applications_Condition": condition_label,
            "Model": "SVM",
            "Parameters": dict_to_str(params),
            "Accuracy": mean_accuracy,
            "Precision": mean_precision,
            "Recall": mean_recall,
            "F1": mean_f1,
        }
        for params, mean_accuracy, mean_precision, mean_recall, mean_f1 in zip(
            cv_results['params'],
            cv_results['mean_test_accuracy'],
            cv_results['mean_test_precision_macro'],
            cv_results['mean_test_recall_macro'],
            cv_results['mean_test_f1_weighted'],
        )
    ]
    statistics_SVM_DF = pd.DataFrame(all_results)

    # Detailed hold-out results for the best model. Parameters are formatted
    # with dict_to_str so every result table uses the same representation.
    statistics_SVM['Parameters'] = best_params_label
    statistics_SVM['Applications'] = applications_label
    statistics_SVM['Applications_Condition'] = condition_label
    statistics_SVM_Detailed = statistics_SVM[['precision', 'recall', 'f1-score', 'support', 'Parameters', 'Applications', 'Applications_Condition']]

    print(f"Model Label: {applications_label}")
    print(statistics_SVM_Detailed)

    # Step 10: One-row summary for the best model. The "accuracy" row of the
    # transposed report carries the accuracy value in every metric column.
    summary_statistics_SVM_DF = pd.DataFrame({
        "Applications": [applications_label],
        "Applications_Condition": [subject_label],
        "Model": ["SVM"],
        "Parameters": [best_params_label],
        "Accuracy": [statistics_SVM_Detailed.loc["accuracy", "f1-score"]],
        "Precision": [statistics_SVM_Detailed.loc["weighted avg", "precision"]],
        "Recall": [statistics_SVM_Detailed.loc["weighted avg", "recall"]],
        "F1": [statistics_SVM_Detailed.loc["weighted avg", "f1-score"]],
    })

    result_dict = {
        "Model": best_model,
        "Predictions": y_pred,
        "Statistics": statistics_SVM_Detailed,
        "Statistics_DF": summary_statistics_SVM_DF,
        "All_Results": statistics_SVM_DF  # Include all combinations' results
    }

    return result_dict
