#### Import packages

In [7]:
import os
import glob
import rasterio
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score, confusion_matrix
import joblib
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt # For Confusion Matrix visualization
import seaborn as sns
from pathlib import Path

#### Build SVM Model

In [4]:
def train_and_evaluate_svm(X, y, X_test, y_test, joblib_file, k_folds=5, kernel='rbf', C=1.0, gamma='scale', class_weight='balanced'):
    """Cross-validate, fit and test an SVM classifier inside a single MLflow run.

    The function logs the hyper-parameters, runs shuffled k-fold
    cross-validation (weighted F1) on ``X``/``y``, refits the model on all of
    ``X``/``y``, scores it on ``X_test``/``y_test`` and logs the resulting
    metrics, classification report, confusion-matrix image and serialized
    model as MLflow metrics/artifacts.

    Args:
        X: Feature matrix used for cross-validation and the final fit.
        y: Labels aligned with ``X``.
        X_test: Held-out feature matrix used only for the final evaluation.
        y_test: Labels aligned with ``X_test``.
        joblib_file: Path the fitted model is written to with ``joblib.dump``;
            missing parent directories are created.
        k_folds: Number of cross-validation splits.
        kernel: Forwarded to ``SVC(kernel=...)``.
        C: Forwarded to ``SVC(C=...)``.
        gamma: Forwarded to ``SVC(gamma=...)``.
        class_weight: Forwarded to ``SVC(class_weight=...)``.

    Returns:
        The ``SVC`` instance fitted on ``X``/``y``.

    Side effects:
        Writes ``confusion_matrix.png`` to the current working directory and
        the model to ``joblib_file``; prints the cross-validation summary.
    """
    with mlflow.start_run():
        # Record the hyper-parameters of this run in one batch.
        mlflow.log_params({
            "kernel": kernel,
            "C": C,
            "gamma": gamma,
            "k_folds": k_folds,
            "class_weight": class_weight,
        })

        # Create the SVM model
        model = SVC(kernel=kernel, C=C, gamma=gamma, random_state=42, class_weight=class_weight)

        # Shuffled k-fold cross-validation scored with weighted F1.
        # NOTE(review): KFold does not stratify by class; if the labels are
        # imbalanced, StratifiedKFold may give more stable fold scores.
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X, y, cv=kf, scoring='f1_weighted')
        mean_cv_f1 = np.mean(cv_scores)
        std_cv_f1 = np.std(cv_scores)
        mlflow.log_metrics({"mean_cv_f1": mean_cv_f1, "std_cv_f1": std_cv_f1})
        print(f"Cross-validation F1 scores: {cv_scores}")
        print(f"Mean cross-validation F1 accuracy: {mean_cv_f1}")
        print(f"Standard deviation of cross-validation F1 accuracy: {std_cv_f1}")

        # cross_val_score fits clones, so the model itself is still unfitted:
        # train it on the entire training + validation dataset.
        model.fit(X, y)

        # Evaluate the model on the test set
        y_test_pred = model.predict(X_test)
        test_report = classification_report(y_test, y_test_pred)
        mlflow.log_metrics({
            "test_accuracy": accuracy_score(y_test, y_test_pred),
            "test_precision": precision_score(y_test, y_test_pred, average='weighted'),
            "test_recall": recall_score(y_test, y_test_pred, average='weighted'),
            "test_f1": f1_score(y_test, y_test_pred, average='weighted'),
        })

        # Log the classification report
        mlflow.log_text(test_report, "test_classification_report.txt")

        # Fix the label order explicitly (sorted union of true and predicted
        # labels) so the matrix rows/columns and the tick labels on the plot
        # are guaranteed to refer to the same classes.
        labels = np.unique(np.concatenate((np.ravel(y_test), np.ravel(y_test_pred))))
        confusion = confusion_matrix(y_test, y_test_pred, labels=labels)
        tick_labels = labels.tolist()

        # Log confusion matrix as an image, with the real class labels on the
        # axes instead of positional indices.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
            confusion,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels,
            ax=ax,
        )
        ax.set_title('Confusion Matrix')
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')
        fig.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        # The figure is left open on purpose so notebook/inline backends still
        # render it; long-running scripts calling this repeatedly should
        # plt.close() figures themselves.

        # Save the model; create the target directory first so a fresh
        # output location does not make joblib.dump fail.
        Path(joblib_file).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(model, joblib_file)
        mlflow.log_artifact(joblib_file)
        print(f"Saved model to: {joblib_file}")

        return model


####  RUN

In [None]:

# Read the already-preprocessed train / validation / test splits from .npy files
X_train, y_train, X_val, y_val, X_test, y_test = (
    np.load(Path(npy_path))
    for npy_path in (
        "../../../data/ML/X_train_ml.npy",
        "../../../data/ML/y_train_ml.npy",
        "../../../data/ML/X_val_ml.npy",
        "../../../data/ML/y_val_ml.npy",
        "../../../data/ML/X_test_ml.npy",
        "../../../data/ML/y_test_ml.npy",
    )
)

# Cross-validation runs over train + validation together, so join them
# along the first axis
X = np.concatenate([X_train, X_val])
y = np.concatenate([y_train, y_val])

# Fit the SVM, evaluate it on the test split and persist it
joblib_file = "svm_model.joblib"
trained_model = train_and_evaluate_svm(
    X=X,
    y=y,
    X_test=X_test,
    y_test=y_test,
    joblib_file=joblib_file,
    k_folds=5,
)

: 