In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [20]:
# Read the table from CSV, using its first column as the row index
df = pd.read_csv('task_3.csv', index_col=0)

# Keep the first 59 columns and transpose them, so that each sample becomes a
# row and each ASV becomes a column.
# NOTE(review): 59 is hard-coded — confirm it matches the number of sample
# columns actually present in task_3.csv.
samples_df = df.iloc[:, :59].transpose()

# Environment label (NG, NZ, XZ) = the part of each sample name before the first '_'
sample_names = samples_df.index
labels = sample_names.str.split('_').str.get(0)

In [21]:
# Cross-validation setup: stratified folds, shuffled with a fixed seed so the
# fold assignment is the same on every run
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    random_state=42,
    shuffle=True,
)

# Per-fold results are collected in these two lists
accuracy_scores, class_reports = [], []

In [22]:
# Cross-validation loop
#
# For every stratified fold: scale the features, fit a random forest on the
# training part, predict the held-out part, and record the fold's accuracy and
# classification report.
#
# The accumulators are (re)initialised here so that this cell is idempotent:
# re-running it on its own must not append a second set of fold results to
# lists still alive in the kernel, which would silently skew every aggregate
# computed from them afterwards.
accuracy_scores = []
class_reports = []

for train_idx, test_idx in skf.split(samples_df, labels):
    # Split data into training and validation sets (positional indices)
    X_train, X_test = samples_df.iloc[train_idx], samples_df.iloc[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Standardize features; the scaler is fitted on the training rows only and
    # then applied unchanged to the held-out rows, so no test-set statistics
    # leak into the model
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train a fresh model for this fold (fixed seed for reproducibility)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Predict and evaluate; output_dict=True returns the report as a nested
    # dict so its entries (e.g. 'macro avg') can be aggregated across folds
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Store results
    accuracy_scores.append(accuracy)
    class_reports.append(report)

# After the loop, y_test / y_pred still hold the values of the final fold;
# the example report printed later in the notebook reads them.

In [23]:
# Summarise the per-fold accuracies: mean and standard deviation
# (NumPy's default ddof=0, i.e. the population standard deviation)
fold_accuracies = np.asarray(accuracy_scores)
mean_accuracy = fold_accuracies.mean()
std_accuracy = fold_accuracies.std()

In [24]:
# Report the cross-validated accuracy as mean ± std over the folds
header_line = f"Cross-Validation Results ({n_splits}-fold):"
accuracy_line = f"Mean Accuracy: {mean_accuracy:.3f} ± {std_accuracy:.3f}\n"
print(header_line, accuracy_line, sep="\n")

Cross-Validation Results (5-fold):
Mean Accuracy: 0.983 ± 0.033



In [25]:
# Macro-averaged precision / recall / F1, averaged over the folds.
# Each stored report is a nested dict; its 'macro avg' entry holds the
# unweighted mean of the per-class scores for that fold.
macro_rows = [fold_report['macro avg'] for fold_report in class_reports]
avg_precision = np.mean([row['precision'] for row in macro_rows])
avg_recall = np.mean([row['recall'] for row in macro_rows])
avg_f1 = np.mean([row['f1-score'] for row in macro_rows])

print("Macro-Averaged Metrics:")
print(f"Precision: {avg_precision:.3f}, Recall: {avg_recall:.3f}, F1-Score: {avg_f1:.3f}")

# Text report for a single fold as an example. y_test and y_pred are the
# variables left behind by the final iteration of the cross-validation loop.
last_fold_report = classification_report(y_test, y_pred)
print("\nExample Classification Report (Last Fold):")
print(last_fold_report)

Macro-Averaged Metrics:
Precision: 0.989, Recall: 0.967, F1-Score: 0.972

Example Classification Report (Last Fold):
              precision    recall  f1-score   support

          NG       1.00      1.00      1.00         1
          NZ       1.00      1.00      1.00         5
          XZ       1.00      1.00      1.00         5

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

