In [1]:
import numpy as np
from scipy.stats import chi2

def mahalanobis_outliers(data, threshold=0.01):
    """
    Detect multivariate outliers using Mahalanobis distance.

    Each row's Mahalanobis distance to the sample mean (under the sample
    covariance) is compared with the square root of the chi-square
    quantile at ``1 - threshold``, using the number of features as the
    degrees of freedom.

    Parameters:
    - data: list or numpy array of shape (n_samples, n_features)
    - threshold: significance level for chi-square distribution (default 0.01)

    Returns:
    - List of indices of detected outliers (empty when there are fewer
      than two samples, because no covariance can be estimated)

    Raises:
    - ValueError: if non-empty data is not 2-dimensional or contains
      NaN/inf values

    Notes:
    - The mean and covariance are estimated from *all* rows, including
      any outliers. With n samples, no row's distance can exceed
      (n - 1) / sqrt(n), so in very small samples even an extreme point
      may be impossible to flag at strict thresholds.
    - The pseudo-inverse of the covariance is used, so collinear or
      constant features do not raise a singular-matrix error.
    """
    data = np.asarray(data, dtype=float)
    if data.size == 0:
        return []
    if data.ndim != 2:
        raise ValueError(
            "data must be 2-dimensional with shape (n_samples, n_features)"
        )
    if not np.isfinite(data).all():
        raise ValueError("data must contain only finite values")

    n_samples, n_features = data.shape
    if n_samples < 2:
        # The sample covariance is undefined for a single observation.
        return []

    centered = data - data.mean(axis=0)

    # np.cov collapses to a 0-d array for a single feature; restore 2-D so
    # the linear-algebra routines accept it.
    cov_matrix = np.atleast_2d(np.cov(data, rowvar=False))
    inv_cov_matrix = np.linalg.pinv(cov_matrix)

    # Row-wise quadratic form diff @ inv_cov @ diff, computed for all rows
    # at once instead of looping in Python.
    squared = np.sum((centered @ inv_cov_matrix) * centered, axis=1)
    # Round-off can push a zero distance slightly negative; clamp before sqrt.
    distances = np.sqrt(np.maximum(squared, 0.0))

    # Determine cutoff distance from Chi-square distribution
    # (degrees of freedom = number of features).
    cutoff = np.sqrt(chi2.ppf(1 - threshold, n_features))

    return np.flatnonzero(distances > cutoff).tolist()

# Example usage: four tightly grouped points followed by one extreme point
# (index 4). With only five samples this prints an empty list: the extreme
# point dominates the mean and covariance it is measured against, so its
# distance stays below the chi-square cutoff and it is not flagged.
dataset = [[2, 3], [3, 4], [4, 5], [5, 6], [100, 100]]

outliers = mahalanobis_outliers(dataset, threshold=0.01)
print("Outlier indices:", outliers)


Outlier indices: []
