In [2]:
# Question: Multivariate Outlier Detection Using Mahalanobis Distance
# Description: Implement Mahalanobis distance to detect multivariate outliers in a dataset.

import numpy as np
import pandas as pd
from scipy.stats import chi2

# Toy dataset with two features (the same steps apply to more columns)
data = dict(
    X1=[2, 3, 4, 5, 6, 7, 50],  # 50 is an outlier
    X2=[8, 7, 6, 5, 4, 3, 1],
)
df = pd.DataFrame(data)

# Location and scatter estimates used by the Mahalanobis distance:
# the column-wise mean, the sample covariance (rows are observations,
# columns are variables) and its inverse.
samples = df.to_numpy()
mean_vec = df.mean().to_numpy()
cov_matrix = np.cov(samples, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Compute Mahalanobis distance for each point
def mahalanobis_distance(row, mean, inv_cov):
    """Return the Mahalanobis distance of one observation from ``mean``.

    Computes ``sqrt((row - mean)^T @ inv_cov @ (row - mean))``.

    Args:
        row: Observation vector.
        mean: Mean vector, subtracted element-wise from ``row``.
        inv_cov: Inverse covariance matrix applied to the centered vector.

    Returns:
        The (unsquared) distance as computed by ``np.sqrt``.
    """
    delta = row - mean
    # Evaluate the quadratic form left to right: (delta^T @ inv_cov) @ delta.
    projected = delta.T @ inv_cov
    squared_distance = projected @ delta
    return np.sqrt(squared_distance)

# Capture the number of features BEFORE any result column is added: the
# chi-square degrees of freedom must equal the number of variables that enter
# the distance, not the DataFrame width after 'Mahalanobis_D' is appended.
n_features = df.shape[1]

df['Mahalanobis_D'] = df.apply(
    lambda row: mahalanobis_distance(row.values, mean_vec, inv_cov_matrix),
    axis=1,
)

# Set threshold based on Chi-square distribution (p=0.99)
# The chi-square quantile applies to the squared distance, so take its square
# root to compare against the unsquared values stored in 'Mahalanobis_D'.
threshold = np.sqrt(chi2.ppf(0.99, df=n_features))

# NOTE: when the mean and sample covariance are estimated from the same n
# points being scored, no distance can exceed (n - 1) / sqrt(n) -- about 2.27
# for n = 7 -- which is below this threshold (about 3.03). The extreme point
# inflates its own covariance estimate (masking), so nothing is flagged on
# this tiny sample; a larger sample or a robust mean/covariance estimate is
# needed for the cutoff to be reachable.

# Detect outliers
df['Outlier'] = df['Mahalanobis_D'] > threshold

print(df)

   X1  X2  Mahalanobis_D  Outlier
0   2   8       1.511858    False
1   3   7       0.956183    False
2   4   6       0.478091    False
3   5   5       0.478091    False
4   6   4       0.956183    False
5   7   3       1.511858    False
6  50   1       2.267787    False
