In [1]:
# Question: Multivariate Outlier Detection Using Mahalanobis Distance
# Description: Implement Mahalanobis distance to detect multivariate outliers in a dataset.



In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2

# Sample dataset: Replace with your dataset
data = {
    'Feature1': [10, 12, 13, 12, 11, 100, 12, 14, 15, 13],
    'Feature2': [20, 19, 21, 20, 18, 190, 19, 22, 23, 21]
}
df = pd.DataFrame(data)

# Step 1: Calculate mean vector and covariance matrix
mean_vec = df.mean().values
cov_matrix = np.cov(df.values, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Step 2: Compute Mahalanobis distance for each observation
def mahalanobis_distance(x, mean, inv_cov):
    diff = x - mean
    md = np.sqrt(np.dot(np.dot(diff.T, inv_cov), diff))
    return md

df['Mahalanobis_Dist'] = df.apply(lambda row: mahalanobis_distance(row.values, mean_vec, inv_cov_matrix), axis=1)

# Step 3: Determine threshold (e.g., 95% confidence interval)
threshold = np.sqrt(chi2.ppf(0.95, df.shape[1]))

# Step 4: Identify outliers
df['Outlier'] = df['Mahalanobis_Dist'] > threshold

print(df)

# Optionally: Show outliers only
print("\nOutliers:")
print(df[df['Outlier']])


   Feature1  Feature2  Mahalanobis_Dist  Outlier
0        10        20          2.526563    False
1        12        19          0.436924    False
2        13        21          0.385427    False
3        12        20          0.438257    False
4        11        18          0.444350    False
5       100       190          2.845165     True
6        12        19          0.436924    False
7        14        22          0.822600    False
8        15        23          1.332823    False
9        13        21          0.385427    False

Outliers:
   Feature1  Feature2  Mahalanobis_Dist  Outlier
5       100       190          2.845165     True
