In [1]:
# Question: Multivariate Outlier Detection Using Mahalanobis Distance
# Description: Implement Mahalanobis distance to detect multivariate outliers in a dataset.



In [2]:
import numpy as np
import pandas as pd
from scipy.stats import chi2

# Sample data (multivariate)
data = pd.DataFrame({
    'X1': [10, 12, 14, 15, 10, 9, 12, 11, 50],  # 50 is a potential outlier
    'X2': [20, 22, 19, 21, 23, 22, 21, 20, 5]    # 5 is a potential outlier
})

# Calculate Mahalanobis Distance for each point
def mahalanobis_distance(x, data):
    mean_vec = np.mean(data, axis=0)
    cov_mat = np.cov(data, rowvar=False)
    inv_covmat = np.linalg.inv(cov_mat)
    diff = x - mean_vec
    md = np.sqrt(np.dot(np.dot(diff, inv_covmat), diff.T))
    return md

# Apply function to all rows
data_values = data.values
md_values = np.array([mahalanobis_distance(x, data_values) for x in data_values])

data['Mahalanobis_Distance'] = md_values

# Determine cutoff threshold from Chi-square distribution (df = number of features)
threshold = chi2.ppf((1 - 0.01), df=data.shape[1])  # 99% confidence level

data['Outlier'] = data['Mahalanobis_Distance'] > np.sqrt(threshold)

print(data)


   X1  X2  Mahalanobis_Distance  Outlier
0  10  20              1.516564    False
1  12  22              1.063640    False
2  14  19              0.887689    False
3  15  21              1.232332    False
4  10  23              1.256400    False
5   9  22              0.536149    False
6  12  21              0.334941    False
7  11  20              1.149625    False
8  50   5              2.638608    False
