# Mahalanobis Distance

## Mahalanobis distance is the distance between a point and a distribution (or between two points) in a multivariate space, measured relative to the covariance of the data. It’s used in statistical analyses to find outliers that involve several variables.


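## Formula: $d(p, q) = \sqrt{(p - q)^{T} S^{-1} (p - q)}$, where $S$ is the covariance matrix of the data. When $S$ is the identity matrix this reduces to the Euclidean distance, e.g. in two dimensions $d(p, q) = \sqrt{(p_1 - q_1)^2 + (p_2 - q_2)^2}$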

In [1]:
import numpy as np
# NOTE(review): this binds the top-level scipy package to the name `stats`;
# `scipy.stats` may have been intended — confirm before relying on `stats`.
import scipy as stats
from scipy.stats import chi2

# Suppress all warnings raised after this point.
import warnings
warnings.filterwarnings("ignore") 

# yfinance is used to fetch data
import yfinance as yf
# NOTE(review): presumably routes pandas-datareader requests through
# yfinance — verify that the installed yfinance version still provides it.
yf.pdr_override()

In [2]:
# Ticker symbol of the stock to analyse.
symbol = 'AMD'


# Read data: no start/end dates are passed to the download call.
dataset = yf.download(symbol)

# Preview the first rows and the column names.
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-03-17,0.0,3.302083,3.125,3.145833,3.145833,219600
1980-03-18,0.0,3.125,2.9375,3.03125,3.03125,727200
1980-03-19,0.0,3.083333,3.020833,3.041667,3.041667,295200
1980-03-20,0.0,3.0625,3.010417,3.010417,3.010417,159600
1980-03-21,0.0,3.020833,2.90625,2.916667,2.916667,130800


In [3]:
# Preview the last rows of the downloaded data.
dataset.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-10-31,60.75,61.860001,59.529999,60.060001,60.060001,73274100
2022-11-01,61.490002,61.990002,59.43,59.66,59.66,83806700
2022-11-02,63.0,63.93,58.57,58.630001,58.630001,142669400
2022-11-03,58.110001,62.279999,58.029999,60.110001,60.110001,95279900
2022-11-04,62.5,63.0,60.540001,62.544998,62.544998,22303441


In [4]:
# Keep only the four price columns (Open, High, Low, Close) by removing
# the adjusted close and the traded volume.
unused_columns = ['Adj Close', 'Volume']
dataset = dataset.drop(columns=unused_columns)
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-03-17,0.0,3.302083,3.125,3.145833
1980-03-18,0.0,3.125,2.9375,3.03125
1980-03-19,0.0,3.083333,3.020833,3.041667
1980-03-20,0.0,3.0625,3.010417,3.010417
1980-03-21,0.0,3.020833,2.90625,2.916667


In [5]:
def mahalanobis_distance(x=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``x`` from ``data``.

    For every observation ``r`` in ``x`` this computes
    ``(r - mu) @ inv(S) @ (r - mu)``, where ``mu`` is the per-column mean of
    ``data`` and ``S`` is the covariance matrix. No square root is taken, so
    the values are *squared* distances.

    Parameters
    ----------
    x : array-like or DataFrame, shape (n_obs, n_features) or (n_features,)
        Observations to score. A single 1-D observation is treated as one row.
        When both ``x`` and ``data`` carry column labels, ``x`` is restricted
        to (and ordered by) the columns of ``data``.
    data : array-like or DataFrame, shape (n_samples, n_features)
        Reference sample that supplies the mean and, unless ``cov`` is given,
        the covariance matrix.
    cov : array-like, shape (n_features, n_features), optional
        Covariance matrix to use instead of the sample covariance of ``data``.

    Returns
    -------
    numpy.ndarray, shape (n_obs,)
        Squared Mahalanobis distance of each observation.

    Raises
    ------
    TypeError
        If ``x`` or ``data`` is ``None``.
    numpy.linalg.LinAlgError
        If the covariance matrix is singular.
    """
    if x is None or data is None:
        raise TypeError("mahalanobis_distance() requires both 'x' and 'data'")

    # Match columns by label, as pandas arithmetic would, before the
    # labels are lost in the conversion to plain arrays.
    data_columns = getattr(data, "columns", None)
    if data_columns is not None and hasattr(x, "columns"):
        x = x[list(data_columns)]

    reference = np.asarray(data, dtype=float)
    points = np.atleast_2d(np.asarray(x, dtype=float))

    # An explicit axis gives per-feature means for arrays and DataFrames alike.
    centered = points - reference.mean(axis=0)

    # `cov is None` rather than `not cov`: the truth value of an array is
    # ambiguous and raises ValueError.
    if cov is None:
        cov = np.cov(reference, rowvar=False)
    inv_covmat = np.linalg.inv(np.atleast_2d(np.asarray(cov, dtype=float)))

    left = centered @ inv_covmat
    # Row-wise dot product: equivalent to diag(left @ centered.T) without
    # materialising the full (n_obs, n_obs) matrix.
    return np.einsum("ij,ij->i", left, centered)

In [6]:
# Score every row of dataset against dataset itself. Despite its name, `df`
# holds the NumPy array returned by mahalanobis_distance, not a DataFrame.
df = mahalanobis_distance(x=dataset, data=dataset)
df

array([ 5.70606561,  4.88591699,  4.97654835, ..., 61.80660152,
       15.45657781,  8.63708589])

In [7]:
# Replace the current index with a default integer index; drop=True discards
# the old index instead of keeping it as a column.
dataset = dataset.reset_index(drop=True)

In [8]:
# Preview the first rows to check the result of the index reset.
dataset.head()

Unnamed: 0,Open,High,Low,Close
0,0.0,3.302083,3.125,3.145833
1,0.0,3.125,2.9375,3.03125
2,0.0,3.083333,3.020833,3.041667
3,0.0,3.0625,3.010417,3.010417
4,0.0,3.020833,2.90625,2.916667


In [9]:
# Restrict both the observations and the reference sample to the four price
# columns, so the call keeps working if this cell is re-run after
# 'mahalanobis' (or any other derived column) has been added to dataset.
price_columns = ['Open', 'High', 'Low', 'Close']
dataset['mahalanobis'] = mahalanobis_distance(x=dataset[price_columns], data=dataset[price_columns])
dataset.head()

Unnamed: 0,Open,High,Low,Close,mahalanobis
0,0.0,3.302083,3.125,3.145833,5.706066
1,0.0,3.125,2.9375,3.03125,4.885917
2,0.0,3.083333,3.020833,3.041667,4.976548
3,0.0,3.0625,3.010417,3.010417,4.96422
4,0.0,3.020833,2.90625,2.916667,4.756563


In [10]:
# Upper-tail p-value of each squared distance under a chi-square distribution
# with 4 degrees of freedom (one per price column: Open, High, Low, Close).
# chi2.sf is the survival function (1 - cdf) evaluated directly, so the very
# small p-values of strong outliers are not lost to rounding in `1 - cdf`.
dataset['p'] = chi2.sf(dataset['mahalanobis'], 4)
dataset.head()

Unnamed: 0,Open,High,Low,Close,mahalanobis,p
0,0.0,3.302083,3.125,3.145833,5.706066,0.222201
1,0.0,3.125,2.9375,3.03125,4.885917,0.299205
2,0.0,3.083333,3.020833,3.041667,4.976548,0.289712
3,0.0,3.0625,3.010417,3.010417,4.96422,0.290988
4,0.0,3.020833,2.90625,2.916667,4.756563,0.3132
