SML Assignment-2
Sachin Sharma
2021559

Q1
Detect and count outliers using Mahalanobis distance and Local Outlier Factor (LOF) scores, with the cut-off for each score chosen by Otsu thresholding.

In [474]:
import numpy as np 
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [475]:
def mahalanobis_dist(X):
    """Return the Mahalanobis distance of every row of ``X`` from the sample mean.

    The mean and the (unbiased, ``ddof=1``) covariance matrix are estimated
    from ``X`` itself, with rows treated as observations and columns as
    features.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data matrix.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        ``sqrt((x - mean)^T  C^-1  (x - mean))`` for every row ``x``.

    Notes
    -----
    If the covariance matrix is exactly singular (for example when two
    features are perfectly collinear) ``np.linalg.inv`` raises
    ``LinAlgError``; in that case the Moore-Penrose pseudo-inverse is used
    instead, so the distance is measured inside the subspace actually spanned
    by the data.
    """
    X = np.asarray(X, dtype=np.float64)
    centered = X - X.mean(axis=0)

    # np.cov collapses to a 0-d array when there is a single feature;
    # promote it so the matrix inverse below always receives a 2-D matrix.
    cov_matrix = np.atleast_2d(np.cov(X, rowvar=False))

    try:
        inv_cov_matrix = np.linalg.inv(cov_matrix)
    except np.linalg.LinAlgError:
        inv_cov_matrix = np.linalg.pinv(cov_matrix)

    # Row-wise quadratic form d_i^2 = c_i^T C^-1 c_i without a Python loop.
    squared = np.sum((centered @ inv_cov_matrix) * centered, axis=1)

    # Floating-point round-off can push a zero distance slightly negative,
    # which would turn into NaN under the square root.
    return np.sqrt(np.maximum(squared, 0.0))

In [476]:
def LOF(X, k=3):
    """Compute the Local Outlier Factor (LOF) score of every row of ``X``.

    For each sample ``p`` with k nearest neighbours ``N_k(p)``:

    * ``reach_dist(p, o) = max(k_distance(o), d(p, o))``
    * ``lrd(p) = k / sum(reach_dist(p, o) for o in N_k(p))``
    * ``LOF(p) = sum(lrd(o) for o in N_k(p)) / (k * lrd(p))``

    Scores close to 1 mean the sample is about as dense as its neighbours;
    larger scores mean it lies in a sparser region than they do.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix; rows are samples.
    k : int, default=3
        Number of neighbours used for the density estimate.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        LOF score per sample.

    Notes
    -----
    If a sample coincides with all of its k neighbours, the reachability sum
    is zero and the corresponding density (and possibly the score) is
    ``inf``/``nan``; no smoothing term is added here.
    """
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(X)

    # Calling kneighbors() without a query makes scikit-learn return the
    # neighbours of each fitted sample while excluding the sample itself.
    # Asking for k+1 neighbours of X and dropping column 0 is not equivalent
    # when X contains duplicate rows: a duplicate may occupy column 0, which
    # would leave the sample listed as its own neighbour.
    distances, neighbours = knn.kneighbors()

    # Neighbours are taken to be ordered by increasing distance (as the
    # original slicing also assumed), so the last column is the k-distance.
    k_distances = distances[:, -1]

    # reach_dist(i, j) = max(k_distance(neighbour j of i), d(i, neighbour j))
    reach_distances = np.maximum(k_distances[neighbours], distances)

    # Local reachability density: inverse of the mean reachability distance.
    lrd = k / reach_distances.sum(axis=1)

    # Ratio of the neighbours' mean density to the sample's own density.
    lof = lrd[neighbours].sum(axis=1) / (k * lrd)

    return lof

In [477]:
def otsu_thresholding(scores, n_steps=100):
    """Pick a score threshold with Otsu's method (minimum intra-class variance).

    ``n_steps`` evenly spaced candidate thresholds are tried, starting at
    ``min(scores)`` and stopping one step short of ``max(scores)``. For each
    candidate ``t`` the scores are split into inliers (``score < t``) and
    outliers (``score >= t``) and the weighted sum of the two class variances
    is computed; the first candidate with the smallest value wins.

    Parameters
    ----------
    scores : array-like of float
        One-dimensional collection of outlier scores.
    n_steps : int, default=100
        Number of candidate thresholds to evaluate.

    Returns
    -------
    float
        The selected threshold. When the scores cannot be split into two
        non-empty classes (e.g. all scores are equal) the maximum score is
        returned, so that no score is strictly greater than the threshold.

    Raises
    ------
    ValueError
        If ``scores`` is empty or ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be a positive integer")

    scores = np.asarray(scores, dtype=np.float64).ravel()
    n = scores.size
    if n == 0:
        raise ValueError("scores must contain at least one value")

    # Loop invariants: the score range does not change between candidates.
    lowest, highest = scores.min(), scores.max()
    step = (highest - lowest) / n_steps

    min_intra_class_var = np.inf
    min_t = highest

    for i in range(n_steps):
        t = step * i + lowest

        inlier_mask = scores < t
        n_inliers = np.count_nonzero(inlier_mask)

        # A candidate that leaves one class empty is not a valid split; the
        # variance of an empty class is undefined (np.var would warn and
        # return NaN), so skip it. This always happens for t == min(scores).
        if n_inliers == 0 or n_inliers == n:
            continue

        inliers = scores[inlier_mask]
        outliers = scores[~inlier_mask]

        prob_inliers = n_inliers / n
        prob_outliers = (n - n_inliers) / n

        intra_class_var = (prob_inliers * np.var(inliers)) + (prob_outliers * np.var(outliers))

        # Strict comparison keeps the lowest threshold among ties.
        if intra_class_var < min_intra_class_var:
            min_intra_class_var = intra_class_var
            min_t = t

    return float(min_t)


In [478]:
def get_outliers(X, scores, t):
    """Collect the rows of ``X`` whose score is strictly greater than ``t``.

    Parameters
    ----------
    X : indexable sequence
        Samples; ``X[i]`` is the sample that ``scores[i]`` belongs to.
    scores : iterable of float
        One score per sample.
    t : float
        Threshold. A score exactly equal to ``t`` is not reported.

    Returns
    -------
    tuple (int, list)
        The number of flagged samples and the list of flagged samples, in
        their original order.
    """
    flagged = [X[pos] for pos, score in enumerate(scores) if score > t]
    return len(flagged), flagged

In [479]:
# The file carries an .xls extension but is read here as CSV text.
dataset = pd.read_csv(r'glass.xls')

# Class labels: the 'Type' column, kept two-dimensional (one column).
y = dataset.loc[:, ['Type']].to_numpy()
# Feature matrix: every remaining column.
X = dataset.drop(columns=['Type']).to_numpy()

In [480]:
# Local Outlier Factor score of every sample, using LOF's default k=3 neighbours.
lof_scores = LOF(X)

In [481]:
# Mahalanobis distance of every sample from the mean of the feature matrix.
m_dist = mahalanobis_dist(X)

In [482]:
# Choose the LOF cut-off with Otsu's method, then gather every sample scoring above it.
opt_t_lof = otsu_thresholding(lof_scores)
n_outliers_lof, outliers_lof = get_outliers(X, lof_scores, opt_t_lof)

# Report the count first, then one flagged sample per line.
print(f'No of outliers detected using LOF: {n_outliers_lof}')
print(f'Outlier points are:\n')

for outlier_row in outliers_lof:
    print(outlier_row)

No of outliers detected using LOF: 9
Outlier points are:

[ 1.51409 14.25     3.09     2.08    72.28     1.1      7.08     0.
  0.     ]
[ 1.52725 13.8      3.15     0.66    70.57     0.08    11.64     0.
  0.     ]
[ 1.5241 13.83    2.9     1.17   71.15    0.08   10.79    0.      0.    ]
[ 1.51299 14.4      1.74     1.54    74.55     0.       7.59     0.
  0.     ]
[ 1.51115 17.38     0.       0.34    75.41     0.       6.65     0.
  0.     ]
[ 1.51131 13.69     3.2      1.81    72.81     1.76     5.43     1.19
  0.     ]
[ 1.52365 15.79     1.83     1.31    70.43     0.31     8.61     1.68
  0.     ]
[ 1.51653 11.95     0.       1.19    75.18     2.7      8.93     0.
  0.     ]
[ 1.51831 14.39     0.       1.82    72.86     1.41     6.47     2.88
  0.     ]


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [483]:
# Choose the Mahalanobis-distance cut-off with Otsu's method, then gather every sample above it.
opt_t_mh = otsu_thresholding(m_dist)
n_outliers_mh, outliers_mh = get_outliers(X, m_dist, opt_t_mh)

# Report the count first, then one flagged sample per line.
print(f'No of outliers detected using Mahalanobis Distance: {n_outliers_mh}')
print(f'Outlier points are:\n')

for outlier_row in outliers_mh:
    print(outlier_row)

No of outliers detected using Mahalanobis Distance: 29
Outlier points are:

[1.52667e+00 1.39900e+01 3.70000e+00 7.10000e-01 7.15700e+01 2.00000e-02
 9.82000e+00 0.00000e+00 1.00000e-01]
[ 1.51215 12.99     3.47     1.12    72.98     0.62     8.35     0.
  0.31   ]
[ 1.52475 11.45     0.       1.88    72.19     0.81    13.24     0.
  0.34   ]
[ 1.53125 10.73     0.       2.1     69.81     0.58    13.3      3.15
  0.28   ]
[ 1.53393 12.3      0.       1.      70.16     0.12    16.19     0.
  0.24   ]
[ 1.51818 13.72     0.       0.56    74.45     0.      10.99     0.
  0.     ]
[ 1.52664 11.23     0.       0.77    73.21     0.      14.68     0.
  0.     ]
[ 1.52739 11.02     0.       0.75    73.08     0.      14.96     0.
  0.     ]
[1.52777e+00 1.26400e+01 0.00000e+00 6.70000e-01 7.20200e+01 6.00000e-02
 1.44000e+01 0.00000e+00 0.00000e+00]
[ 1.52177 13.2      3.68     1.15    72.75     0.54     8.52     0.
  0.     ]
[ 1.51643 12.16     3.52     1.35    72.89     0.57     8.53     0.
