# Notes
*  Skewed Distribution also makes for large data??

# **Documentation**

## Index - Level 1 API (algos)

### 1. Simple gaussian

The first evaluation metric is a simple univariate Gaussian distribution, <br/>
1.  Function: fit_gauss(): returns the mean and std (plus a default sensitivity of 2) of a normal distribution fitted to a particular column
2. is_gauss_anomaly(col, val, param): returns (flag, plot),
    * col - column name
    * val - value of the column for a row
    * param - dict containing the mean, std and sensitivity for that column


### 2. Isolation Forest

1. fit_isolation_forest(self, data): applies isolation forest to the data and returns the classifier
    * Isolation forest is applied for the entire data
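    * Isolation forest scores returned by `score_samples` (the call used in the code) lie in the range -1 to 0; `decision_function` shifts them to roughly -0.5 to +0.5. The greater the score, the less anomalous the row (opposite sign to the original paper)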
    

## Index - Level 2 API

### 1. Column level anomaly detection


1. compute_columnar_anomaly(self, data, anomalous_rows): Displays all the anomalous columns for each of the potentially anomalous rows
    * data - complete data (or partial data)
    * anomalous_rows - rows that the function needs to compute columnar anomaly

# CODE

In [0]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
import cv2
import tensorflow_datasets as tfds
import tensorflow as tf
from os.path import join
from gensim.models.keyedvectors import KeyedVectors
from tqdm.notebook import tqdm as tqdm_notebook
from scipy.stats import norm
import re

In [0]:
# Colab-only cell: mount Google Drive at /content/drive so the dataset can be
# read from it. force_remount=True is passed so the mount is redone on re-run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [0]:
# NY yellow taxi data (January 2019), read from
# <cwd>/drive/My Drive/datasets/table/ into the module-level DataFrame `data`.
data = pd.read_csv(join(os.getcwd(),'drive','My Drive', 'datasets', 'table','yellow_tripdata_2019-01.csv'))


In [0]:
def test(**kwargs):
    """Print whether a keyword argument named ``first`` was supplied.

    Args:
        **kwargs: Arbitrary keyword arguments; only their names are inspected.
    """
    # Membership on the dict itself already tests the keys.
    print('first' in kwargs)
# Scratch: no keyword named 'first' is passed, so this prints False.
test(second = 10)
# Scratch: the congestion_surcharge values that are not NaN.
data.congestion_surcharge[~np.isnan(data.congestion_surcharge)]

# Modules

In [0]:
def get_img_from_fig(fig, dpi=150):
    """Render a figure to an RGB image array.

    The figure is saved as a tightly cropped PNG into an in-memory buffer and
    then decoded with OpenCV.

    Args:
        fig: Figure object exposing ``savefig``.
        dpi: Resolution forwarded to ``savefig``.

    Returns:
        The decoded image, converted from OpenCV's BGR order to RGB.
    """
    with io.BytesIO() as png_buffer:
        fig.savefig(png_buffer, format="png", dpi=dpi, bbox_inches='tight')
        # getvalue() returns the whole buffer regardless of the stream position.
        encoded = np.frombuffer(png_buffer.getvalue(), dtype=np.uint8)
    bgr_image = cv2.imdecode(encoded, 1)
    return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
def print_nplots(plots):
    """Show a list of images stacked vertically in a single figure.

    Args:
        plots: Sequence of images accepted by ``imshow``.

    Returns:
        None. Nothing is drawn when ``plots`` is empty.
    """
    n = len(plots)
    if n == 0:
        # Avoid creating a degenerate figure of height 6 * 0.
        return
    fig = plt.figure(figsize=(9, 6 * n))
    for position, image in enumerate(plots, start=1):
        # Draw on the subplot's own axes rather than the implicit current axes.
        ax = fig.add_subplot(n, 1, position)
        ax.imshow(image)
        ax.axis('off')
    plt.show()

In [0]:

def plot_gauss(col='column name', mu=998.8, sigma=73.10, sensitivity=1, val=1120):
    """Draw a normal curve with its accepted band and one highlighted value.

    Args:
        col: Text used as the plot title.
        mu: Mean of the normal distribution.
        sigma: Standard deviation of the normal distribution.
        sensitivity: Half-width of the shaded band, in units of ``sigma``.
        val: Value marked with a red dot on the curve.

    Returns:
        The rendered figure as returned by ``get_img_from_fig``.
    """
    step = sigma / 100

    # Shaded band: mu +/- sensitivity * sigma.
    band_x = np.arange(mu - sigma * sensitivity, mu + sigma * sensitivity, step)

    # Plot window: max(sensitivity, 4) sigmas either side of the mean,
    # stretched on one side when val lies outside it.
    half_width = sigma * max(sensitivity, 4)
    left = mu - half_width
    right = mu + half_width
    if val > mu:
        right = max(right, val)
    else:
        left = min(left, val)
    curve_x = np.arange(left, right, step)

    fig, ax = plt.subplots(figsize=(9, 6))
    band_y = norm.pdf(band_x, mu, sigma)
    curve_y = norm.pdf(curve_x, mu, sigma)
    plt.style.use('fivethirtyeight')

    ax.plot(curve_x, curve_y)
    ax.fill_between(band_x, band_y, 0, alpha=0.3, color='g')
    ax.fill_between(curve_x, curve_y, 0, alpha=0.1)
    ax.scatter(
        val,
        norm.pdf(val, mu, sigma),
        s=100,
        c='r',
        marker='o',
        label='Position of Anomaly',
    )
    ax.set_xlabel('Gaussian distribution')
    ax.set_yticklabels([])
    ax.set_title(col)
    plt.legend()

    # Close the figure before rendering it to an image so that it is not
    # also shown by a later plt.show().
    plt.close(fig)
    return get_img_from_fig(fig)

# l = []
# l.append(plot_gauss())
# l.append(plot_gauss(mu=10,sigma=2,val = 3))
# # plot_gauss(mu = 100.0,sigma = 2.0, val=110)
# # plot_gauss()
# plt.figure(figsize=(10,15))
# plt.subplot(211)
# plt.axis('off')
# plt.imshow(l[0])
# plt.subplot(212)
# plt.axis('off')
# plt.imshow(l[1])
# plt.show()

In [0]:
# Scratch: two 3-element lists combined row-wise into a 2 x 3 array.
x = [-2.1, -1, 4.3]
y = [3, 1.1, 0.12]
X = np.array([x, y])
X.shape

In [0]:
class ColumnMeta:
    """Metadata record for one column.

    Instances carry ``name`` and ``dtype`` (taken from the keyword arguments
    of the same names) and ``gauss_param``, a dict that starts out empty.
    """

    def __init__(self, **kwarg):
        """Store the required ``name`` and ``dtype`` keyword arguments.

        Raises:
            KeyError: If ``name`` or ``dtype`` is not supplied.
        """
        self.name, self.dtype = kwarg['name'], kwarg['dtype']
        self.gauss_param = {}

    def summary(self):
        """Print the column name, data type and Gaussian parameters."""
        print("__________________________")
        print("Column parameters: ")
        rows = (
            ("Column name  :", self.name),
            ("Data type    :", self.dtype),
            ("Gaussian para:", self.gauss_param),
        )
        for label, value in rows:
            print(label, value)


class NumericOutlier:
    """Row- and column-level outlier detection on the numeric columns of a DataFrame.

    On construction the class fits, for every numeric column whose name does
    not contain ``'ID'``, a univariate Gaussian; it also fits an isolation
    forest and a multivariate Gaussian over those columns together.
    ``Query`` then ranks rows by a weighted blend of the two row-level scores
    and reports, for the lowest-scoring rows, which columns fall outside
    their Gaussian band.
    """

    def __init__(self, data, **parameters):
        """Fit all models on ``data``.

        Args:
            data: DataFrame to learn from.
            **parameters: Optional settings. Only ``n_percent`` is read: the
                fraction of rows (0..1) that ``row_filtering`` reports.
        """
        self.columns = dict()
        self.numeric_list = []
        self.row_level_ensemble_weight = {
            'isolation_forest': 0.5,
            'multivariant_gauss': 0.5,
        }
        # Fraction of rows reported as anomalous; the default 0.0001 is
        # 0.01% of the rows.
        self.n_percent = parameters.get('n_percent', 0.0001)
        # parameters for multivariant gauss distribution
        self.Multivariant_Gauss_param = dict()
        for c in data.columns:
            self.columns[c] = ColumnMeta(name=c, dtype=data[c].dtype)
            # Numeric columns only; names containing 'ID' are skipped.
            if np.issubdtype(self.columns[c].dtype, np.number) and 'ID' not in c:
                col = np.array(data[c])
                self.columns[c].gauss_param = self.fit_gauss(col[~np.isnan(col)])
                self.numeric_list.append(c)

        print("Running Isolation forest......", end='')
        self.isolation_forest_clf = self.fit_isolation_forest(data[self.numeric_list].dropna())
        print("completed!")

        print("Running Mutivariant Gaussian distribution.....", end='')
        self.Multivariant_Gauss_param['d'] = len(self.numeric_list)
        self.Multivariant_Gauss_param['mean'] = data[self.numeric_list].mean().to_numpy()
        self.Multivariant_Gauss_param['cov'] = data[self.numeric_list].cov().to_numpy()
        print("completed!")

    def summary(self, columns):
        """Print the stored metadata of every numeric column in ``columns``."""
        for c in columns:
            if np.issubdtype(self.columns[c].dtype, np.number):
                self.columns[c].summary()

    def fit_gauss(self, data):
        """Fit a normal distribution to ``data``.

        Returns:
            Dict with keys ``mean``, ``std`` and ``sensitivity`` (always 2).
        """
        mean, std = norm.fit(data)
        return {"mean": mean, "std": std, 'sensitivity': 2}

    def is_gauss_anomaly(self, col, val, param):
        """Check whether ``val`` lies outside ``mean +/- sensitivity * std``.

        Args:
            col: Column name, used as the plot title.
            val: Value to test.
            param: Dict with keys ``mean``, ``std`` and ``sensitivity``.

        Returns:
            ``(True, image)`` with the ``plot_gauss`` rendering when the value
            is outside the band, otherwise ``(False, 0)``.
        """
        if abs(val - param['mean']) > param['sensitivity'] * param['std']:
            return True, plot_gauss(col, param['mean'], param['std'], param['sensitivity'], val)
        return False, 0

    def multivariate_normal(self, x):
        """pdf of the multivariate normal distribution.

        Evaluated at ``x`` using the ``d``, ``mean`` and ``cov`` entries of
        ``self.Multivariant_Gauss_param``.
        """
        d = self.Multivariant_Gauss_param['d']
        mean = self.Multivariant_Gauss_param['mean']
        covariance = self.Multivariant_Gauss_param['cov']
        x_m = x - mean
        return (1. / (np.sqrt((2 * np.pi)**d * np.linalg.det(covariance))) *
                np.exp(-(np.linalg.solve(covariance, x_m).T.dot(x_m)) / 2))

    def fit_isolation_forest(self, data):
        """Fit an isolation forest on the NaN-free rows of ``data`` and return it."""
        from sklearn.ensemble import IsolationForest
        clf = IsolationForest(random_state=0).fit(data.dropna())
        return clf

    def _net_anomaly_score(self, anomaly_score, n_rows):
        """Blend the per-model scores into one weighted, min-max scaled score.

        Args:
            anomaly_score: Dict mapping each key of
                ``self.row_level_ensemble_weight`` to an array of ``n_rows``
                scores.
            n_rows: Number of scored rows.

        Returns:
            float32 array of length ``n_rows``.
        """
        net_anomaly = np.zeros(n_rows, np.float32)
        for key, weight in self.row_level_ensemble_weight.items():
            score = anomaly_score[key]
            low, high = score.min(), score.max()
            if high == low:
                # A constant score cannot rank rows; skipping it avoids the
                # 0/0 that would turn the whole blend into NaN.
                continue
            net_anomaly += (score - low) / (high - low) * weight
        return net_anomaly

    def row_filtering(self, data):
        """Return the index labels of the lowest-scoring rows of ``data``.

        Rows with NaN in any numeric column are dropped before scoring. The
        lowest ``n_percent`` fraction of the blended score is returned, as
        labels of ``data.index`` so they can be used with ``data.loc``.
        """
        numeric_rows = data[self.numeric_list].dropna()
        query_data = numeric_rows.to_numpy()
        anomaly_score = dict()
        print("computing isolation forest scores....", end='')
        anomaly_score['isolation_forest'] = self.isolation_forest_clf.score_samples(query_data)
        print('completed!')

        print("computing multivariant gaussian scores..")
        probs = [self.multivariate_normal(x) for x in tqdm_notebook(query_data, desc='rows:')]
        anomaly_score['multivariant_gauss'] = np.array(probs)

        net_anomaly = self._net_anomaly_score(anomaly_score, query_data.shape[0])
        n_selected = int(self.n_percent * net_anomaly.shape[0])
        positions = np.argsort(net_anomaly)[:n_selected]
        # Positions refer to the NaN-dropped array; translate them back to
        # labels of the caller's index before handing them to .loc users.
        return numeric_rows.index.to_numpy()[positions]

    def Query(self, data):
        """Run the full pipeline: row filtering, then per-column checks.

        Returns:
            ``(anomaly_list, row_col)`` as produced by ``row_filtering`` and
            ``compute_columnar_anomaly``.
        """
        anomaly_list = self.row_filtering(data)
        row_col = self.compute_columnar_anomaly(data, anomaly_list)
        return anomaly_list, row_col

    def compute_columnar_anomaly(self, data, anomalous_rows):
        """Report, for each given row, the columns outside their Gaussian band.

        Args:
            data: DataFrame containing the rows.
            anomalous_rows: Index labels of the rows to inspect.

        Returns:
            List of ``(row, column)`` tuples, one per anomalous column.
        """
        row_cols_pairs = []
        for row in anomalous_rows:
            anomalous_cols = []
            gauss_plots = []
            for c in self.numeric_list:
                flagged, plot = self.is_gauss_anomaly(c, data.loc[row, c], self.columns[c].gauss_param)
                if flagged:
                    anomalous_cols.append(c)
                    gauss_plots.append(plot)
            if not anomalous_cols:
                print("No Anomaly found at Row: ", row)
            else:
                print("Anomaly found at following columns for row: ", row)
                # Record the columns that were actually flagged, not the
                # last column the inner loop happened to visit.
                row_cols_pairs.extend((row, c) for c in anomalous_cols)
                print_nplots(gauss_plots)
        return row_cols_pairs


### scrap space

TODO
* probability density function to figure out first n% errors 

In [0]:
# Scratch: draw 30 random rows and reduce them to a NaN-free NumPy array of
# the detector's numeric columns.
# NOTE(review): `detect` is only created further down (TEST Modules section);
# presumably the notebook cells were run out of file order — confirm.
test_query = data.sample(n=30).reset_index(drop=True)
query_data = pd.DataFrame(test_query, columns = data.columns)
query_data = query_data[detect.numeric_list].dropna().to_numpy()

In [0]:
# Scratch: isolation-forest score for every row of query_data.
is_score = detect.isolation_forest_clf.score_samples(query_data)

In [0]:
# Scratch: fraction of rows whose isolation-forest score is below -0.5.
(is_score[is_score<-0.5].shape[0])/is_score.shape[0]

In [0]:
# Scratch: multivariate-Gaussian density of every row of query_data,
# collected one row at a time behind a progress bar.
probs = []
for x in tqdm_notebook(query_data):
    probs.append(detect.multivariate_normal(x))

In [0]:
import seaborn as sns
probs = np.array(probs)
# Density curve of the min-max rescaled multivariate-Gaussian scores.
# kdeplot replaces the deprecated distplot(hist=False, rug=False).
sns.kdeplot((probs-probs.min())/(probs.max()-probs.min()))

In [0]:
import seaborn as sns
# Density curve of the min-max rescaled isolation-forest scores.
# kdeplot replaces the deprecated distplot(hist=False, rug=False).
sns.kdeplot((is_score-is_score.min())/(is_score.max()-is_score.min()))

In [0]:
# Scratch: min-max rescale both scores to [0, 1], average them with equal
# weight, and keep the positions of the lowest combined scores.
r_probs = (probs-probs.min())/(probs.max()-probs.min())
r_isfor = (is_score-is_score.min())/(is_score.max()-is_score.min())
final_score = r_probs*0.5+r_isfor*0.5
# int(0.00001 * n) is 0 for fewer than 100000 rows, so rows_list is empty
# for small samples.
rows_list = np.argsort(final_score)[:int(0.00001*final_score.shape[0])]
rows_list

In [0]:
# Scratch: argsort yields the positions that would sort the array ascending.
test_ar = np.array((11, 13, 41, 10, 2, 14, 18, 20))
test_ar.argsort()

In [0]:
import seaborn as sns
# Density curve of the raw multivariate-Gaussian scores.
# kdeplot replaces the deprecated distplot(hist=False, rug=False).
sns.kdeplot(probs);

# TEST Modules

In [0]:
# Fit the detector on the full DataFrame: per-column Gaussians, the isolation
# forest and the multivariate Gaussian are all fitted in the constructor.
detect = NumericOutlier(data)

In [0]:
# Print the stored metadata of every numeric column (non-numeric ones are skipped).
detect.summary(data.columns)

In [0]:
# Preview the first rows of the loaded data.
data.head()

In [0]:
# Preview the first rows of a 10-row random sample, re-indexed from 0.
data.sample(n=10).reset_index(drop=True).head()

In [0]:
# Two hand-written rows that differ only in their 11th field (400.0 vs 800.0).
test_query = []
test_query.append([1,'2019-01-01 00:46:40', '2019-01-01 00:53:20', 3, 6.5,1,'N',141,154,1,400.0,0.5,0.5,1.65,0.0,0.3,9.95,0])
test_query.append([1,'2019-01-01 00:46:40', '2019-01-01 00:53:20', 3, 6.5,1,'N',141,154,1,800.0,0.5,0.5,1.65,0.0,0.3,9.95,0])
# NOTE(review): the hand-written rows above are discarded here, because
# test_query is rebound to a 10-row random sample — confirm this is intended.
test_query = data.sample(n=10).reset_index(drop=True)
query_data = pd.DataFrame(test_query, columns = data.columns)
# NOTE(review): the pipeline is run on the full `data`, not on the
# `query_data` built just above — confirm which one was meant.
a_rows, a_row_cols = detect.Query(data)

# Colorize table

## Manage Sensitivity

In [0]:
#@title Manage Sensitivity of columns
standard_deviation = 1 #@param {type:"slider", min:0, max:4, step:0.1}
column = 'fare_amount' #@param ["VendorID", "passenger_count", "trip_distance", "RatecodeID", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "congestion_surcharge"] {type:"string"}
# Override how many standard deviations from the mean the chosen column may
# deviate before it is flagged.
detect.columns[column].gauss_param['sensitivity'] = standard_deviation
# Re-run the full pipeline on query_data with the updated sensitivity.
detect.Query(query_data)