# Outlier Detection

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from collections import defaultdict

#scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
#kmeans, dbscan, hierarchical (sklearn)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
#evaluation
from sklearn.metrics import silhouette_score

#distance matrix (dbscan elbow, hierarchical)
from scipy.spatial.distance import pdist, squareform
# hierarchical (scipy)
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.preprocessing import StandardScaler # To transform the dataset
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import kmeans_plusplus
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.cm as cm

In [None]:
# Load the train/test splits, the feature names and the training subject ids.
# Every file is whitespace-separated and has no header row.
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
X_train = pd.read_csv("X_train.txt", header=None, sep=r"\s+")
y_train = pd.read_csv("y_train.txt", header=None, sep=r"\s+")
X_test = pd.read_csv("X_test.txt", header=None, sep=r"\s+")
y_test = pd.read_csv("y_test.txt", header=None, sep=r"\s+")
features = pd.read_csv("features.txt", header=None, sep=r"\s+")
subject = pd.read_csv("subject_train.txt", header=None, sep=r"\s+")

In [None]:
# Aliases used by the cells below.
# NOTE: these are aliases, not copies, so the in-place drop also changes `features`.
feature, subjects = features, subject
# Discard the column labelled 0 of the feature table.
feature.drop(columns=0, inplace=True)

In [None]:
# Use the feature names as column headers of both splits.
# The names are taken from the first column of `feature` in one step, so the
# number of features is no longer hard-coded (the original looped over 561).
lista = feature.iloc[:, 0].tolist()
X_test.columns = lista
X_train.columns = lista

In [None]:
# Drop the columns holding the mad() estimate, since it is almost equal to
# the standard deviation.
stringa = "mad()"
for frame in (X_train, X_test):
    mad_columns = [name for name in frame.columns if stringa in name]
    frame.drop(columns=mad_columns, inplace=True)

## LOF

In [None]:
from sklearn.neighbors import LocalOutlierFactor
from numpy import quantile, random, where

In [None]:
# Fit LOF with 25 neighbours and label every training row.
lof = LocalOutlierFactor(n_neighbors=25)
outliers = lof.fit_predict(X_train)
# Positions of the rows labelled -1, kept as the tuple returned by numpy.where.
lofs_index = np.where(outliers == -1)

In [None]:
# Count how many rows received each label.
np.unique(outliers, return_counts=True)

In [None]:
# LOF score of every training row, read from the fitted model.
outliers_score = lof.negative_outlier_factor_
outliers_score

In [None]:
# Positions of the rows currently labelled -1.
outlier4 = np.where(outliers==-1)

In [None]:
# Results table: it collects, for each method, the score and the label of
# every training row. It starts with the LOF columns.
X = pd.DataFrame({"LOF_score": outliers_score, "LOF_label": outliers})

In [None]:
# LOF scores of the rows labelled -1.
lof.negative_outlier_factor_[np.where(outliers==-1)]

In [None]:
# Largest LOF score among the rows labelled -1; used as the cut-off line in
# the histogram.
lof_scores_of_outliers = lof.negative_outlier_factor_[outliers == -1]
max_val = lof_scores_of_outliers.max()
max_val

In [None]:
# Smallest LOF score among the rows labelled -1.
lof.negative_outlier_factor_[outliers == -1].min()

In [None]:
# Histogram of all LOF scores; the red line marks the largest score that was
# still labelled as an outlier.
plt.hist(lof.negative_outlier_factor_, bins=10)
plt.axvline(x=max_val, color='r')
plt.text(x=max_val, y=250, s='outliers')
plt.show()

## Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from numpy import quantile, random, where

In [None]:
# Presumed share of outliers in the data.
# NOTE(review): this value is not passed to any detector in the visible cells.
contamination = 0.01

In [None]:
%%time
# Build an Isolation Forest with 100 trees and a fixed seed, then fit it on
# the training set. No contamination argument is given here.
IF = IsolationForest(n_estimators = 100, random_state=42)
IF.fit(X_train)

In [None]:
# Isolation Forest labels for the training rows and the count per label.
# This overwrites the LOF labels previously held in `outliers`.
outliers = IF.predict(X_train)
np.unique(outliers, return_counts=True)

In [None]:
# Isolation Forest output for every training row.
my_array=IF.decision_function(X_train) # score
my_array2=IF.predict(X_train) # label

In [None]:
# Add the Isolation Forest score and label of every row to the results table.
X["IF_score"] = my_array
X["IF_label"] = my_array2

In [None]:
# Rows flagged by Isolation Forest (label -1) and their index values, in case
# the anomalies or the matching data rows need to be printed.
anomaly = X[X['IF_label'] == -1]
anomaly_index = anomaly.index.tolist()

In [None]:
# Training rows flagged by Isolation Forest.
X_train.iloc[anomaly_index]

## ABOD

In [None]:
#!pip install pyod
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
#from pyod.models.auto_encoder import AutoEncoder

In [None]:
# Fit the pyod ABOD detector with 17 neighbours on the training set.
abod = ABOD(n_neighbors=17)
abod.fit(X_train)

In [None]:
# ABOD score of every training row, read from the fitted model.
abod_score= abod.decision_scores_

In [None]:
# ABOD labels for the training rows and the count per label.
# This overwrites the previous content of `outliers`.
outliers = abod.predict(X_train)
np.unique(outliers, return_counts=True)

In [None]:
# Add the ABOD score and label of every row to the results table.
X["ABOD_score"] = abod_score
X["ABOD_label"] = outliers

In [None]:
# Histogram of the ABOD scores; the black line marks the lowest score among
# the rows predicted with label 1.
abod_threshold = abod.decision_scores_[outliers == 1].min()
plt.hist(abod.decision_scores_, bins=20)
plt.axvline(x=abod_threshold, color='k')
plt.show()

## KNN

## Find best K

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Test-set error rate of a k-NN classifier for every k in 1..19.
k_values = range(1, 20)
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train.values.ravel())
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != np.ravel(y_test)))

plt.figure(figsize=(10,6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

# error_rate[0] belongs to k=1, so the list position has to be shifted by one
# to report the actual K (the original printed the raw list position).
min_error = min(error_rate)
best_k = error_rate.index(min_error) + 1
print("Minimum error:-", min_error, "at K =", best_k)

In [None]:
# Fit the pyod KNN outlier detector (17 neighbours, Euclidean metric) on the
# training set.
clf = KNN(n_neighbors=17, metric='euclidean')
clf.fit(X_train)

In [None]:
# KNN detector labels for the training rows and the count per label.
# This overwrites the previous content of `outliers`.
outliers = clf.predict(X_train)
np.unique(outliers, return_counts=True)

In [None]:
# KNN detector score of every training row.
anomlay_score = clf.decision_function(X_train)

In [None]:
# Add the KNN score and label of every row to the results table.
X["KNN_score"]= anomlay_score
X["KNN_label"] = outliers

In [None]:
# Histogram of the KNN detector scores; the black line marks the lowest score
# among the rows predicted with label 1.
knn_threshold = clf.decision_scores_[outliers == 1].min()
plt.hist(clf.decision_scores_, bins=20)
plt.axvline(x=knn_threshold, color='k')
plt.show()

## Dataset completo degli outliers


In [None]:
# Use a single labelling convention everywhere: -1 = outlier, 1 = inlier.
# The ABOD and KNN columns hold 1/0 labels, so they are remapped here.
classe = {1: -1, 0: 1}
for label_column in ("ABOD_label", "KNN_label"):
    X[label_column] = X[label_column].map(classe)

In [None]:
# Show the complete results table.
X

## TOP 10 outliers KNN vs Rest

In [None]:
# KNN scores of all rows, largest first.
a = sorted(X["KNN_score"].tolist(), reverse=True)

In [None]:
# The first 10 entries of the sorted list, i.e. the 10 highest KNN scores.
b = a[:10]
k = len(b)  # number of scores kept

In [None]:
# Row positions whose KNN score equals one of the selected scores in `b`.
# A vectorised comparison replaces the row-by-row scan with X.iloc: it is far
# faster and lists each row once even when the same score appears twice in `b`.
knn_scores = X["KNN_score"].to_numpy()
knn_is_top = (knn_scores[:, None] == np.asarray(b, dtype=float)).any(axis=1)
indx1 = np.where(knn_is_top)[0].tolist()
indx1

In [None]:
# Selected rows ordered by KNN score, highest first.
X.iloc[indx1].sort_values(by=['KNN_score'],  ascending=False).round(6)    # the higher the score, the more of an outlier the row is

## TOP 10 outliers ABOD vs Rest

In [None]:
# ABOD scores of all rows, largest first.
a = sorted(X["ABOD_score"].tolist(), reverse=True)

In [None]:
# The first 10 entries of the sorted list, i.e. the 10 highest ABOD scores.
b = a[:10]
k = len(b)  # number of scores kept

In [None]:
# Row positions whose ABOD score equals one of the selected scores in `b`.
# A vectorised comparison replaces the row-by-row scan with X.iloc: it is far
# faster and lists each row once even when the same score appears twice in `b`.
abod_scores = X["ABOD_score"].to_numpy()
abod_is_top = (abod_scores[:, None] == np.asarray(b, dtype=float)).any(axis=1)
indx2 = np.where(abod_is_top)[0].tolist()
indx2

In [None]:
# Selected rows ordered by ABOD score, highest first.
# NOTE(review): the original note read "the smaller the score, the more of an
# outlier the row is", yet the rows are ranked from the highest score down;
# presumably higher means more abnormal here - confirm against the pyod docs.
X.iloc[indx2].sort_values(by=['ABOD_score'],  ascending=False).round(6)

## TOP 10 outliers IF_score vs Rest

In [None]:
# Isolation Forest scores of all rows, smallest first.
a = sorted(X["IF_score"].tolist())

In [None]:
# The first 10 entries of the sorted list, i.e. the 10 lowest Isolation
# Forest scores.
b = a[:10]
k = len(b)  # number of scores kept

In [None]:
# Row positions whose Isolation Forest score equals one of the selected
# scores in `b`. A vectorised comparison replaces the row-by-row scan with
# X.iloc: it is far faster and lists each row once even when the same score
# appears twice in `b`.
if_scores = X["IF_score"].to_numpy()
if_is_top = (if_scores[:, None] == np.asarray(b, dtype=float)).any(axis=1)
indx3 = np.where(if_is_top)[0].tolist()
indx3

In [None]:
# Selected rows ordered by Isolation Forest score, lowest first.
X.iloc[indx3].sort_values(by=['IF_score'],  ascending=True).round(6)    # the smaller the score, the more of an outlier the row is

## TOP 10 outliers LOF vs Rest

In [None]:
# LOF scores of all rows, smallest first.
a = sorted(X["LOF_score"].tolist())

In [None]:
# Keep as many of the lowest LOF scores as there are rows labelled -1 by LOF.
# The original cell compared against len(lista_lofs_index), a name that is not
# defined in any visible cell and therefore raised NameError; the count is now
# taken from the LOF labels stored in the results table.
n_lof_outliers = int((X["LOF_label"] == -1).sum())
b = a[:n_lof_outliers]

In [None]:
# Row positions whose LOF score equals one of the selected scores in `b`.
# A vectorised comparison replaces the row-by-row scan with X.iloc: it is far
# faster and lists each row once even when the same score appears twice in `b`.
lof_scores = X["LOF_score"].to_numpy()
lof_is_top = (lof_scores[:, None] == np.asarray(b, dtype=float)).any(axis=1)
indx4 = np.where(lof_is_top)[0].tolist()
indx4

In [None]:
# Selected rows ordered by LOF score, lowest first.
X.iloc[indx4].sort_values(by=['LOF_score'],  ascending=True)    # the more negative the score, the more of an outlier the row is

In [None]:
# Top outliers shared between the methods (alternatively: look at the top 10
# of one method through the scores of the others).
# The list versions stay available as index1..index4; index1..index3 are
# reused by the PCA plots.
index1, index2, index3, index4 = indx1, indx2, indx3, indx4

indx1 = set(indx1)    # KNN
indx2 = set(indx2)    # ABOD
indx3 = set(indx3)    # Isolation Forest
# LOF behaves differently from the other methods: it finds only 9 outliers
# against roughly 600 for the other three, and it shares the fewest outliers
# with them (only 2 elements, one with one method and one with another).
# For this reason we decided to remove the outliers shared by KNN and
# Isolation Forest.
indx4 = set(indx4)    # LOF

# The most similar sets are indx1 and indx3, with 7 of the top 10 outliers in
# common (KNN - Isolation Forest).
# NOTE(review): the intersection below also includes indx4 (LOF), although the
# note above says only KNN and Isolation Forest are meant to be combined -
# confirm which one is intended.
intersect = indx1 & indx3 & indx4
mylist = list(intersect)  # the intersection as a list
mylist

In [None]:
# Attach the subject id and the activity label of the selected rows to the
# results table. Only the rows in `mylist` are assigned; pandas aligns on the
# index, so the remaining rows get no value (NaN).
X["Subject"] = subjects.iloc[mylist][0]
X["Activity"] = y_train.iloc[mylist][0]

In [None]:
# Outliers to be removed.
X.iloc[mylist].round(6)

In [None]:
# Check which activity each selected row belongs to.
y_train.iloc[mylist]

In [None]:
# Check which subject each selected row belongs to.
subject.iloc[mylist]

## Outliers totali in comune

In [None]:
# For every method, keep the rows labelled -1 and the list of their index values.
anomaly_IF = X[X['IF_label'] == -1]
anomaly_IF_index = anomaly_IF.index.tolist()

anomaly_KNN = X[X['KNN_label'] == -1]
anomaly_KNN_index = anomaly_KNN.index.tolist()

anomaly_LOF = X[X['LOF_label'] == -1]
anomaly_LOF_index = anomaly_LOF.index.tolist()

anomaly_ABOD = X[X['ABOD_label'] == -1]
anomaly_ABOD_index = anomaly_ABOD.index.tolist()

In [None]:
# Turn the index lists into sets and intersect them to see which outliers the
# methods have in common (LOF is converted as well but left out of the
# intersection).
anomaly_IF_index = set(anomaly_IF_index)
anomaly_KNN_index = set(anomaly_KNN_index)
anomaly_LOF_index = set(anomaly_LOF_index)
anomaly_ABOD_index = set(anomaly_ABOD_index)

intersezione = anomaly_IF_index.intersection(anomaly_KNN_index, anomaly_ABOD_index)
len(intersezione)

In [None]:
# Number of outliers shared by all methods except LOF.
len(intersezione)

## Grubbs Test

In [None]:
# https://pypi.org/project/outlier_utils/

In [None]:
import numpy as np
from scipy import stats

In [None]:
# Toy sample for the Grubbs test.
data = np.array([5, 14, 15, 15, 14, 19, 17, 16, 20, 22, 8, 21, 28, 11, 9])
# Significance level of the test. It is divided by 2*n further down, so it
# must be the error probability (0.05), not the confidence level (0.95) the
# original used.
alpha = 0.05
n = len(data)

In [None]:
# Per-tail significance level: alpha is divided by 2*n here.
significance_level = alpha / (2*n)  # alternative noted by the author: alpha / n

In [None]:
# Complement of the significance level.
1-significance_level

In [None]:
# Upper critical value of Student's t with n-2 degrees of freedom.
# The third positional argument of isf is `loc`; the original passed 2 there,
# which shifted the critical value by +2 instead of using the standard
# (loc=0) distribution the Grubbs formula expects.
t = stats.t.isf(significance_level, n - 2)

In [None]:
# Critical value of the Grubbs statistic, computed from n and t.
t_squared = t**2
g_test = ((n - 1) / np.sqrt(n)) * np.sqrt(t_squared / (n - 2 + t_squared))

In [None]:
# Absolute deviation of every observation from the sample mean, and the
# position and size of the largest one.
relative_values = np.abs(data - data.mean())
index = np.argmax(relative_values)
value = relative_values[index]

In [None]:
# Position of the most deviating observation and its absolute deviation.
index, value

In [None]:
# Grubbs statistic: largest absolute deviation divided by the sample standard
# deviation. ddof=1 gives the sample (n-1) estimate the test is defined with;
# the default ddof=0 used originally is the population estimate.
g = value / data.std(ddof=1)

In [None]:
# Observed statistic next to its critical value.
g, g_test

In [None]:
# The most deviating observation is reported as an outlier when its statistic
# exceeds the critical value.
verdict = 'outlier' if g > g_test else 'inlier'
print(verdict)

In [None]:
# t critical value at 0.01 with 10 degrees of freedom.
stats.t.isf(0.01, 10)

In [None]:
# t critical value at 0.01 with 100 degrees of freedom.
stats.t.isf(0.01, 100)

In [None]:
# t critical value at 0.01 with 1000 degrees of freedom.
stats.t.isf(0.01, 1000)

## PCA for top10 outliers visualization

In [None]:
from sklearn.decomposition import PCA

# Project the training data onto two principal components and keep the
# result as a DataFrame (columns 0 and 1).
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_train)
principal_df = pd.DataFrame(principal_components)

print(principal_df.shape)

In [None]:
# Extract the projected rows selected for Isolation Forest, to plot them.
outliers_iso_for = principal_df.iloc[index3]

In [None]:
# Scatter plot of the two principal components: all rows in blue, the rows
# selected for Isolation Forest on top in red.
plt.figure(figsize=(8, 5))
all_x, all_y = principal_df[0], principal_df[1]
plt.scatter(all_x, all_y, color="b", s=20, edgecolor='black')
if_x, if_y = outliers_iso_for[0], outliers_iso_for[1]
plt.scatter(if_x, if_y, s=40, color="r", edgecolor='black')
plt.title("Scatter Plot PCA con Outliers IF", fontsize=14)
plt.xlabel("Principal component 1", fontsize=13)
plt.ylabel("Principal component 2", fontsize=13)
plt.show()

In [None]:
# Extract the projected rows selected for KNN, to plot them.
outliers_KNN = principal_df.iloc[index1]

In [None]:
# Scatter plot of the two principal components: all rows in blue, the rows
# selected for KNN on top in red.
plt.figure(figsize=(8, 5))
all_x, all_y = principal_df[0], principal_df[1]
plt.scatter(all_x, all_y, color="b", s=20, edgecolor='black')
knn_x, knn_y = outliers_KNN[0], outliers_KNN[1]
plt.scatter(knn_x, knn_y, s=40, color="r", edgecolor='black')
plt.title("Scatter Plot PCA con Outliers KNN", fontsize=14)
plt.xlabel("Principal component 1", fontsize=13)
plt.ylabel("Principal component 2", fontsize=13)
plt.show()

In [None]:
# Extract the projected rows selected for ABOD, to plot them.
outliers_ABOD = principal_df.iloc[index2]

In [None]:
# Scatter plot of the two principal components: all rows in blue, the rows
# selected for ABOD on top in red.
plt.figure(figsize=(8, 5))
all_x, all_y = principal_df[0], principal_df[1]
plt.scatter(all_x, all_y, color="b", s=20, edgecolor='black')
abod_x, abod_y = outliers_ABOD[0], outliers_ABOD[1]
plt.scatter(abod_x, abod_y, s=40, color="r", edgecolor='black')
plt.title("Scatter Plot PCA con Outliers ABOD", fontsize=14)
plt.xlabel("Principal component 1", fontsize=13)
plt.ylabel("Principal component 2", fontsize=13)
plt.show()