# Deteksi Outlier Iris

## Koneksi ke dataset

In [56]:
import sys

# Record the interpreter build so the environment behind the outputs is visible.
python_version = sys.version
print("Versi Python yang digunakan:", python_version)

Versi Python yang digunakan: 3.11.7 (tags/v3.11.7:fa7a6f2, Dec  4 2023, 19:24:49) [MSC v.1937 64 bit (AMD64)]


In [57]:
import matplotlib.pyplot as plt
import nbimporter  # must precede `koneksi`; presumably enables importing the `koneksi` notebook as a module — TODO confirm
from koneksi import load_data

df = load_data()

# DataFrame.info() prints its summary and returns None, so the result is not stored.
df.info()

# Data structure: shape, per-column dtype and per-column missing-value count.
structure = {
    "shape": df.shape,
    "columns": df.dtypes.to_dict(),
    "missing_values": df.isnull().sum().to_dict()
}

# Descriptive statistics for every column (numeric and non-numeric).
desc_stats = df.describe(include="all")

# Histograms of the numeric columns.
distribution = df.hist(figsize=(10, 8))
plt.tight_layout()
# Render the figure exactly once; closing it at this point instead would
# discard it before the notebook backend had a chance to draw it.
plt.show()

# NOTE(review): .head() keeps only the first five rows of the describe() table,
# so the rows after `mean` are not displayed — confirm this is intended.
structure, desc_stats.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            150 non-null    int64  
 1   species       150 non-null    object 
 2   sepal_length  150 non-null    float64
 3   sepal_width   150 non-null    float64
 4   petal_length  150 non-null    float64
 5   petal_width   150 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


({'shape': (150, 6),
  'columns': {'id': dtype('int64'),
   'species': dtype('O'),
   'sepal_length': dtype('float64'),
   'sepal_width': dtype('float64'),
   'petal_length': dtype('float64'),
   'petal_width': dtype('float64')},
  'missing_values': {'id': 0,
   'species': 0,
   'sepal_length': 0,
   'sepal_width': 0,
   'petal_length': 0,
   'petal_width': 0}},
            id      species  sepal_length  sepal_width  petal_length  \
 count   150.0          150    150.000000      150.000    150.000000   
 unique    NaN            3           NaN          NaN           NaN   
 top       NaN  Iris-setosa           NaN          NaN           NaN   
 freq      NaN           50           NaN          NaN           NaN   
 mean     75.5          NaN      5.843333        3.054      3.758667   
 
         petal_width  
 count    150.000000  
 unique          NaN  
 top             NaN  
 freq            NaN  
 mean       1.198667  )

## Load Model

### ABOD (Angle-Based Outlier Detection)

In [58]:
# Import only the PyCaret functions this notebook calls, instead of a wildcard
# import that hides where names come from.
from pycaret.anomaly import assign_model, create_model, setup

# Pin the session id (the value recorded in this cell's saved output) so a
# re-run reproduces the same experiment instead of drawing a new random id.
# NOTE(review): the setup summary reports 5 numeric features, while the frame has
# only 4 measurement columns plus `id`, so `id` appears to be used as a feature.
# Consider `ignore_features=["id"]` — TODO confirm that the later cells which
# sort by `id` still receive that column from assign_model.
exp_name = setup(data=df, session_id=5794)

# Angle-based outlier detector.
abod = create_model('abod')

Unnamed: 0,Description,Value
0,Session id,5794
1,Original data shape,"(150, 6)"
2,Transformed data shape,"(150, 8)"
3,Numeric features,5
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


### LOF (Local Outlier Factor)

In [59]:
# Pin the session id (the value recorded in this cell's saved output) so a
# re-run reproduces the same experiment instead of drawing a new random id.
exp_name = setup(data=df, session_id=7588)

# Local Outlier Factor detector.
lof = create_model('lof')

Unnamed: 0,Description,Value
0,Session id,7588
1,Original data shape,"(150, 6)"
2,Transformed data shape,"(150, 8)"
3,Numeric features,5
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


### KNN (k-Nearest Neighbors Detector)

In [60]:
# Pin the session id (the value recorded in this cell's saved output) so a
# re-run reproduces the same experiment instead of drawing a new random id.
exp_name = setup(data=df, session_id=6749)

# k-nearest-neighbours detector.
knn = create_model('knn')

Unnamed: 0,Description,Value
0,Session id,6749
1,Original data shape,"(150, 6)"
2,Transformed data shape,"(150, 8)"
3,Numeric features,5
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


## Implementasi Model ke dataset

In [61]:
# Apply each trained detector; every call yields a DataFrame that carries the
# anomaly columns (`Anomaly`, `Anomaly_Score`) used by the cells below.
df_anomaly_abod, df_anomaly_lof, df_anomaly_knn = (
    assign_model(detector) for detector in (abod, lof, knn)
)


## Menampilkan Data dengan skor outlier tertinggi

In [62]:
# Order the ABOD result from the highest to the lowest anomaly score and
# renumber the rows from 0.
df_sorted_abod = df_anomaly_abod.sort_values(
    "Anomaly_Score", ascending=False, ignore_index=True
)
print('Top 5 Deteksi Anomali berdasarkan (ABOD)')
df_sorted_abod.head()

Top 5 Deteksi Anomali berdasarkan (ABOD)


Unnamed: 0,id,species,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
0,107,Iris-virginica,4.9,2.5,4.5,1.7,1,-0.001276
1,123,Iris-virginica,7.7,2.8,6.7,2.0,1,-0.004978
2,119,Iris-virginica,7.7,2.6,6.9,2.3,1,-0.005204
3,118,Iris-virginica,7.7,3.8,6.7,2.2,1,-0.005361
4,120,Iris-virginica,6.0,2.2,5.0,1.5,1,-0.00608


In [63]:
# Order the LOF result from the highest to the lowest anomaly score and
# renumber the rows from 0.
df_sorted_lof = df_anomaly_lof.sort_values(
    "Anomaly_Score", ascending=False, ignore_index=True
)
print('Top 5 Deteksi Anomali berdasarkan (LOF)')
df_sorted_lof.head()

Top 5 Deteksi Anomali berdasarkan (LOF)


Unnamed: 0,id,species,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
0,1,Iris-setosa,5.1,3.5,1.4,0.2,1,1.256153
1,150,Iris-virginica,5.9,3.0,5.1,1.8,1,1.253521
2,2,Iris-setosa,4.9,3.0,1.4,0.2,1,1.217657
3,149,Iris-virginica,6.2,3.4,5.4,2.3,1,1.214298
4,3,Iris-setosa,4.7,3.2,1.3,0.2,1,1.183005


In [64]:
# Order the KNN result from the highest to the lowest anomaly score and
# renumber the rows from 0.
df_sorted_knn = df_anomaly_knn.sort_values(
    "Anomaly_Score", ascending=False, ignore_index=True
)
print('Top 5 Deteksi Anomali berdasarkan (KNN)')
df_sorted_knn.head()

Top 5 Deteksi Anomali berdasarkan (KNN)


Unnamed: 0,id,species,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
0,150,Iris-virginica,5.9,3.0,5.1,1.8,1,5.155579
1,1,Iris-setosa,5.1,3.5,1.4,0.2,1,5.037857
2,51,Iris-versicolor,7.0,3.2,4.7,1.4,1,4.397727
3,50,Iris-setosa,5.0,3.3,1.4,0.2,1,4.389761
4,2,Iris-setosa,4.9,3.0,1.4,0.2,1,4.146083


## Menghapus Data dengan 2 skor outlier tertinggi

In [65]:
# Skip the two top-ranked rows of the score-sorted ABOD frame, then put the
# remaining rows back in `id` order with a fresh 0-based index.
df_abod_clean = (
    df_sorted_abod
    .iloc[2:]
    .sort_values(by="id")
    .reset_index(drop=True)
)
df_abod_clean.head()

Unnamed: 0,id,species,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
0,1,Iris-setosa,5.1,3.5,1.4,0.2,0,-0.010918
1,2,Iris-setosa,4.9,3.0,1.4,0.2,0,-0.105879
2,3,Iris-setosa,4.7,3.2,1.3,0.2,0,-0.16085
3,4,Iris-setosa,4.6,3.1,1.5,0.2,0,-0.108965
4,5,Iris-setosa,5.0,3.6,1.4,0.2,0,-0.103372


In [66]:
# Skip the two top-ranked rows of the score-sorted LOF frame, then put the
# remaining rows back in `id` order with a fresh 0-based index.
df_lof_clean = (
    df_sorted_lof
    .iloc[2:]
    .sort_values(by="id")
    .reset_index(drop=True)
)
df_lof_clean.head()

Unnamed: 0,id,species,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
0,2,Iris-setosa,4.9,3.0,1.4,0.2,1,1.217657
1,3,Iris-setosa,4.7,3.2,1.3,0.2,1,1.183005
2,4,Iris-setosa,4.6,3.1,1.5,0.2,1,1.153191
3,5,Iris-setosa,5.0,3.6,1.4,0.2,0,1.126259
4,6,Iris-setosa,5.4,3.9,1.7,0.4,0,1.104792


In [67]:
# Skip the two top-ranked rows of the score-sorted KNN frame, then put the
# remaining rows back in `id` order with a fresh 0-based index.
df_knn_clean = (
    df_sorted_knn
    .iloc[2:]
    .sort_values(by="id")
    .reset_index(drop=True)
)
df_knn_clean.head()

Unnamed: 0,id,species,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
0,2,Iris-setosa,4.9,3.0,1.4,0.2,1,4.146083
1,3,Iris-setosa,4.7,3.2,1.3,0.2,0,3.190611
2,4,Iris-setosa,4.6,3.1,1.5,0.2,0,3.018278
3,5,Iris-setosa,5.0,3.6,1.4,0.2,0,3.008322
4,6,Iris-setosa,5.4,3.9,1.7,0.4,0,3.190611


In [68]:
# Report the (rows, columns) shape of each cleaned frame, one line per method.
for message, cleaned in (
    ('Ukuran dataset clean dari metode ABOD:', df_abod_clean),
    ('Ukuran dataset clean dari metode LOF:', df_lof_clean),
    ('Ukuran dataset clean dari metode KNN:', df_knn_clean),
):
    print(message, cleaned.shape)

Ukuran dataset clean dari metode ABOD: (148, 8)
Ukuran dataset clean dari metode LOF: (148, 8)
Ukuran dataset clean dari metode KNN: (148, 8)


Hasil di atas menunjukkan bahwa 2 baris data telah terhapus dari masing-masing dataset (150 → 148 baris).

## Download csv

In [69]:
from pathlib import Path

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
Path("results").mkdir(parents=True, exist_ok=True)

# Write each cleaned frame without the positional index column.
df_abod_clean.to_csv("results/data_clean_abod.csv", index=False)
df_lof_clean.to_csv("results/data_clean_lof.csv", index=False)
df_knn_clean.to_csv("results/data_clean_knn.csv", index=False)

[📥 Download hasil clean abod (CSV)](results/data_clean_abod.csv)

[📥 Download hasil clean lof (CSV)](results/data_clean_lof.csv)

[📥 Download hasil clean knn (CSV)](results/data_clean_knn.csv)
