In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits

In [7]:
####################
# DATA PREPARATION #
####################

In [10]:
# LOAD THE DIGITS DATASET AND SPLIT IT INTO FEATURES AND TARGET #

data = load_digits()

# One column per pixel feature, plus the class label stored in 'target'.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Feature matrix (every column except the label) and the label vector.
X = df.drop(columns='target')
Y = df['target']

In [11]:
# Preview the first five rows (the default row count of head()).
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Standardizer for the feature matrix; it is fitted later, on the perturbed features.
scaler = StandardScaler()

In [15]:
##############################################
# PERTURB EXISTING POINTS TO CREATE OUTLIERS #
##############################################

In [16]:
# TO LOAD AND REFRESH CHANGES #

# Project-local helpers used below to generate the perturbed (outlier) rows.
from utils import outliers_generation_functions as ogf
import importlib
# Re-execute the module so that edits made to its source file after the first
# import take effect without restarting the kernel.
importlib.reload(ogf)

<module 'utils.outliers_generation_functions' from 'C:\\Users\\Artur\\Desktop\\PythonProject\\utils\\outliers_generation_functions.py'>

In [24]:
# PERTURB A RANDOM SUBSET OF EXISTING ROWS TO CREATE OUTLIERS (CUSTOM HELPER) #

# All settings are collected in one place so they are easy to read and tweak.
# NOTE(review): the exact meaning of each option is defined in
# utils/outliers_generation_functions.py; the inline notes below are inferred
# from the argument names only - confirm against the helper.
perturb_settings = dict(
    original_data=df,
    pct_to_perturb=25,       # presumably the percentage of rows to modify
    target_column='target',
    feats_to_perturb=16,     # presumably the number of feature columns changed per row
    gamma=2.0,               # presumably controls the perturbation strength
    random_seed=101,
    decimal_places=1,
)

# The helper returns the modified frame plus a second value that this notebook
# uses as the index labels of the rows that were changed.
df_modified, idx_modified = ogf.perturb_within_distribution(**perturb_settings)

In [25]:
# FLAG WHICH ROWS WERE PERTURBED #

# 1 = perturbed row, -1 = untouched row.
# Note: this is the opposite sign of scikit-learn's outlier convention, where
# -1 marks an outlier and 1 an inlier.
df_modified['is_perturbed'] = -1
df_modified.loc[idx_modified, 'is_perturbed'] = 1

# Display only the rows that were perturbed.
df_modified.loc[df_modified['is_perturbed'].eq(1)]

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target,is_perturbed
1,0.0,0.0,9.3,12.0,17.4,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-2.2,9.2,16.0,10.0,2.1,1.5,1,1
7,0.0,2.3,23.2,-1.6,13.0,16.0,15.0,0.9,0.0,0.0,...,0.0,0.0,13.0,5.0,0.0,0.0,0.0,0.0,7,1
18,0.0,-0.2,10.0,7.0,13.0,9.0,0.0,0.0,0.2,-7.1,...,0.0,0.5,11.0,14.0,5.0,5.8,0.0,0.0,8,1
19,0.0,0.0,6.0,14.0,4.0,0.4,0.0,-2.7,0.0,0.0,...,0.0,0.0,7.0,16.0,16.0,24.8,11.0,1.0,9,1
20,0.0,-5.2,3.0,12.6,10.9,-6.4,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,12.0,13.0,4.0,-4.1,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1778,0.0,0.0,0.0,9.1,4.9,8.0,0.0,0.0,0.0,-6.3,...,0.0,0.0,0.0,0.0,15.0,0.5,0.0,0.0,4,1
1779,0.0,0.0,19.7,13.3,16.0,16.0,4.0,-0.7,0.0,0.0,...,0.0,-1.1,3.0,12.0,0.0,0.0,-3.7,0.0,7,1
1783,0.0,-2.9,15.0,13.0,1.0,0.0,-0.4,0.0,0.0,0.0,...,0.0,0.0,8.0,13.0,10.0,6.0,2.0,0.0,2,1
1785,0.0,1.1,10.0,16.0,19.3,2.0,0.0,-2.7,-0.1,1.0,...,0.0,0.4,10.0,15.0,2.0,0.0,0.0,0.0,7,1


In [26]:
# Feature matrix of the perturbed data. The feature columns are selected by
# name (same columns and order as X) instead of dropping a fixed list of helper
# columns, so re-running this cell after 'lof_defection' has been added to
# df_modified does not leak that prediction column into the features.
X_modified = df_modified[X.columns]

In [27]:
from sklearn.neighbors import LocalOutlierFactor

In [28]:
# POST-PERTURB DATA SCALING #
# The scaler is fitted on the perturbed features themselves.
X_modified_scaled = scaler.fit_transform(X_modified)

# 25% OF THE ROWS WERE PERTURBED, SO 25% IS EXPECTED TO BE OUTLIERS #
# (contamination=0.25 matches pct_to_perturb=25 used when df_modified was generated)
lof = LocalOutlierFactor(contamination=0.25, n_neighbors=20)

In [29]:
# CHECK HOW MANY OF THE PERTURBED ROWS LOF FLAGS AS OUTLIERS #

# fit_predict labels outliers as -1 and inliers as 1.
# NOTE(review): 'lof_defection' looks like a typo for 'lof_detection'; the
# column name is kept unchanged because other cells may refer to it.
df_modified['lof_defection'] = lof.fit_predict(X_modified_scaled)

was_perturbed = df_modified['is_perturbed'].eq(1)
flagged_by_lof = df_modified['lof_defection'].eq(-1)
mask = was_perturbed & flagged_by_lof

# Row counts: all perturbed rows, and the perturbed rows that LOF also flagged.
perturbed = int(was_perturbed.sum())
detected = int(mask.sum())

# Share of the perturbed rows that LOF recovered, as a percentage.
pct_detected = round((detected / perturbed) * 100, 2)
print(f"Percentage of perturbed rows detected by LOF: {pct_detected}%")

Percentage of perturbed rows detected by LOF: 91.09%


In [30]:
import seaborn as sns
import matplotlib.pyplot as plt

In [111]:
################################################################
# GENERATE MANY FILES WITH ROWS PERTURBATIONS, AND SAVE TO CSV #
################################################################

In [33]:
# Perturbation levels to generate: 1%, 2%, ..., 50%.
percentages = np.arange(1, 51, 1)

# Settings shared by every run; only the percentage changes inside the loop.
# The same random_seed is reused for every percentage.
# NOTE(review): save/directory_name/key_word presumably make the helper write
# one CSV per call into the given directory - confirm in
# utils/outliers_generation_functions.py.
sweep_settings = dict(
    original_data=df,
    target_column='target',
    feats_to_perturb=16,
    gamma=2.0,
    random_seed=101,
    decimal_places=1,
    save=True,
    directory_name='D_1to50pct_16of64_g2.0',
    key_word='perturbed',
)

for p in percentages:
    # df_out is overwritten on every pass; only the last call's return value
    # remains available after the loop.
    df_out = ogf.perturb_within_distribution(pct_to_perturb=p, **sweep_settings)