In [17]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [18]:
####################
# DATA PREPARATION #
####################

In [19]:
# LOAD THE DATASET AND SPLIT IT INTO FEATURES / TARGET #

data = load_breast_cancer()

# One frame holding every feature column plus the label column 'target'.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Feature matrix (everything except the label) and the label vector.
Y = df['target']
X = df.drop(columns='target')

In [20]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# STANDARDISE THE ORIGINAL FEATURES #

# Fit the scaler on the unperturbed feature matrix, then apply it to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [23]:
##############################################
# PERTURB EXISTING POINTS TO CREATE OUTLIERS #
##############################################

In [24]:
# IMPORT THE LOCAL HELPER MODULE AND RELOAD IT #
# Reloading re-executes the module, so edits made to
# utils/outliers_generation_functions.py are picked up without a kernel restart.

import importlib

from utils import outliers_generation_functions as ogf

importlib.reload(ogf)

<module 'utils.outliers_generation_functions' from 'C:\\Users\\Artur\\Desktop\\PythonProject\\utils\\outliers_generation_functions.py'>

In [25]:
# PERTURB A RANDOM SUBSET OF EXISTING ROWS TO CREATE OUTLIERS #
# The helper lives in utils/outliers_generation_functions.py (not shown here).
# Its two return values are used below as the perturbed frame and the row
# labels that were changed.
# NOTE(review): pct_to_perturb looks like a percentage of rows and
# features_to_perturb a count of columns per row -- confirm against the helper.

perturbed_example, idx_modified = ogf.perturb_within_distribution(
    original_data=df, target_column='target',
    pct_to_perturb=20, features_to_perturb=15,
    gamma=3.0, decimal_places=6,
    random_state=101,
)

In [26]:
# FLAG WHICH ROWS WERE MODIFIED #
# Convention: -1 = untouched row, 1 = perturbed row.

perturbed_example['is_perturbed'] = -1                    # default: nothing modified
perturbed_example.loc[idx_modified, 'is_perturbed'] = 1   # overwrite for the modified rows

# Show only the perturbed rows.
perturbed_example.loc[perturbed_example['is_perturbed'].eq(1)]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,is_perturbed
17,16.130000,20.680000,108.100000,-810.382138,0.117000,0.202200,0.385731,0.102800,0.198143,0.051829,...,136.800000,1315.000000,0.245732,0.423300,0.478400,0.375961,0.385580,0.114200,0,1
24,16.650000,21.380000,110.000000,-140.509208,0.112100,-0.007330,0.152500,0.197423,0.199500,0.056535,...,177.000000,2215.000000,0.125785,0.357800,1.150589,0.160732,0.361300,0.095640,0,1
26,6.706950,21.530000,97.410000,644.800000,0.091537,0.186800,-0.391272,0.076588,0.225200,0.069240,...,164.430315,-1217.660852,0.112392,0.217429,0.230247,0.270100,0.198220,0.127500,0,1
27,18.610000,20.250000,122.100000,1918.093325,0.094400,0.145934,0.149000,0.077310,0.288829,0.056990,...,139.900000,802.593525,0.133800,-0.008603,0.344600,0.149000,0.061726,0.074210,0,1
43,38.765658,20.280000,25.276580,545.200000,0.104100,0.223657,0.274699,-0.055416,0.197400,0.067820,...,93.704550,912.263294,0.153000,-0.178216,0.366400,0.149200,0.373900,0.102700,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,25.090879,16.350000,27.515998,1074.781556,0.094340,0.049940,0.084120,0.005495,0.254433,0.098180,...,71.120000,384.900000,0.128500,-1.184352,0.043840,0.056307,0.376024,0.073990,1,1
554,14.778418,28.333515,162.202499,-316.420622,0.096108,0.058240,0.158757,0.023430,0.258794,0.051507,...,88.840000,595.700000,0.122700,0.162000,0.384573,0.064930,-0.165101,0.072420,1,1
562,15.220000,30.620000,161.541110,-493.431978,0.090658,0.101568,0.255000,-0.093999,0.158461,0.086989,...,175.076495,3416.946142,0.046535,0.634054,1.170000,0.296600,0.408900,0.140900,0,1
563,29.599911,25.861464,143.000000,958.220939,0.143675,0.223600,0.317400,0.147400,0.279542,0.092072,...,312.176748,1819.000000,0.085159,0.418600,1.455224,0.254200,0.292900,0.098730,0,1


In [27]:
# FEATURE MATRIX FOR OUTLIER DETECTION #
# Only the feature columns may reach the scaler / LOF, so every bookkeeping
# column is removed:
#   * 'is_perturbed' and 'target' must exist at this point (a KeyError here
#     means an earlier cell was skipped);
#   * 'outliers_detection' is written into this same frame by the detection
#     cell further down, so it is present only when this cell is re-run after
#     that one. Dropping it with errors='ignore' keeps the cell idempotent and
#     stops earlier LOF predictions leaking into the features.
example_X = (
    perturbed_example
    .drop(columns=['is_perturbed', 'target'])
    .drop(columns=['outliers_detection'], errors='ignore')
)

In [28]:
from sklearn.neighbors import LocalOutlierFactor

In [29]:
# STANDARDISE THE PERTURBED FEATURES #
# NOTE: this refits the shared `scaler` object, so from here on it holds the
# statistics of the perturbed data rather than those of the original X.
scaler.fit(example_X)
example_X_scaled = scaler.transform(example_X)

# LOF DETECTOR #
# contamination=0.20 mirrors the 20% of rows perturbed in the example above.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.20)

In [30]:
# HOW MANY OF THE PERTURBED ROWS DOES LOF FLAG? #

# Store the LOF prediction next to the ground-truth flag; -1 is tested below as "outlier".
perturbed_example['outliers_detection'] = lof.fit_predict(example_X_scaled)

was_perturbed = perturbed_example['is_perturbed'] == 1
flagged_by_lof = perturbed_example['outliers_detection'] == -1

# Row counts: all perturbed rows, and the perturbed rows that LOF also flagged.
perturbed = int(was_perturbed.sum())
mask = was_perturbed & flagged_by_lof
detected = int(mask.sum())

# Share of perturbed rows that were flagged, as a percentage with two decimals.
pct_detected = round((detected/perturbed)*100, 2)
print(f"Percentage of perturbed rows detected by LOF: {pct_detected}%")

Percentage of perturbed rows detected by LOF: 97.37%


In [31]:
##############################################################
# GENERATE MANY FILES WITH ROW PERTURBATIONS AND SAVE TO CSV #
##############################################################

In [32]:
# Perturbation levels to generate: 1, 2, ..., 50.
percentages = np.arange(1, 51, 1)

# Arguments that stay the same for every run of the sweep; only
# pct_to_perturb changes between iterations.
sweep_settings = dict(
    original_data=df,
    target_column='target',
    features_to_perturb=15,
    gamma=6.0,
    random_state=101,
    decimal_places=6,
    save=True,
    directory_name='BCW_1to50pct_15of30_g6',
    key_word='perturbed',
)

# NOTE(review): with save=True the helper presumably writes one file per call
# into `directory_name` -- confirm in utils/outliers_generation_functions.py.
# df_out is overwritten on every iteration, so only the last result is kept.
for p in percentages:
    df_out = ogf.perturb_within_distribution(pct_to_perturb=p, **sweep_settings)