In [144]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [145]:
####################
# DATA PREPARATION #
####################

In [146]:
# LOAD THE DATASET AND SPLIT IT INTO FEATURES / TARGET #

data = load_breast_cancer()

# One frame holding every feature column plus the label as a trailing 'target' column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# X: all feature columns; Y: the label column on its own.
X = df.drop(columns='target')
Y = df['target']

In [147]:
# Preview the first five rows (five is the default row count of head()).
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [148]:
from sklearn.preprocessing import StandardScaler

In [149]:
# DATA SCALING #
# Fit a StandardScaler on the unperturbed features, then apply it to obtain
# the standardised copy of X.

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [150]:
##############################################
# PERTURB EXISTING POINTS TO CREATE OUTLIERS #
##############################################

In [151]:
# TO LOAD AND REFRESH CHANGES #
# Import the project-local outlier-generation helpers under the alias `ogf`,
# then reload the module so that edits made to its source file after the
# first import are picked up without restarting the kernel.

from utils import outliers_generation_functions as ogf
import importlib
importlib.reload(ogf)  # last expression of the cell, so the module repr is echoed as output

<module 'utils.outliers_generation_functions' from 'C:\\Users\\Artur\\Desktop\\PythonProject\\utils\\outliers_generation_functions.py'>

In [152]:
# BUILD ONE PERTURBED EXAMPLE WITH THE CUSTOM HELPER #
# Per the original note, the helper modifies random existing points to create
# outliers. Its settings are collected in one place so they are easy to scan.

example_settings = dict(
    original_data=df,
    pct_to_perturb=15,
    target_column='target',
    features_to_perturb=15,
    gamma=3.0,
    random_state=101,
    decimal_places=6,
)

# The helper's result is unpacked into the perturbed frame and the labels of
# the rows it modified (used below to flag those rows).
perturbed_example, idx_modified = ogf.perturb_within_distribution(**example_settings)

In [153]:
# FLAG WHICH ROWS WERE MODIFIED #
# Convention: 1 = row was perturbed, -1 = row left untouched.

is_perturbed_flag = pd.Series(-1, index=perturbed_example.index)
is_perturbed_flag.loc[idx_modified] = 1
perturbed_example['is_perturbed'] = is_perturbed_flag

# Show only the perturbed rows.
perturbed_example.loc[perturbed_example['is_perturbed'].eq(1)]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,is_perturbed
4,26.674524,14.340000,201.033160,138.641402,0.012375,0.132800,0.198000,0.104300,0.161516,0.038934,...,152.200000,3618.770072,0.168667,0.205000,1.064886,-0.001193,0.232982,0.076780,0,1
5,8.686395,15.700000,171.196134,477.100000,0.108332,0.170000,0.040321,0.080890,0.208700,0.076130,...,113.392388,741.600000,0.150690,-0.230047,0.535500,0.174100,0.398500,0.124400,0,1
6,36.968968,36.927692,119.600000,1989.429433,0.089730,0.264767,0.323141,-0.046416,0.179400,0.057420,...,265.753889,-278.796177,0.144200,-0.375322,0.378400,0.327162,0.306300,0.083680,0,1
7,13.710000,20.830000,99.029383,577.900000,0.084733,0.021552,0.107590,0.059850,0.276457,0.075712,...,200.771898,897.000000,0.125448,0.368200,-0.110669,0.353702,0.319600,0.201745,0,1
9,4.586950,24.040000,83.970000,916.076865,0.118600,0.146827,0.036984,0.085430,0.203000,0.082430,...,64.528669,711.400000,0.185300,1.058000,1.105000,0.034100,0.022360,0.207500,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519,4.769372,16.700000,82.510000,-541.015198,0.112500,-0.079640,0.168588,0.029950,0.246438,0.066230,...,243.362490,1354.448780,0.175960,0.197900,0.142300,0.080450,0.307100,0.074427,1,1
532,15.048944,14.931411,-77.378001,479.085577,0.092770,0.072550,0.017520,0.018800,0.004771,0.061550,...,101.600000,1194.200717,0.126400,-0.418244,0.120600,0.087040,0.158238,0.050270,1,1
546,0.274546,34.063934,-43.018318,324.900000,0.137828,0.168360,0.010120,0.005495,0.188500,0.062010,...,-50.157204,384.900000,0.061724,-0.224579,0.043840,0.023810,0.291086,0.050526,1,1
561,4.728287,29.370000,70.670000,386.000000,0.074490,-0.176383,0.000000,0.000000,0.150633,0.055020,...,75.190000,-1839.148820,0.138320,-0.884409,0.188211,-0.080512,0.386716,0.059050,1,1


In [154]:
# Feature matrix of the perturbed data: strip the label and the bookkeeping columns.
# 'outliers_detection' is added to perturbed_example by a later cell; dropping it
# here when present keeps this cell idempotent, so re-running it after that cell
# does not leak the LOF labels into the features.
example_X = (
    perturbed_example
    .drop(columns=['is_perturbed', 'target'])
    .drop(columns=['outliers_detection'], errors='ignore')
)

In [155]:
from sklearn.neighbors import LocalOutlierFactor

In [156]:
# POST-PERTURB DATA SCALING #
# Use a fresh StandardScaler instead of re-fitting the shared `scaler`:
# re-fitting would overwrite the statistics learned from the unperturbed X,
# leaving `scaler` out of sync with `X_scaled`. The scaled values produced
# here are the same either way.
example_X_scaled = StandardScaler().fit_transform(example_X)

# NOW 15% IS EXPECTED TO BE OUTLIERS #
# contamination=0.15 mirrors pct_to_perturb=15 used when building perturbed_example.
lof = LocalOutlierFactor(contamination=0.15, n_neighbors=20)

In [159]:
# HOW MANY OF THE PERTURBED ROWS DOES LOF FLAG AS OUTLIERS? #

# fit_predict labels are stored next to the data; -1 is the value treated as "outlier" below.
perturbed_example['outliers_detection'] = lof.fit_predict(example_X_scaled)

flagged_by_lof = perturbed_example['outliers_detection'] == -1
was_perturbed = perturbed_example['is_perturbed'] == 1

# Total number of rows flagged by LOF, perturbed or not.
detected_1 = int(flagged_by_lof.sum())

# Number of perturbed rows, and how many of those LOF also flagged.
perturbed = int(was_perturbed.sum())
mask = was_perturbed & flagged_by_lof
detected = int(mask.sum())

# Share of perturbed rows recovered by LOF, in percent.
pct_detected = round((detected / perturbed) * 100, 2)
pct_detected

96.47

In [160]:
################################################################
# GENERATE MANY FILES WITH PERTURBED ROWS AND SAVE THEM TO CSV #
################################################################

In [161]:
# Perturbation percentages to sweep: the integers 1 through 50.
percentages = np.arange(1, 51)

# Settings shared by every run; only pct_to_perturb changes between iterations.
sweep_settings = dict(
    target_column='target',
    features_to_perturb=15,
    gamma=6.0,
    random_state=101,
    decimal_places=6,
    save=True,
    directory_name='BCW_1to50pct_15of30_g6',
    key_word='perturbed',
)

for pct in percentages:
    # save=True is passed so the helper persists each result; presumably one file
    # per percentage under directory_name — verify against the helper.
    # df_out only keeps the return value of the last call.
    df_out = ogf.perturb_within_distribution(
        original_data=df,
        pct_to_perturb=pct,
        **sweep_settings,
    )