In [29]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import LocalOutlierFactor
from utils import outliers_generation_functions as ogf
# from utils import perturb_within_distribution

In [30]:
import importlib
importlib.reload(ogf)

<module 'utils.outliers_generation_functions' from 'C:\\Users\\Artur\\Desktop\\MasterThesisProject\\utils\\outliers_generation_functions.py'>

In [31]:
####################
# DATA PREPARATION #
####################

In [32]:
# DATA DIVISION AND CLARIFICATION #
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

X = df.drop('target', axis=1)
Y = df['target']

In [43]:
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [34]:
# DATA SCALING #
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [35]:
##############################################
# PERTURB EXISTING POINTS TO CREATE OUTLIERS #
##############################################

In [36]:
# CUSTOM FUNCTION TO MODIFY RANDOM EXISTING POINTS TO CREATE OUTLIERS #
perturbated_example, idx_modified = ogf.perturb_within_distribution(
    original_data=df,
    pct_to_perturb=20,
    target_column='target',
    sigma=3.0,
    random_state=101,
    decimal_places=6,
    negative_values=False
)

In [37]:
# MARKING ROW MODIFICATIONS #
perturbated_example['is_perturbated'] = -1
to_mark = perturbated_example.index[idx_modified]
perturbated_example.loc[to_mark, 'is_perturbated'] = 1
perturbated_example[perturbated_example['is_perturbated'] == 1]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,is_perturbated
18,19.810000,22.150000,130.000000,1260.000000,0.098310,0.102700,0.147900,0.094980,0.158200,0.053950,...,186.800000,2398.000000,0.151200,0.315000,0.537200,0.238800,0.276800,0.076150,0,1
24,16.650000,21.380000,110.000000,904.600000,0.117988,0.145700,0.256063,0.091700,0.122731,0.063300,...,177.000000,5862.692289,0.180500,0.357800,0.000000,0.395731,0.361300,0.095640,0,1
26,14.580000,30.225226,88.908296,1779.246829,0.105400,0.186800,0.142500,0.000000,0.225200,0.068379,...,122.400000,896.900000,0.152500,1.067147,0.553900,0.270100,0.426400,0.127500,0,1
27,19.537046,8.458852,53.375867,1966.092303,0.094400,0.106600,0.000000,0.077310,0.124517,0.066485,...,139.744677,1129.512320,0.081257,0.586269,1.022662,0.353268,0.031012,0.118342,0,1
44,4.047851,9.526718,45.381637,69.649451,0.080337,0.104700,0.158027,0.052520,0.188967,0.060276,...,10.600896,1094.824011,0.085545,0.494116,1.278365,0.404131,0.369300,0.088326,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,11.394247,15.401639,75.087222,22.868427,0.088770,0.080660,0.043580,0.024380,0.166900,0.068459,...,71.080000,35.571729,0.234878,0.902518,0.000000,0.000000,0.118123,0.041585,1,1
555,0.514006,27.610000,65.670000,321.400000,0.090300,0.076580,0.059990,0.027380,0.159300,0.061270,...,69.570000,357.600000,0.138400,0.171000,0.200000,0.091270,0.200268,0.082830,1,1
563,20.920000,25.090000,143.000000,1347.000000,0.109900,0.223600,0.317400,0.147400,0.214900,0.068790,...,179.100000,1819.000000,0.140700,0.418600,0.659900,0.254200,0.292900,0.098730,0,1
564,19.819760,1.368121,176.257249,1479.000000,0.111000,0.070078,0.243900,0.077806,0.172600,0.056230,...,79.476425,1709.195820,0.141000,0.886663,0.000000,0.221600,0.443325,0.071150,0,1


In [38]:
features = perturbated_example.drop(columns=['is_perturbated'])

In [39]:
# POST-PERTURB DATA SCALING #
perturbated_example_scaled = scaler.fit_transform(features)

# NOW 20% IS EXPECTED TO BE OUTLIERS #
lof_2 = LocalOutlierFactor(contamination=0.2, n_neighbors=10)

In [40]:
# TO CHECK IF NEW OUTLIERS ARE DETECTED #
perturbated_example['outliers_detection'] = lof_2.fit_predict(perturbated_example_scaled)
perturbated_example[perturbated_example['is_perturbated'] == 1]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,is_perturbated,outliers_detection
18,19.810000,22.150000,130.000000,1260.000000,0.098310,0.102700,0.147900,0.094980,0.158200,0.053950,...,2398.000000,0.151200,0.315000,0.537200,0.238800,0.276800,0.076150,0,1,1
24,16.650000,21.380000,110.000000,904.600000,0.117988,0.145700,0.256063,0.091700,0.122731,0.063300,...,5862.692289,0.180500,0.357800,0.000000,0.395731,0.361300,0.095640,0,1,-1
26,14.580000,30.225226,88.908296,1779.246829,0.105400,0.186800,0.142500,0.000000,0.225200,0.068379,...,896.900000,0.152500,1.067147,0.553900,0.270100,0.426400,0.127500,0,1,-1
27,19.537046,8.458852,53.375867,1966.092303,0.094400,0.106600,0.000000,0.077310,0.124517,0.066485,...,1129.512320,0.081257,0.586269,1.022662,0.353268,0.031012,0.118342,0,1,-1
44,4.047851,9.526718,45.381637,69.649451,0.080337,0.104700,0.158027,0.052520,0.188967,0.060276,...,1094.824011,0.085545,0.494116,1.278365,0.404131,0.369300,0.088326,0,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,11.394247,15.401639,75.087222,22.868427,0.088770,0.080660,0.043580,0.024380,0.166900,0.068459,...,35.571729,0.234878,0.902518,0.000000,0.000000,0.118123,0.041585,1,1,-1
555,0.514006,27.610000,65.670000,321.400000,0.090300,0.076580,0.059990,0.027380,0.159300,0.061270,...,357.600000,0.138400,0.171000,0.200000,0.091270,0.200268,0.082830,1,1,1
563,20.920000,25.090000,143.000000,1347.000000,0.109900,0.223600,0.317400,0.147400,0.214900,0.068790,...,1819.000000,0.140700,0.418600,0.659900,0.254200,0.292900,0.098730,0,1,1
564,19.819760,1.368121,176.257249,1479.000000,0.111000,0.070078,0.243900,0.077806,0.172600,0.056230,...,1709.195820,0.141000,0.886663,0.000000,0.221600,0.443325,0.071150,0,1,-1


In [45]:
################################################################
# GENERATE MANY FILES WITH ROWS PERTURBATIONS, AND SAVE TO CSV #
################################################################

percentages = np.arange(1, 51, 1)

for p in percentages:
    df_out = ogf.perturb_within_distribution(
        original_data=df,
        pct_to_perturb=p,
        target_column='target',
        sigma=6.0,
        random_state=101,
        decimal_places=6,
        negative_values=True,
        save=True,
        directory_name='breast_cancer_1to50pct_random_random_s6',
        key_word='breast_cancer'
    )

In [47]:
for p in percentages:
    df_out = ogf.perturb_within_distribution(
        original_data=df,
        pct_to_perturb=p,
        target_column='target',
        features_to_perturb=30,
        sigma=6.0,
        random_state=101,
        decimal_places=6,
        negative_values=True,
        save=True,
        directory_name='breast_cancer_1to50pct_30_s6',
        key_word='breast_cancer'
    )