In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [None]:
####################
# DATA PREPARATION #
####################

In [None]:
# Load the breast-cancer dataset and split it into features / label

data = load_breast_cancer()

# The loader's labels are flipped (1 - target) so that the malignant case is 1.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    target=1 - data.target
)

# Feature matrix and label vector
X = df.drop(columns='target')
Y = df['target']

In [None]:
# Preview the first five rows (head() defaults to 5)
df.head()

In [None]:
# Share of rows with target == 1, expressed as a percentage with two decimals
malignant_pct = round(100 * df['target'].mean(), 2)
malignant_pct

In [None]:
##############################################
# PERTURB EXISTING POINTS TO CREATE OUTLIERS #
##############################################

In [None]:
# Import the project helpers and force a reload so that edits made to the
# module since the kernel started are picked up.
from utils import outliers_generation_functions as ogf
import importlib

# importlib.reload returns the (re)loaded module. Binding it keeps `ogf`
# pointing at that object and stops the cell from echoing the module repr
# (which typically includes a local file path) as its output.
ogf = importlib.reload(ogf)

In [None]:
# Use the project helper to modify existing points so that they become outliers

# Settings for this run; the source frame is passed separately below.
perturb_settings = {
    'pct_to_perturb': 25,
    'target_column': 'target',
    'feats_to_perturb': 10,
    'gamma': 2.0,
    'random_seed': 101,
    'decimal_places': 6,
}

# The helper's return value is unpacked into the modified frame and the
# index of the rows that were changed.
df_modified, idx_modified = ogf.perturb_within_distribution(
    original_data=df, **perturb_settings
)

In [None]:
# Flag every row: -1 by default, 1 for the rows listed in idx_modified
df_modified['is_perturbed'] = -1
df_modified.loc[idx_modified, 'is_perturbed'] = 1

# Display only the rows that were modified
df_modified.loc[df_modified['is_perturbed'].eq(1)]

In [None]:
# Feature matrix for the detector: strip the label and the bookkeeping flag.
# A later cell adds 'lof_detection' to df_modified; it is dropped here as well
# (only when present) so that re-running this cell after that one does not
# leak the detector's own output into the features.
X_modified = (
    df_modified
    .drop(columns=['is_perturbed', 'target'])
    .drop(columns=['lof_detection'], errors='ignore')
)

In [None]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardising scaler with its default behaviour spelled out
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Scale the data after perturbation: the scaler is fitted on, and applied to,
# the perturbed feature matrix.
X_modified_scaled = scaler.fit(X_modified).transform(X_modified)

# 25% of the rows are now expected to be outliers, hence contamination=0.25
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.25)

In [None]:
# Check how many of the perturbed rows LOF flags as outliers

df_modified['lof_detection'] = lof.fit_predict(X_modified_scaled)

# Row-wise conditions: perturbed rows, and rows LOF labelled -1
was_perturbed = df_modified['is_perturbed'].eq(1)
flagged_by_lof = df_modified['lof_detection'].eq(-1)
mask = was_perturbed & flagged_by_lof

# Plain-int counts of perturbed rows and of perturbed rows that were flagged
perturbed = int(was_perturbed.sum())
detected = int(mask.sum())

pct_detected = round(100 * (detected / perturbed), 2)
print(f"Percentage of perturbed rows detected by LOF: {pct_detected}%")

# Display every row LOF labelled -1
df_modified.loc[flagged_by_lof]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# What do the new outliers look like on graphs?

# Features to plot, plus the flag column used for colouring
cols = ['mean perimeter', 'mean concavity', 'mean radius',
        'mean area', 'mean concave points', 'is_perturbed']

# sns.pairplot is a figure-level function: it always builds its own figure.
# A preceding plt.figure(figsize=..., dpi=...) therefore never sizes the pair
# plot; it only leaves an extra, empty canvas behind, so it is not created
# here. To resize the grid, pass height= / aspect= to pairplot instead.
sns.pairplot(
    data=df_modified[cols],
    hue='is_perturbed',
    palette='viridis')
plt.show()

In [None]:
# Percent variability: pct_to_perturb runs over 1..50, everything else is fixed
percentages = np.arange(1, 51)
f = 4
g = 7.0
s = 101
save = True

# Arguments shared by every run of this sweep
fixed_args = dict(
    original_data=df,
    target_column='target',
    feats_to_perturb=f,
    gamma=g,
    random_seed=s,
    decimal_places=6,
    save=save,
    directory_name=f"P[Var]_F[{f}]_G[{g}]_S[{s}]",
    key_word='percent',
)

# NOTE(review): an earlier cell unpacks this helper's return into
# (frame, indices), so df_out presumably holds that pair, and only the
# last iteration's value survives the loop.
for p in percentages:
    df_out = ogf.perturb_within_distribution(pct_to_perturb=p, **fixed_args)

In [None]:
# Gamma variability: gamma runs over 0.2, 0.4, ..., 10.0, everything else is fixed.
# The grid is built with linspace and an explicit point count: with a
# non-integer step, the length of np.arange depends on floating-point
# rounding, whereas linspace always yields exactly 50 values including 10.0.
# Rounding to one decimal gives clean values such as 0.6 rather than 0.6000000000000001.
gammas = np.round(np.linspace(0.2, 10.0, 50), 1)
f = 15
p = 25
s = 42
save = True

# Arguments shared by every run of this sweep
fixed_args = dict(
    original_data=df,
    pct_to_perturb=p,
    target_column='target',
    feats_to_perturb=f,
    random_seed=s,
    decimal_places=6,
    save=save,
    directory_name=f"G[Var]_F[{f}]_P[{p}]_S[{s}]",
    key_word='gamma',
)

# NOTE(review): an earlier cell unpacks this helper's return into
# (frame, indices), so df_out presumably holds that pair, and only the
# last iteration's value survives the loop.
for g in gammas:
    df_out = ogf.perturb_within_distribution(gamma=g, **fixed_args)

In [None]:
# Number-of-features variability: feats_to_perturb runs over 1..30, everything else is fixed
n_features = np.arange(1, 31)
g = 6.0
p = 25
s = 42
save = True

# Arguments shared by every run of this sweep
fixed_args = dict(
    original_data=df,
    pct_to_perturb=p,
    target_column='target',
    gamma=g,
    random_seed=s,
    decimal_places=6,
    save=save,
    directory_name=f"F[Var]_G[{g}]_P[{p}]_S[{s}]",
    key_word='features',
)

# NOTE(review): an earlier cell unpacks this helper's return into
# (frame, indices), so df_out presumably holds that pair, and only the
# last iteration's value survives the loop.
for f in n_features:
    df_out = ogf.perturb_within_distribution(feats_to_perturb=f, **fixed_args)

In [None]:
# Selected list of features: pct_to_perturb runs over 1..50 while the
# perturbation is given an explicit list of column names instead of a count.
percentages = np.arange(1, 51)
f = [
    'mean radius', 'mean perimeter', 'mean area', 'mean compactness',
    'mean concavity', 'worst radius', 'worst perimeter', 'worst area',
    'worst compactness', 'mean concave points',
]
g = 5.0
s = 101
# NOTE(review): save is False only in this sweep; the other sweeps pass
# True. Confirm that this is intentional.
save = False

# Arguments shared by every run of this sweep
fixed_args = dict(
    original_data=df,
    target_column='target',
    feats_to_perturb=f,
    gamma=g,
    random_seed=s,
    decimal_places=6,
    save=save,
    directory_name=f"P[Var]_F[HighCorr{len(f)}]_G[{g}]_S[{s}]",
    key_word='percent',
)

# df_out is overwritten on every pass, so only the last run's result remains.
for p in percentages:
    df_out = ogf.perturb_within_distribution(pct_to_perturb=p, **fixed_args)