In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast-cancer dataset into a single frame and split it into
# features (X) and label (Y).
data = load_breast_cancer()

# Flip the label (1 - target) while attaching it, so that the malignant
# case is marked as 1.
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = 1 - data.target

Y = df['target']
X = df.drop(columns='target')

In [None]:
# Project-local helper modules from the `utils` package.
from utils import outliers_generation_functions as ogf
from utils import helper_functions as hf
import importlib
# Reload both modules so that edits made to their source files are picked up
# without restarting the kernel.
importlib.reload(ogf)
importlib.reload(hf)

In [None]:
# Post-perturb data scaling #
from sklearn.preprocessing import StandardScaler
# Percentage of new outliers detected #
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Parameter grid and fixed settings for the perturbation experiment.
# Each value is passed as `gamma` to ogf.perturb_within_distribution.
gammas = [2, 3, 4, 6, 8, 10, 12]
# Each value is passed as `feats_to_perturb` to the same call.
n_features = [3, 5, 7, 9, 11, 12, 15]
# Passed as `random_seed` to every perturbation call and stored with each result row.
seed = 101
# Passed as `pct_to_perturb` (presumably a percentage of rows — confirm in ogf);
# also turned into the LOF contamination fraction via pct * 0.01.
pct = 25

In [None]:
# Grid search: for every (gamma, n_features) pair, perturb the data, run LOF on
# the standardized features and record which rows were flagged and what share
# of the perturbed rows was detected.
rows = []
for g in gammas:
    for n in n_features:
        df_modified, idx_modified = ogf.perturb_within_distribution(
            original_data=df,
            pct_to_perturb=pct,
            target_column='target',
            feats_to_perturb=n,
            gamma=g,
            random_seed=seed,
            decimal_places=6,
        )

        # Ground-truth flag: 1 for rows returned in idx_modified, -1 for all others.
        df_modified['is_perturbed'] = -1
        df_modified.loc[idx_modified, 'is_perturbed'] = 1

        # Scale the feature columns only (label and ground-truth flag excluded).
        X_modified = df_modified.drop(columns=['is_perturbed', 'target'])
        X_modified_scaled = StandardScaler().fit_transform(X_modified)

        lof = LocalOutlierFactor(n_neighbors=20, contamination=pct*0.01)
        # fit_predict labels detected outliers as -1.
        df_modified['lof_detection'] = lof.fit_predict(X_modified_scaled)
        # Outlier strength: a smaller number indicates a stronger outlier.
        df_modified['lof_score'] = lof.negative_outlier_factor_

        is_flagged = df_modified['lof_detection'] == -1
        is_perturbed = df_modified['is_perturbed'] == 1

        # Flagged rows ordered by ascending lof_score (strongest outlier first),
        # kept as index labels so rankings can be compared later.
        ranked_indices = (
            df_modified[is_flagged]
            .sort_values('lof_score', ascending=True)
            .index
            .tolist()
        )

        # Plain Python ints so the division and rounding below use built-in
        # float semantics.
        perturbed = int(is_perturbed.sum())
        detected = int((is_perturbed & is_flagged).sum())

        # Share of perturbed rows that LOF flagged, in percent.
        detection = round((detected/perturbed)*100, 2)

        rows.append(dict(
            gamma=g,
            n_features=n,
            ranked_indices=ranked_indices,
            detection=detection,
            seed=seed,
        ))

parameters_df = pd.DataFrame(rows, columns=['gamma', 'n_features', 'ranked_indices','detection', 'seed'])

In [None]:
# Show the results table: one row per (gamma, n_features) combination.
parameters_df

In [None]:
# Keep the parameter combinations whose detection rate lies within
# target_detection +/- tolerance (both bounds inclusive).
target_detection = 97.00
tolerance = 1.00

lower_bound = target_detection - tolerance
upper_bound = target_detection + tolerance

selected = parameters_df[parameters_df['detection'].between(lower_bound, upper_bound)]
selected

In [None]:
# The two (gamma, n_features) configurations whose outlier rankings are compared.
g_1, n_1 = 3, 15
g_2, n_2 = 6, 5


def _ranked_indices_for(gamma, feats):
    """Return the `ranked_indices` list of the first row matching the given pair."""
    matches = (parameters_df['gamma'] == gamma) & (parameters_df['n_features'] == feats)
    return parameters_df.loc[matches, 'ranked_indices'].iloc[0]


r1 = _ranked_indices_for(g_1, n_1)
r2 = _ranked_indices_for(g_2, n_2)

In [None]:
# Record which two configurations were compared, then merge in whatever
# statistics hf.spearman_with_bootstrap returns for the two rankings.
spearman_stats = dict(g_1=g_1, n_1=n_1, g_2=g_2, n_2=n_2)
spearman_stats.update(hf.spearman_with_bootstrap(r1, r2))
spearman_stats

In [None]:
# Convert the stats dict into a one-row DataFrame for display and CSV export.
# The name `spearman_stats` is rebound from dict to DataFrame here, so the
# conversion is guarded: re-running this cell on its own would otherwise feed
# the already-built DataFrame back into pd.DataFrame([...]) instead of the dict.
if isinstance(spearman_stats, dict):
    spearman_stats = pd.DataFrame([spearman_stats])
spearman_stats

In [None]:
import os

In [None]:
# Append the current result row to the shared CSV; the header is written only
# when the file is first created.
filepath = "../../data/spearman_results.csv"

file_exists = os.path.exists(filepath)
spearman_stats.to_csv(
    filepath,
    mode='a' if file_exists else 'w',
    header=not file_exists,
    index=False,
)