In [97]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [98]:
# Load the breast-cancer dataset into a single DataFrame (features + label).
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Flip the 0/1 labels so that the malignant class is encoded as 1.
df['target'] = 1 - data.target

# Feature matrix / label vector split.
X = df.drop(columns='target')
Y = df['target']

In [99]:
# Project-local modules from the `utils` package, aliased as `ogf` and `hf`.
from utils import outliers_generation_functions as ogf
from utils import helper_functions as hf
import importlib
# Re-execute both modules so that edits made to their source files are
# picked up without restarting the kernel. The last reload() call is the
# cell's final expression, so its return value (the module) is displayed.
importlib.reload(ogf)
importlib.reload(hf)

<module 'utils.helper_functions' from 'C:\\Users\\Artur\\Desktop\\PythonProject\\utils\\helper_functions.py'>

In [100]:
# POST-PERTURB DATA SCALING #
from sklearn.preprocessing import StandardScaler
# PERCENTAGE OF NEW OUTLIERS DETECTED #
from sklearn.neighbors import LocalOutlierFactor

In [101]:
# Fixed settings shared by every run of the sweep.
seed, pct = 101, 25  # random seed; percentage of rows to perturb

# Parameter grid: every gamma is combined with every feature count.
gammas = [2, 3, 4, 6, 8, 10, 12]
n_features = [3, 5, 7, 9, 11, 12, 15]

In [102]:
# Sweep every (gamma, n_features) pair: perturb the data, score it with LOF on
# standardized features, and record how many perturbed rows LOF flags.
rows = []
for g in gammas:
    for n in n_features:
        df_modified, idx_modified = ogf.perturb_within_distribution(
            original_data=df,
            pct_to_perturb=pct,
            target_column='target',
            feats_to_perturb=n,
            gamma=g,
            random_seed=seed,
            decimal_places=6,
        )

        # Ground-truth marker: 1 for rows listed in idx_modified, -1 otherwise.
        df_modified['is_perturbed'] = -1
        df_modified.loc[idx_modified, 'is_perturbed'] = 1

        # Standardize the feature columns only (label and marker are excluded).
        feature_frame = df_modified.drop(columns=['is_perturbed', 'target'])
        features_scaled = StandardScaler().fit_transform(feature_frame)

        # fit_predict labels outliers as -1; the contamination argument is the
        # expected outlier share, set here to the perturbed percentage.
        lof = LocalOutlierFactor(contamination=pct*0.01, n_neighbors=20)
        df_modified['lof_detection'] = lof.fit_predict(features_scaled)
        # A smaller (more negative) score indicates a stronger outlier.
        df_modified['lof_score'] = lof.negative_outlier_factor_

        flagged = df_modified['lof_detection'] == -1
        was_perturbed = df_modified['is_perturbed'] == 1

        # Row labels of the flagged rows, strongest outlier first; kept so the
        # rankings of different settings can be correlated later.
        ranked_indices = (
            df_modified[flagged]
            .sort_values('lof_score', ascending=True)
            .index
            .tolist()
        )

        # Percentage of perturbed rows that LOF flagged.
        perturbed = len(df_modified[was_perturbed])
        detected = len(df_modified[was_perturbed & flagged])
        detection = round((detected/perturbed)*100, 2)

        rows.append(dict(
            gamma=g,
            n_features=n,
            ranked_indices=ranked_indices,
            detection=detection,
            seed=seed,
        ))

parameters_df = pd.DataFrame(
    rows,
    columns=['gamma', 'n_features', 'ranked_indices', 'detection', 'seed'],
)

In [103]:
# Display the sweep results: one row per (gamma, n_features) combination.
parameters_df

Unnamed: 0,gamma,n_features,ranked_indices,detection,seed
0,2,3,"[212, 174, 461, 152, 360, 213, 387, 317, 192, ...",64.08,101
1,2,5,"[398, 212, 248, 447, 550, 185, 461, 359, 281, ...",79.58,101
2,2,7,"[212, 174, 447, 410, 165, 380, 453, 297, 360, ...",84.51,101
3,2,9,"[473, 185, 212, 143, 211, 443, 325, 293, 74, 4...",88.73,101
4,2,11,"[424, 278, 224, 166, 204, 434, 359, 305, 307, ...",92.25,101
5,2,12,"[224, 104, 307, 169, 212, 217, 493, 442, 135, ...",92.96,101
6,2,15,"[522, 90, 93, 211, 212, 325, 447, 407, 296, 46...",96.48,101
7,3,3,"[174, 360, 317, 387, 212, 309, 132, 170, 432, ...",78.87,101
8,3,5,"[447, 398, 248, 550, 185, 452, 55, 359, 212, 2...",88.03,101
9,3,7,"[174, 380, 360, 453, 348, 410, 297, 447, 298, ...",92.96,101


In [104]:
# Keep only the sweep rows whose detection rate lies within
# target_detection +/- tolerance (both bounds inclusive).
target_detection = 97.00
tolerance = 1.00

lower_bound = target_detection - tolerance
upper_bound = target_detection + tolerance
selected = parameters_df[parameters_df['detection'].between(lower_bound, upper_bound)]
selected

Unnamed: 0,gamma,n_features,ranked_indices,detection,seed
6,2,15,"[522, 90, 93, 211, 212, 325, 447, 407, 296, 46...",96.48,101
11,3,11,"[278, 424, 434, 224, 166, 550, 307, 143, 217, ...",96.48,101
12,3,12,"[224, 307, 104, 217, 493, 442, 166, 135, 384, ...",96.48,101
13,3,15,"[522, 93, 90, 447, 322, 407, 296, 325, 7, 293,...",97.18,101
17,4,9,"[211, 293, 473, 512, 325, 522, 185, 305, 74, 3...",97.18,101
19,4,12,"[224, 104, 307, 442, 135, 217, 493, 166, 348, ...",97.89,101
22,6,5,"[447, 248, 550, 398, 452, 359, 211, 317, 463, ...",97.18,101
35,10,3,"[317, 360, 202, 170, 174, 536, 35, 132, 165, 3...",96.48,101
42,12,3,"[317, 360, 202, 170, 536, 132, 309, 174, 165, ...",97.18,101


In [105]:
# The two parameter settings whose outlier rankings will be compared.
g_1, n_1 = 3, 15
g_2, n_2 = 6, 5


def _ranking_for(gamma, n_feats):
    """Return the `ranked_indices` list stored for one (gamma, n_features) row.

    Takes the first matching row of `parameters_df`; `.iloc[0]` raises
    IndexError when no row matches.
    """
    match = (parameters_df['gamma'] == gamma) & (parameters_df['n_features'] == n_feats)
    return parameters_df.loc[match, 'ranked_indices'].iloc[0]


r1 = _ranking_for(g_1, n_1)
r2 = _ranking_for(g_2, n_2)

In [106]:
# Record which two settings were compared, then merge in whatever mapping
# the helper returns for the two rankings.
# NOTE(review): r1 and r2 are lists of row labels ordered by LOF score;
# confirm that spearman_with_bootstrap aligns them by label rather than
# correlating the raw label values position by position.
spearman_stats = dict(g_1=g_1, n_1=n_1, g_2=g_2, n_2=n_2)
spearman_stats.update(hf.spearman_with_bootstrap(r1, r2))
spearman_stats

{'g_1': 3,
 'n_1': 15,
 'g_2': 6,
 'n_2': 5,
 'rho': 0.08383854682925705,
 'pval': 0.3212126525679271,
 'n': 142,
 'ci': (-0.09408901652883153, 0.2572036994949348)}

In [107]:
# Turn the stats dict into a one-row DataFrame so it can be written to CSV.
# The name is rebound from dict to DataFrame, so the conversion is guarded:
# re-running this cell on an already converted value is a no-op instead of
# wrapping a DataFrame inside another DataFrame.
if isinstance(spearman_stats, dict):
    spearman_stats = pd.DataFrame([spearman_stats])
spearman_stats

Unnamed: 0,g_1,n_1,g_2,n_2,rho,pval,n,ci
0,3,15,6,5,0.083839,0.321213,142,"(-0.09408901652883153, 0.2572036994949348)"


In [108]:
import os

In [109]:
# Append this comparison to the shared results CSV, creating it on first use.
# Note: the file is opened in append mode, so every run of this cell adds
# another row, even for a setting pair that was already saved.
filepath = "../../data/spearman_results.csv"

# Write the header only when the file is missing or empty, so an empty
# placeholder file still gets column names and appended rows never repeat them.
write_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
spearman_stats.to_csv(filepath, mode='a', header=write_header, index=False)