In [40]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [41]:
# Load the breast-cancer dataset shipped with scikit-learn and assemble one frame:
# a column per feature plus the class label stored under 'target'.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Separate the label vector from the feature matrix.
Y = df['target']
X = df.drop(columns='target')

In [42]:
from sklearn.preprocessing import StandardScaler

In [43]:
# Standardise the features before PCA: learn the per-column statistics first,
# then apply them to the same data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [44]:
from sklearn.decomposition import PCA

In [45]:
# Project the standardised features onto their first principal component only.
# random_state is pinned because, depending on the scikit-learn version, the default
# svd_solver='auto' can select the randomized solver for data of this size, which would
# let the loadings (and the feature ranking built on them) vary slightly between runs.
# 101 matches the seed used for the perturbation step in this notebook.
pca_model = PCA(n_components=1, random_state=101)
pca_results = pca_model.fit_transform(X_scaled)

In [46]:
# Loadings of the first principal component (PC1) extracted from the 30 features,
# laid out as a single row labelled by the original feature names.
df_components = pd.DataFrame(pca_model.components_, columns=X.columns, index=['PC1'])

df_components

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
PC1,0.218902,0.103725,0.227537,0.220995,0.14259,0.239285,0.2584,0.260854,0.138167,0.064363,...,0.227997,0.104469,0.23664,0.224871,0.127953,0.210096,0.228768,0.250886,0.122905,0.131784


In [47]:
# Rank the features by the magnitude of their PC1 loading and keep the 15 largest
# (50% of the 30 features).
pc1_abs_loadings = df_components.loc['PC1'].abs()
top_features = pc1_abs_loadings.sort_values(ascending=False)

top_15 = top_features.head(15)
top_feature_names = top_15.index.tolist()
print(top_15)

mean concave points     0.260854
mean concavity          0.258400
worst concave points    0.250886
mean compactness        0.239285
worst perimeter         0.236640
worst concavity         0.228768
worst radius            0.227997
mean perimeter          0.227537
worst area              0.224871
mean area               0.220995
mean radius             0.218902
perimeter error         0.211326
worst compactness       0.210096
radius error            0.205979
area error              0.202870
Name: PC1, dtype: float64


In [48]:
# Show the selected feature names (the list later passed as feats_to_perturb).
top_feature_names

['mean concave points',
 'mean concavity',
 'worst concave points',
 'mean compactness',
 'worst perimeter',
 'worst concavity',
 'worst radius',
 'mean perimeter',
 'worst area',
 'mean area',
 'mean radius',
 'perimeter error',
 'worst compactness',
 'radius error',
 'area error']

In [49]:
import importlib

from utils import outliers_generation_functions as ogf

# Re-import the helper module so that edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(ogf)

<module 'utils.outliers_generation_functions' from 'C:\\Users\\Artur\\Desktop\\PythonProject\\utils\\outliers_generation_functions.py'>

In [50]:
###########################################################
# PERTURB THE 15 MOST IMPORTANT FEATURES ACCORDING TO PC1 #
###########################################################

In [51]:
# Sweep the perturbation percentage over the integers 1..50.
percentages = np.arange(1, 51)

# Arguments shared by every run of the sweep; only pct_to_perturb changes.
# NOTE(review): with save=True the helper presumably writes each result under
# directory_name -- confirm against utils.outliers_generation_functions.
perturb_settings = dict(
    original_data=df,
    target_column='target',
    feats_to_perturb=top_feature_names,
    gamma=12.0,
    random_seed=101,
    decimal_places=6,
    save=True,
    directory_name='BCW_1to50pct_15of30_g12_pca',
    key_word='perturbed',
)

for p in percentages:
    # df_out is overwritten on each pass, so only the last run's return value is kept.
    df_out = ogf.perturb_within_distribution(pct_to_perturb=p, **perturb_settings)