In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

# Build the feature frame and attach the label column in one step.
# The raw labels are inverted (1 - target) so that the malignant class is marked as 1.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    target=1 - data.target
)

In [None]:
# Name recorded in the `model` column of every result row.
model_name = 'forest'

# When True, the metric summary is appended to data/features_variation_results.csv.
save = True

# Metrics to plot.
metrics = ['f1_score', 'balanced_accuracy', 'recall']

# Constant parameters; together they select the directory of input datasets.
g = 2.0
p = 25

# Feature counts 1..30. Each value is used both as the key in the dictionary
# of DataFrames and to pick the corresponding dataset file.
n_features = np.arange(1, 31)

In [None]:
# Directory holding the perturbed datasets for the configured gamma (g) and
# percentage (p); it is the same for every feature count, so build it once.
perturbed_dir = f"../../data/perturbed_datasets/F[Var]_G[{g}]_P[{p}]"

# Key 0 is the original dataset; key f is the dataset read for feature count f.
dfs = {0: df}
for f in n_features:
    dfs[f] = pd.read_csv(f"{perturbed_dir}/features_P[{p}]_F[{f}]_G[{g}].csv")

In [None]:
# Sanity check: preview only the first rows of one loaded dataset instead of
# rendering the whole frame.
dfs[15].head()

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, PredefinedSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime

In [None]:
# Project-local helper module; this notebook calls hf.evaluate_model and
# hf.plot_metrics from it.
from utils import helper_functions as hf
import importlib
# Re-import the helper module so that edits made to it after the first import
# are picked up without restarting the kernel.
importlib.reload(hf)

In [None]:
# Nested evaluation of the random-forest pipeline on every dataset in `dfs`.
# Per dataset: a 5-fold stratified outer split into U (train) and T (test).
# Inside each fold, U is split once into X (train) and V (validation) for the
# hyper-parameter search; the best configuration is refit on all of U and then
# scored on T. Fold scores are averaged into one summary row per dataset.
seed = 101
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# The search grid is identical for every dataset and fold, so build it once.
param_grid = {
    'forest__n_estimators': [25, 50, 75, 100, 125],
    'forest__criterion': ['gini', 'entropy'],
    'forest__max_features': ['sqrt', 3, 7],
    'forest__bootstrap': [True, False]
}

metric_scores = []
# The loop variable is intentionally NOT named `df`: rebinding `df` here would
# overwrite the original dataset, so re-running the cell that builds `dfs`
# would silently put a perturbed frame under key 0.
for f, dataset in dfs.items():

    X = dataset.drop('target', axis=1)
    Y = dataset['target']

    folds_metric_scores = []
    # Outer U and T split: each iteration is one outer fold.
    for u_idx, t_idx in outer_cv.split(X, Y):
        X_u, Y_u = X.iloc[u_idx], Y.iloc[u_idx]
        X_t, Y_t = X.iloc[t_idx], Y.iloc[t_idx]

        # Inner split of U into training (X) and validation (V) parts.
        (X_x, X_v, Y_x, Y_v) = train_test_split(
            X_u, Y_u, test_size=0.2, stratify=Y_u, random_state=seed)

        # PredefinedSplit marking: -1 = always in training, 0 = validation fold.
        # The order must match the concatenation order used for X_for_fit below.
        v_fold = np.concatenate([np.full(len(X_x), -1, dtype=int),
                                 np.zeros(len(X_v), dtype=int)])

        # Single predefined training/validation split.
        ps = PredefinedSplit(v_fold)

        # GridSearchCV takes one (X, Y) pair; rows are ordered training first,
        # validation second, matching v_fold.
        X_for_fit = pd.concat([X_x, X_v], axis=0)
        Y_for_fit = pd.concat([Y_x, Y_v], axis=0)

        # Scaling is part of the pipeline, so the scaler is fitted only on the
        # rows the search trains on.
        pipe_steps = Pipeline([
            ('scaler', StandardScaler()),
            ('forest', RandomForestClassifier(random_state=seed))])

        # refit=True retrains the best configuration on everything passed to
        # fit(), i.e. the whole outer training set U (X + V).
        grid_model = GridSearchCV(pipe_steps, param_grid,
                                  cv=ps, scoring='balanced_accuracy',
                                  n_jobs=-1, refit=True)
        grid_model.fit(X_for_fit, Y_for_fit)
        best_classifier = grid_model.best_estimator_

        # Score the refitted model on the held-out outer test fold T.
        # presumably evaluate_model returns a mapping of metric name -> value;
        # verify against utils.helper_functions.
        this_fold_metrics = hf.evaluate_model(best_classifier, X_t, Y_t)
        folds_metric_scores.append(this_fold_metrics)

    fms = pd.DataFrame(folds_metric_scores)

    # One summary row per dataset: run configuration plus the mean of every
    # metric column across the outer folds.
    pct_summary = {
        'model': model_name,
        'pct': p,
        'gamma': g,
        'n_features': f,
        **fms.mean(axis=0).to_dict(),
        'timestamp': datetime.now().strftime("%d/%m/%Y")
    }

    metric_scores.append(pct_summary)

In [None]:
# Collect the per-dataset summary rows into a single results table.
ms = pd.DataFrame(metric_scores)

# Preview the first five rows (the default for head()).
ms.head()

In [None]:
# Plot the selected metrics from the results table; adjust the arguments below
# to change the plot.
# NOTE(review): the first Polish label reads "gain value", while the column
# passed to the plot is 'n_features' — confirm the label is intended here.
hf.plot_metrics(
    ms,
    metrics,
    'n_features',
    1,
    'Wartość wzmocnienia',
    'Wartość metryki',
)

In [None]:
import os

# Persist the results table only when saving is enabled in the config cell.
if save:
    filepath = "../../data/features_variation_results.csv"

    # The first write creates the file with a header row; later runs append
    # their rows below the existing content without repeating the header.
    already_saved = os.path.exists(filepath)
    ms.to_csv(
        filepath,
        mode='a' if already_saved else 'w',
        header=not already_saved,
        index=False,
    )