In [1]:
import numpy as np
import pandas as pd
from utils import helper_functions as hf

In [2]:
# Development aid: re-import the already-loaded `hf` module
# (utils.helper_functions) so that edits made to its source file take effect
# without restarting the kernel.
import importlib
importlib.reload(hf)

<module 'utils.helper_functions' from 'C:\\Users\\Artur\\Desktop\\MastersThesisProject\\utils\\helper_functions.py'>

In [1]:
# Outlier percentages covered by the perturbed Iris files: 2, 4, ..., 50.
outlier_pcts = np.arange(2, 52, 2)

# Map each percentage to the DataFrame read from its CSV file.
dfs = {
    pct: pd.read_csv(
        f"../../data/perturbed_datasets/iris_2_to_50_pct/iris_{pct}.csv")
    for pct in outlier_pcts
}

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [11]:
# Experiment: for every perturbed dataset in `dfs`, tune a one-vs-rest
# logistic regression with a cross-validated grid search, then score the
# tuned model on a stratified hold-out split. One metrics record per dataset
# is collected in `metric_scores`.
metric_scores = []

# The pipeline and the hyper-parameter grid do not depend on the dataset, so
# they are built once instead of on every iteration. GridSearchCV clones the
# estimator it is given before fitting, so sharing this unfitted pipeline
# between iterations is safe.
# Keeping the scaler inside the pipeline lets the grid search perform the
# scaling itself as part of every fit.
pipe_steps = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic', OneVsRestClassifier(
        LogisticRegression(solver='saga', max_iter=5000, random_state=101))),
])

# Two sub-grids, because `l1_ratio` is only used with the elastic-net penalty.
# 2 * 10 + 1 * 10 * 10 = 120 candidates in total.
# NOTE(review): l1_ratio=1.0 is documented as equivalent to penalty='l1', so
# the last l1_ratio value repeats candidates already covered by the first
# sub-grid. Consider ending the range at 0.9 to save those fits.
param_grid = [
    {'logistic__estimator__penalty': ['l1', 'l2'],
     'logistic__estimator__C': np.logspace(-2, 2, 10)},
    {'logistic__estimator__penalty': ['elasticnet'],
     'logistic__estimator__l1_ratio': np.linspace(0.1, 1.0, 10),
     'logistic__estimator__C': np.logspace(-2, 2, 10)},
]

for pct, df in dfs.items():
    X = df.drop('target', axis=1)
    Y = df['target']

    # 40% of the rows are the final hold-out test set (test_size=0.4); the
    # split is stratified on the target and seeded for reproducibility.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.4, stratify=Y, random_state=101)

    # n_jobs=-1 allows using all available CPU cores.
    grid_model = GridSearchCV(
        pipe_steps, param_grid,
        cv=5, n_jobs=-1, verbose=1)

    grid_model.fit(X_train, Y_train)

    # `hf.evaluate_model` is a project helper; its result is used as a
    # dict-like record here and is tagged with the dataset's outlier
    # percentage before being stored.
    mets = hf.evaluate_model(grid_model, X_test, Y_test)
    mets['outlier_pct'] = pct
    metric_scores.append(mets)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 6

In [12]:
# One row per dataset, indexed by the outlier percentage it was built with.
ms = (
    pd.DataFrame(metric_scores)
    .set_index('outlier_pct')
)

In [None]:
import matplotlib.pyplot as plt

In [1]:
# Draw one line per metric against the outlier percentage on a single axes.
fig, ax = plt.subplots(figsize=(12, 8), dpi=100)

metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
for name in metric_names:
    ax.plot(ms.index, ms[name], label=name)

ax.set_xlabel('Percent of outliers introduced')
ax.set_ylabel('Metric value')
ax.set_title('Model performance depending on percentage of outliers introduced.')
ax.grid(True)
ax.legend()

plt.show()