In [1]:
import os
import sys
import warnings

# Silence every warning for the rest of the session. This keeps cell output short,
# but it also hides deprecation notices raised by pandas / scikit-learn.
warnings.filterwarnings('ignore')

# Location of the local `jenga` checkout. Set the JENGA_PATH environment variable to
# point somewhere else; the fallback is the path the notebook was written with.
JENGA_PATH = os.environ.get('JENGA_PATH', '/home/rupali/Documents/Master Thesis/jenga')

# Only append once so that re-running this cell does not grow sys.path.
if JENGA_PATH not in sys.path:
    sys.path.append(JENGA_PATH)

## Dataset

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [3]:
# Load the "parkinsons" dataset through jenga's Dataset wrapper.
dataset = Dataset("parkinsons")

# Copy the wrapper's attributes into notebook-level names.
all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

# Column-name collections; later cells test membership in these to choose per-column metrics.
categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Dataset: parkinsons
Found 0 categorical and 22 numeric features 



### Get training and test sets

In [4]:
# Split the dataset into train/test features and labels.
# NOTE(review): 0.3 is presumably the test-set fraction — confirm against Dataset.get_train_test_data.
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

## Model

In [5]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent.
# NOTE(review): newer scikit-learn releases renamed loss='log' to 'log_loss' — confirm the installed version still accepts 'log'.
# NOTE(review): no random_state is passed, so repeated runs may give different fits.
learner = SGDClassifier(loss='log')
# Hyper-parameter grid: 3 x 3 x 4 = 36 candidates (matches the "36 candidates" line in the fit log).
# The 'learner__' prefix presumably addresses a pipeline step named 'learner' — confirm in PipelinePerformancePrediction.
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

## Corruptions

In [6]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

# Corruption classes handed to ppp.get_corrupted.
# NOTE(review): this dataset reports 0 categorical columns, and the run log shows
# "CategoricalShift implemented only for categorical variables" — confirm it should stay in the list.
corruptions = [MissingValues, SwappedValues, CategoricalShift, Scaling, GaussianNoise]
# Passed to get_corrupted; the run log shows it as 'fraction': 0.5 on every perturbation.
fraction = 0.5
# Used both as the outer loop count and as an argument to get_corrupted.
num_repetitions = 5

In [12]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

# Build the pipeline-performance predictor from the split, the column lists, the learner and its grid, then fit it.
ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrupted data
# NOTE(review): each iteration rebinds the same four names, so only the last iteration's
# result is kept; num_repetitions is also passed into get_corrupted — confirm the outer loop is intended.
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Generating corrupted training data on 59 rows... 

	perturbation: MissingValues: {'column': 'V22', 'fraction': 0.5, 'sampling': 'MCAR', 'na_value': nan}
	perturbation: SwappedValues: {'column': 'V13', 'fraction': 0.5, 'sampling': 'MAR', 'swap_with': None}
	perturbation: CategoricalShift: {'column': 'V4', 'fraction': 0.5, 'sampling': 'MCAR'}
CategoricalShift implemented only for categorical variables
	perturbation: Scaling: {'column': 'V18', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: GaussianNoise: {'column': 'V9', 'fraction': 0.5, 'sampling': 'MAR'}

Generating corrupted training data on 59 rows... 

	perturbation: MissingValues: {'column': 'V13', 'fraction': 0.5, 'sampling': 'MNAR', 'na_value': nan}
	perturbation: SwappedValues: {'column': 'V17', 'fraction': 0.5, 'sampling': 'MCAR', 'swap_with': None}
	perturbation: CategoricalShift: {'column': 'V22', 'fraction': 0.5, 'sampling': 'MAR'}
CategoricalShift implemented

## Cleaning

In [13]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection#, AutoGluonOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation#, AutoGluonImputation

# (outlier detector class, imputer class) pairs passed to Clean.
# The AutoGluon-based combinations are commented out, matching the commented-out imports.
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    # (PyODKNNOutlierDetection, AutoGluonImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    # (PyODIsolationForestOutlierDetection, AutoGluonImputation),
    # (AutoGluonOutlierDetection, MeanModeImputation)
    # (AutoGluonOutlierDetection, AutoGluonImputation)
]

In [14]:
from jenga.cleaning.clean import Clean

# Thresholds passed positionally to Clean below.
categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

# Construct the Clean object, then call it; the call returns the frames and scores used by the cells below.
clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)


Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.5987179487179487, 'classification_report': {'1': {'precision': 0.42857142857142855, 'recall': 0.15, 'f1-score': 0.2222222222222222, 'support': 20}, '2': {'precision': 0.6730769230769231, 'recall': 0.8974358974358975, 'f1-score': 0.7692307692307692, 'support': 39}, 'accuracy': 0.6440677966101694, 'macro avg': {'precision': 0.5508241758241759, 'recall': 0.5237179487179487, 'f1-score': 0.4957264957264957, 'support': 59}, 'weighted avg': {'precision': 0.5901937046004844, 'recall': 0.6440677966101694, 'f1-score': 0.583804143126177, 'support': 59}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.30847457627118646, 'Recall': 0.6, 'F1-score': 0.3619301711201761, 'Accuracy': 0.4169491525423729}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 0.9416702953595013}
Cle

## Outlier and Imputation scores

In [None]:
# Display the first frame returned by clean(...); later cells read its "<col>_outlier" columns.
df_outliers

In [None]:
# Display the cleaned frame returned by clean(...); later cells compare it column-wise with test_data.
df_cleaned

In [None]:
from sklearn.metrics import mean_squared_error

# MSE between original and cleaned values, one entry per perturbed numeric column.
mse = [
    mean_squared_error(test_data[col], df_cleaned[col])
    for col in cols_perturbed
    if col in numerical_columns
]

print(mse)
np.mean(mse)

In [None]:
from sklearn.metrics import classification_report
# "savings_status" is not a column of every dataset (the parkinsons run above only logs
# V* columns), so check for it first instead of failing with a KeyError.
if "savings_status" in test_data.columns and "savings_status" in df_cleaned.columns:
    print(classification_report(test_data["savings_status"], df_cleaned["savings_status"], output_dict=True))
else:
    print('Column "savings_status" not present; skipping classification report')

In [None]:
# Mean per-label F1 for every categorical column, then the mean over columns.
f1socres = []
for col in categorical_columns:
  print(col)
  # Build the report here: `classif_reports` is only created in a later cell and only
  # holds perturbed columns, so indexing it fails on a fresh top-to-bottom run.
  report = classification_report(test_data[col], df_cleaned[col], output_dict=True)
  # Keep the per-label entries; drop the aggregate rows of the report.
  labels = [k for k in report if k not in ['accuracy', 'macro avg', 'weighted avg']]
  print(labels)

  f1s = [report[label]['f1-score'] for label in labels]
  print(f1s)

  print(np.mean(f1s))
  f1socres.append(np.mean(f1s))

print(f1socres)
np.mean(f1socres)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error

# Per perturbed column: classification report + accuracy for categorical columns, MSE otherwise.
classif_reports, acc_scores, mse = {}, [], []

for col in cols_perturbed:
  if col not in categorical_columns:
    mse.append(mean_squared_error(test_data[col], df_cleaned[col]))
    continue

  classif_reports[col] = classification_report(test_data[col], df_cleaned[col], output_dict=True)
  acc_scores.append(accuracy_score(test_data[col], df_cleaned[col]))

In [None]:
# 'checking_status' / '<0' only exist for some datasets; the chained .get() calls yield
# None instead of raising KeyError when either key is absent.
classif_reports.get('checking_status', {}).get('<0', {}).get('f1-score')

In [None]:
# Average the per-label precision / recall / F1 of each perturbed categorical column,
# then report the mean over columns together with accuracy and MSE.
f1socres, recallscores, precisionscores = [], [], []

for col in cols_perturbed:
  if col not in categorical_columns:
    continue

  report = classif_reports[col]
  # Per-label rows only; the aggregate rows of the report are excluded.
  per_label = [report[k] for k in report if k not in ['accuracy', 'macro avg', 'weighted avg']]

  f1socres.append(np.mean([row['f1-score'] for row in per_label]))
  recallscores.append(np.mean([row['recall'] for row in per_label]))
  precisionscores.append(np.mean([row['precision'] for row in per_label]))

print(f"Mean f1-score: {np.mean(f1socres)}")
print(f"Mean Recall: {np.mean(recallscores)}")
print(f"Mean Precision: {np.mean(precisionscores)}")
print(f"Mean Accuracy: {np.mean(acc_scores)}\n")

print(f"Mean MSE: {np.mean(mse)}")

In [None]:
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error

def imputation_scores(df_test, df_cleaned, cols_perturbed, categorical_columns):
  """Score how closely the cleaned frame matches the original test frame.

  For each column in ``cols_perturbed``:
    * if it is listed in ``categorical_columns``, a classification report is built and
      the per-label precision / recall / F1 are averaged, and the accuracy is recorded;
    * otherwise the mean squared error between the two columns is recorded.

  Returns a dict with the keys "Precision", "Recall", "F1-score", "Accuracy" and
  "Mean Squared Error", each the ``np.mean`` over the per-column values.
  """
  aggregate_rows = ['accuracy', 'macro avg', 'weighted avg']

  per_column_reports = {}
  precision_means, recall_means, f1_means = [], [], []
  accuracies = []
  squared_errors = []

  for column in cols_perturbed:
    truth, cleaned = df_test[column], df_cleaned[column]

    if column not in categorical_columns:
      squared_errors.append(mean_squared_error(truth, cleaned))
      continue

    report = classification_report(truth, cleaned, output_dict=True)
    per_column_reports[column] = report

    # Per-label rows only; the aggregate rows of the report are excluded.
    label_rows = [report[key] for key in report if key not in aggregate_rows]

    f1_means.append(np.mean([row['f1-score'] for row in label_rows]))
    recall_means.append(np.mean([row['recall'] for row in label_rows]))
    precision_means.append(np.mean([row['precision'] for row in label_rows]))

    accuracies.append(accuracy_score(truth, cleaned))

  return {
      "Precision": np.mean(precision_means),
      "Recall": np.mean(recall_means),
      "F1-score": np.mean(f1_means),
      "Accuracy": np.mean(accuracies),
      "Mean Squared Error": np.mean(squared_errors)
  }

In [None]:
# Score the cleaned frame against the original test data for the perturbed columns.
imputation_scores(test_data, df_cleaned, cols_perturbed, categorical_columns)

In [None]:
# Work on a copy so the manual "<col>_outlier" flag columns added below do not modify test_data.
test_data_out = test_data.copy()
test_data_out

In [None]:
## Compare the corrupted and original column values for manual outlier detection: add a binary flag column and compare it with the "<col>_outlier" columns of df_outliers

In [None]:
# "residence_since" only exists for some datasets; .get() returns None instead of raising KeyError.
test_data.get("residence_since")

In [None]:
# "residence_since" only exists for some datasets; .get() returns None instead of raising KeyError.
df_corrupted.get("residence_since")

In [None]:
# filter(items=...) keeps only the listed columns that actually exist, so a dataset
# without "residence_since" shows an empty selection instead of raising KeyError.
df_outliers.filter(items=["residence_since", "residence_since_outlier"])

In [None]:
# A row counts as a manual ("true") outlier when its value differs between the original
# and the corrupted frame. "residence_since" only exists for some datasets; when it is
# missing, every row is treated as unchanged so the cells below still run.
if "residence_since" in test_data.columns and "residence_since" in df_corrupted.columns:
    outiers_man = np.equal(test_data["residence_since"], df_corrupted["residence_since"])
else:
    outiers_man = pd.Series(True, index=test_data.index)
outiers_man_ind = outiers_man.index[outiers_man == False]
print(outiers_man_ind)

# Index.difference replaces the former `.loc[set(...)]` lookup: newer pandas versions
# reject a set as an indexer.
non_outliers_man_ind = test_data_out.index.difference(outiers_man_ind)
print(non_outliers_man_ind)

In [None]:
# Write the manual flag column in place on test_data_out: 1 for rows whose value changed, 0 for the rest.
test_data_out.loc[outiers_man_ind, "residence_since_outlier"] = 1 ## outliers
test_data_out.loc[non_outliers_man_ind, "residence_since_outlier"] = 0 ## not outliers

In [None]:
# Side-by-side view: corrupted values, original values with the manual flag, detector flag.
# .get() / .filter(items=...) tolerate missing columns (None / only the existing columns)
# instead of raising KeyError; pd.concat silently drops None entries.
pd.concat(
    [
        df_corrupted.get("residence_since"),
        test_data_out.filter(items=["residence_since", "residence_since_outlier"]),
        df_outliers.get("residence_since_outlier"),
    ],
    axis=1,
)

In [None]:
from sklearn.metrics import classification_report
# Compare the manual flags with the detector's flags. The flag column is only present
# in df_outliers for datasets that contain "residence_since", so check before indexing.
if "residence_since_outlier" in test_data_out.columns and "residence_since_outlier" in df_outliers.columns:
    print(classification_report(test_data_out["residence_since_outlier"], df_outliers["residence_since_outlier"]))
else:
    print('Column "residence_since_outlier" not present; skipping classification report')

In [None]:
from sklearn.metrics import classification_report

# Fresh copy so the "<col>_outlier" ground-truth columns do not touch test_data.
test_data_out = test_data.copy()

classif_reports = {}
acc_scores = []
f1socres = []
recallscores = []
precisionscores = []

for col in cols_perturbed:
  print(col)
  # A row is a manual ("true") outlier when its value differs between original and corrupted data.
  outiers_man = np.equal(test_data[col], df_corrupted[col])
  outiers_man_ind = outiers_man.index[outiers_man == False]
  print(outiers_man_ind)

  # Index.difference replaces the former `.loc[set(...)]` lookup: newer pandas versions
  # reject a set as an indexer.
  non_outliers_man_ind = test_data_out.index.difference(outiers_man_ind)
  print(non_outliers_man_ind)

  test_data_out.loc[outiers_man_ind, col + "_outlier"] = 1 ## outliers
  test_data_out.loc[non_outliers_man_ind, col + "_outlier"] = 0 ## not outliers

  # Text report for reading, dict report for aggregation.
  print(classification_report(test_data_out[col + "_outlier"], df_outliers[col + "_outlier"]))
  classif_reports[col] = classification_report(test_data_out[col + "_outlier"], df_outliers[col + "_outlier"], output_dict=True)

  # Per-label entries only; the aggregate rows of the report are excluded.
  labels = [k for k in classif_reports[col] if k not in ['accuracy', 'macro avg', 'weighted avg']]

  f1s = []
  res = []
  pres = []

  for label in labels:
    f1s.append(classif_reports[col][label]['f1-score'])
    res.append(classif_reports[col][label]['recall'])
    pres.append(classif_reports[col][label]['precision'])
  
  f1socres.append(np.mean(f1s))
  recallscores.append(np.mean(res))
  precisionscores.append(np.mean(pres))

  acc_scores.append(accuracy_score(test_data_out[col + "_outlier"], df_outliers[col + "_outlier"]))

print(f"Mean f1-score: {np.mean(f1socres)}")
print(f"Mean Recall: {np.mean(recallscores)}")
print(f"Mean Precision: {np.mean(precisionscores)}")
print(f"Mean Accuracy: {np.mean(acc_scores)}\n")


In [None]:
# Display the test-data copy, now carrying the "<col>_outlier" flag columns added by the loop above.
test_data_out

In [None]:
def outlier_detection_scores(df_test, df_corrupted, df_outliers, cols_perturbed):
  """Score detected outlier flags against flags derived from the corruption itself.

  For each column in ``cols_perturbed`` a ground-truth flag is built on a copy of
  ``df_test``: 1 where the value differs between ``df_test`` and ``df_corrupted``,
  0 elsewhere. That flag is compared with ``df_outliers[col + "_outlier"]`` via a
  classification report (per-label precision / recall / F1 are averaged) and an
  accuracy score.

  Parameters:
    df_test: original, uncorrupted frame (not modified; a copy is used).
    df_corrupted: corrupted frame with the same columns.
    df_outliers: frame holding a ``"<col>_outlier"`` column per perturbed column.
    cols_perturbed: iterable of perturbed column names.

  Returns:
    dict with the keys "Precision", "Recall", "F1-score" and "Accuracy", each the
    ``np.mean`` over the per-column values.
  """
  df_test_out = df_test.copy()

  classif_reports = {}
  acc_scores = []
  f1socres = []
  recallscores = []
  precisionscores = []

  for col in cols_perturbed:
    flag_col = col + "_outlier"

    outiers_man = np.equal(df_test[col], df_corrupted[col])
    outiers_man_ind = outiers_man.index[outiers_man == False]

    # Index.difference replaces the former `.loc[set(...)]` lookup: newer pandas
    # versions reject a set as an indexer.
    non_outliers_man_ind = df_test_out.index.difference(outiers_man_ind)

    df_test_out.loc[outiers_man_ind, flag_col] = 1 ## outliers
    df_test_out.loc[non_outliers_man_ind, flag_col] = 0 ## not outliers

    report = classification_report(df_test_out[flag_col], df_outliers[flag_col], output_dict=True)
    classif_reports[col] = report

    # Per-label entries only; the aggregate rows of the report are excluded.
    per_label = [report[k] for k in report if k not in ['accuracy', 'macro avg', 'weighted avg']]

    f1socres.append(np.mean([row['f1-score'] for row in per_label]))
    recallscores.append(np.mean([row['recall'] for row in per_label]))
    precisionscores.append(np.mean([row['precision'] for row in per_label]))

    acc_scores.append(accuracy_score(df_test_out[flag_col], df_outliers[flag_col]))

  od_scores_summ = {
      "Precision": np.mean(precisionscores),
      "Recall": np.mean(recallscores),
      "F1-score": np.mean(f1socres),
      "Accuracy": np.mean(acc_scores)
  }

  return od_scores_summ

In [None]:
# Score the detector's "<col>_outlier" flags against flags derived from test_data vs. df_corrupted.
outlier_detection_scores(test_data, df_corrupted, df_outliers, cols_perturbed)