This notebook contains the code for a number of naive cleaning techniques intended to improve the effectiveness of remaining-time prediction on polluted datasets.

# Setup

## Imports

In [None]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import time
import statistics
import pickle

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy.spatial import distance as scipy_distance

from gensim.parsing.preprocessing import remove_stopwords
import fasttext
import fasttext.util
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# init stemmer
lemmatizer = WordNetLemmatizer()

#fasttext.util.download_model('en', if_exists='ignore')  # English (long to import) (uncomment to import the first time)
nlp_model = fasttext.load_model('cc.en.300.bin')

#!pip install -q python-Levenshtein
import Levenshtein as lev

%matplotlib inline

In [None]:
## Takes a while to download, only run for SYNONYMOUS
#import fasttext
#import fasttext.util
#
#fasttext.util.download_model('en', if_exists='ignore') # English
#model = fasttext.load_model('cc.en.300.bin')
#TODO: try other models

## Globals

In [None]:
# Pollution scenarios and pollution ratios; both appear in the names of the prepared CSV files.
pollution_types = ["DISTORTED", "POLLUTED.NORND", "POLLUTED.RANDOM"]
percentages = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

# Intrinsic data-quality metrics (see DQ_assess) and classification scores (see conf_matrix).
dq = ["DISTINCT", "UNIQUENESS", "CONSTANCY"]
dq_scorings = ["accuracy", "precision", "recall", "f1"]

# Every reported metric, without and with the per-run wall-clock time.
metrics = [*dq, *dq_scorings]
metrics_time = [*metrics, "time"]

## Functions

In [None]:
def DQ_assess(df, metrics): # might be renamed to "profiling" in the future
  """
  Returns intrinsic data-quality (DQ) metrics for the labels stored in df.CCDOEV.

  Parameters
  ----------
  df : pandas.DataFrame
      Must have a `CCDOEV` column (the labels). A `NUMPRO` column is only
      needed when "COMPLEXITY" is requested (rows are grouped by it).
  metrics : collection of str
      Names of the metrics to compute. Recognised names:
      - "COUNT":       number of non-null labels
      - "DISTINCT":    number of distinct labels
      - "UNIQUENESS":  DISTINCT / number of rows
      - "CONSTANCY":   count of the most frequent label / COUNT
      - "COMPLEXITY":  mean number of rows per NUMPRO / number of distinct NUMPRO
      - "CONSISTENCY": mean absolute deviation of the label length from the
                       average label length
      Unknown names are ignored.

  Returns
  -------
  dict
      Maps each requested (and recognised) metric name to its value.
  """
  output = {}

  labels = df.CCDOEV

  distinct = labels.nunique()
  count = labels.count()
  rows = len(labels)

  # Each metric is computed only when requested, so a metric's prerequisites
  # (NUMPRO column, string-only labels, ...) are not needed by the others.
  if "COUNT" in metrics:
    output["COUNT"] = count
  if "DISTINCT" in metrics:
    output["DISTINCT"] = distinct
  if "UNIQUENESS" in metrics:
    output["UNIQUENESS"] = distinct / rows
  if "CONSTANCY" in metrics:
    output["CONSTANCY"] = max(labels.value_counts()) / count
  if "COMPLEXITY" in metrics:
    avg_events = df.groupby("NUMPRO").size().mean()
    traces = df.NUMPRO.nunique()
    output["COMPLEXITY"] = avg_events / traces
  if "CONSISTENCY" in metrics:
    lengths = labels.apply(len)
    # Absolute deviations: the signed deviations from the mean always sum to
    # (numerically) zero, which made this metric constant.
    output["CONSISTENCY"] = (lengths - lengths.mean()).abs().sum() / rows

  return output

In [None]:
# Lemmatizes every whitespace-separated token, re-joins the tokens with single spaces, then lowercases the result.
my_nlp_preprocess_2 = lambda s: ' '.join(lemmatizer.lemmatize(token) for token in s.split()).lower()

embedding_db = {}  # cache: text -> vector returned by nlp_model
def get_embedding(sw):
  """
  Returns the embedding of `sw`, which may be a single word or a sentence.

  Vectors are memoised in `embedding_db`. Text with more than one
  whitespace-separated token goes through nlp_model.get_sentence_vector,
  anything else through nlp_model.get_word_vector.
  """
  try:
    return embedding_db[sw]
  except KeyError:
    pass

  is_sentence = len(sw.split()) > 1
  vectorize = nlp_model.get_sentence_vector if is_sentence else nlp_model.get_word_vector
  vector = vectorize(sw)
  embedding_db[sw] = vector
  return vector

def get_antonyms(word):
  """
  Returns a set of antonym names for `word`, looked up in WordNet.

  For every lemma of every synset of `word` that has antonyms, the name of its
  first antonym is kept. The set is then widened with the names of all lemmas
  of all synsets of each of those antonyms.
  """
  direct = []
  for synset in wordnet.synsets(word):
    for lemma in synset.lemmas():
      opposites = lemma.antonyms()
      if opposites:
        direct.append(opposites[0].name())

  expanded = set(direct)
  for antonym in direct:
    expanded.update(lm.name() for syn in wordnet.synsets(antonym) for lm in syn.lemmas())

  return expanded

anto_db = {}  # cache: word -> list of its antonyms, as produced by get_antonyms
def my_distance(s1, s2):
  """
  Returns semantic distance between two strings s1 and s2, with penalty for antonyms.

  The result is the cosine distance between the embeddings of the two full
  strings, plus 0.25 for every distinct word of one string that appears among
  the antonyms of the other string's words (counted in both directions).
  """
  def collect_antonyms(words):
    # Antonyms of all `words`, filling the anto_db cache on the way.
    found = []
    for w in words:
      if w not in anto_db:
        anto_db[w] = list(get_antonyms(w))
      found += anto_db[w]
    return found

  words1, words2 = s1.split(), s2.split()
  antos1 = collect_antonyms(words1)
  antos2 = collect_antonyms(words2)

  clashes = len(set(words1) & set(antos2)) + len(set(words2) & set(antos1))
  penalty = 0.25*clashes #add a 0.25 penalty for each pair of antonyms found

  return penalty + scipy_distance.cosine(get_embedding(s1), get_embedding(s2))

In [None]:
def DBSCAN_labels(labels, distance="levenshtein", threshold=1, preprocessing=None, sem_dist=None):
  """
  Clusters labels by transitive closeness and returns the clustering.

  Two labels share a cluster when a chain of labels links them in which every
  consecutive distance is <= threshold.

  Parameters
  ----------
  labels : iterable of hashable labels (duplicates are ignored)
  distance : "levenshtein", "hamming" or "semantic"
  threshold : maximum distance for two labels to be considered neighbours
  preprocessing : callable applied once to each label before the semantic
      distance is computed (semantic mode only; default: identity)
  sem_dist : callable (s1, s2) -> distance on preprocessed labels (semantic
      mode only; default: cosine distance between get_embedding vectors)

  Returns
  -------
  dict
      cluster id (0..k-1) -> list of the distinct labels of that cluster, in
      order of first appearance in `labels`.

  Raises
  ------
  ValueError
      If `distance` is not one of the supported names.
  """
  known_distances = ("levenshtein", "hamming", "semantic")
  if distance not in known_distances:
    raise ValueError(f"Unknown distance {distance!r}; expected one of {known_distances}")

  unique_labels = list(dict.fromkeys(labels)) # de-duplicated, first-seen order
  distances_dict = {} # (s1, s2) -> distance, so no pair is computed twice

  if distance == "semantic":
    if preprocessing is None:
      preprocessing = lambda s: s

    if sem_dist is None:
      # Receives labels that are already preprocessed (see below), so it must
      # not preprocess them a second time.
      sem_dist = lambda s1, s2: scipy_distance.cosine(get_embedding(s1), get_embedding(s2))

    # Preprocess each label once, then precompute every ordered pair.
    prepared = {s: preprocessing(s) for s in unique_labels}
    distances_dict.update(
      ((s1, s2), sem_dist(prepared[s1], prepared[s2]))
      for s1 in unique_labels for s2 in unique_labels
    )

  def get_distance(s1, s2):
    """
    Returns the distance between s1 and s2, reusing a cached value when the
    pair (in either order) was seen before.
    """
    if (s1, s2) in distances_dict:
      return distances_dict[(s1, s2)]
    if (s2, s1) in distances_dict:
      return distances_dict[(s2, s1)]

    if distance == "levenshtein":
      dist = lev.distance(s1, s2)
    elif distance == "hamming": # Not used
      dist = round(scipy_distance.hamming(list(s1), list(s2)) * len(s1))
    else: # "semantic": every pair was precomputed above, a miss is unexpected
      raise KeyError((s1, s2))

    distances_dict[(s1, s2)] = dist
    return dist

  ids = {} # label -> cluster id
  k = 0
  not_clustered = unique_labels
  while not_clustered: # Stops when all labels belong to a cluster
    seed = not_clustered[0]
    # Claim the seed unconditionally: the loop then always makes progress,
    # even when the seed's distance to itself is above the threshold
    # (e.g. a distance function that returns NaN).
    ids[seed] = k
    frontier = [seed]
    while frontier:
      x = frontier.pop() # visiting order does not change the resulting cluster
      for y in not_clustered:
        if y not in ids and get_distance(x, y) <= threshold:
          ids[y] = k
          frontier.append(y) # neighbours of y must be explored too
    not_clustered = [s for s in not_clustered if s not in ids]
    k += 1 # next cluster

  # Deterministic member order, so ties broken downstream by list position
  # are reproducible between runs.
  clustering = {index: [] for index in range(k)}
  for s in unique_labels:
    clustering[ids[s]].append(s)
  return clustering

In [None]:
def cluster_most_common(clustering, frequencies):
  """
  Maps every label to the most frequent label of its own cluster.

  `clustering` maps cluster ids to lists of labels; `frequencies` maps each
  label to its count. When several labels share the highest count, the one
  listed first in the cluster wins.
  """
  representative_of = {}
  for members in clustering.values():
    # Stable descending sort: equal counts keep their order within the cluster.
    ranked = sorted(members, key=lambda activity: frequencies[activity], reverse=True)
    representative_of.update(dict.fromkeys(members, ranked[0]))

  return representative_of

In [None]:
def conf_matrix(clean, dirty):
  """
  Scores the labels `dirty` against the reference labels `clean`.

  Returns a 4-tuple (accuracy, precision, recall, f1), each rounded to 4
  decimals. Precision, recall and f1 use weighted averaging; precision and
  recall are called with zero_division=np.nan, f1 with its default setting.
  """
  weighted = {"average": "weighted"}
  nan_when_undefined = {**weighted, "zero_division": np.nan}

  raw_scores = (
    accuracy_score(clean, dirty),
    precision_score(clean, dirty, **nan_when_undefined),
    recall_score(clean, dirty, **nan_when_undefined),
    f1_score(clean, dirty, **weighted),
  )

  return tuple(round(score, 4) for score in raw_scores)

In [None]:
# TODO: split between cleaning and scoring
def cleaning(pollution_type, reference_labels, technique="DBSCAN", distance="levenshtein", threshold=0, dataset="BPIC11_f1", language="nl", limit=8, preprocessing=None, sem_dist=None):
  """
  Cleans the labels of every polluted training file and returns averaged scores.
  Also saves each cleaned dataset as CSV.

  For every percentage in the global `percentages` and every run i in
  range(limit), the file
  f"{directory_path}{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv"
  is read (`directory_path` is a notebook-level variable), cleaned and written
  to ./{dataset}/{dataset}/{dataset}_cleaned/.

  Parameters
  ----------
  pollution_type : str, used in the input and output file names
  reference_labels : clean labels the cleaned labels are scored against
  technique : "DBSCAN"      - cluster the labels (DBSCAN_labels) and replace each
                              label by the most frequent label of its cluster
              "DROP"        - replace labels rarer than threshold*len(labels)
                              with empty strings
              "DROP_DELETE" - delete the rows whose label is rare
  distance, preprocessing, sem_dist : forwarded to DBSCAN_labels; `distance`
      is also part of the output file name
  threshold : clustering distance threshold (DBSCAN) or minimum relative
      frequency (DROP / DROP_DELETE)
  dataset : dataset name, used in file names and in the output directory
  language : unused, kept for backward compatibility
  limit : number of runs per percentage

  Returns
  -------
  dict or None
      {percentage: {metric: mean over runs}} for the metrics in `metrics_time`.
      None for "DROP_DELETE": the number of rows changes, so the cleaned
      labels cannot be scored against `reference_labels`.

  Raises
  ------
  ValueError
      If `technique` is not one of the supported names.
  """
  known_techniques = ("DBSCAN", "DROP", "DROP_DELETE")
  if technique not in known_techniques:
    raise ValueError(f"Unknown technique {technique!r}; expected one of {known_techniques}")

  initial_time = time.time()

  output_dir = f"./{dataset}/{dataset}/{dataset}_cleaned"
  os.makedirs(output_dir, exist_ok=True) # to_csv does not create missing directories

  scores_matrix_cleaned = {}
  for percentage in percentages:
    scores_matrix_cleaned[percentage] = {}
    for i in range(limit):
      start_time = time.time()
      df = pd.read_csv(f"{directory_path}{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv")
      labels = df.CCDOEV
      freq = labels.value_counts() # We need labels frequencies to either identify the most common one in a cluster or rare labels
      min_count = threshold*len(labels)

      if technique == "DROP_DELETE": # Remove rows with rare labels
        # NOTE(review): strict ">" here, while DROP keeps labels with ">=" - confirm which is intended.
        df = df[df.CCDOEV.isin(freq[freq > min_count].index)].reset_index(drop=True)
        # The surviving rows keep their own labels. Writing the unfiltered
        # label column back would align on the re-numbered index and attach
        # labels to the wrong rows.
        cleaned_labels = df.CCDOEV
      else:
        if technique == "DBSCAN":
          clustering = DBSCAN_labels(labels.unique(), distance, threshold, preprocessing=preprocessing, sem_dist=sem_dist) # clustering
          mapping = cluster_most_common(clustering, freq.to_dict())
        else: # "DROP": replace rare labels with empty strings
          mapping = {s: (s if freq[s] >= min_count else "") for s in labels.unique()}

        cleaned_labels = labels.replace(mapping) # TO_CHECK: map changed to replace
        df.CCDOEV = cleaned_labels

      # Saving cleaned dataset
      df.to_csv(f"{output_dir}/{dataset}-TRAIN-{pollution_type}-{percentage}-{i}_CLEANED-{distance}-{threshold}.csv", index=False)
      # Example: ./BPIC11_f1/BPIC11_f1/BPIC11_f1_cleaned/BPIC11_f1-TRAIN-DISTORTED-0.1-1_CLEANED-levenshtein-1.csv

      scores = DQ_assess(df, dq)
      if technique != "DROP_DELETE": # row count changed, nothing to compare against
        scores |= dict(zip(dq_scorings, conf_matrix(reference_labels, cleaned_labels)))

      elapsed = time.time() - start_time
      scores["time"] = elapsed
      scores_matrix_cleaned[percentage][i] = scores

      print(f"{percentage}: {i+1}/{limit} done in {round(elapsed/60,3)} minutes")

  print("Total time:", (round((time.time() - initial_time)/60,3)), "minutes")

  if technique == "DROP_DELETE": # No scores for DROP_DELETE since the length of the dataset changes.
    return None

  return {
    percentage: {
      metric: np.mean([scores_matrix_cleaned[percentage][i][metric] for i in range(limit)])
      for metric in metrics_time
    }
    for percentage in percentages
  }

In [None]:
# NOTE(review): embedding_db and get_embedding are already defined, with the
# same body, in an earlier cell of this notebook; running this cell also
# resets the cache to empty. Consider keeping a single definition.
embedding_db = {}

def get_embedding(sw):
  """
  Returns the embedding of `sw` (a single word or a sentence), memoised in embedding_db.

  Text with more than one whitespace-separated token is embedded with
  nlp_model.get_sentence_vector, anything else with nlp_model.get_word_vector.
  """
  if sw not in embedding_db:
    if len(sw.split()) > 1: # sentence
      embedding_db[sw] = nlp_model.get_sentence_vector(sw)
    else: # single word
      embedding_db[sw] = nlp_model.get_word_vector(sw)
  return embedding_db[sw]

# NOTE(review): get_antonyms is already defined, with the same body, in an
# earlier cell of this notebook. Consider keeping a single definition.
def get_antonyms(word):
  """
  Looks up `word` in WordNet and returns a set of antonym names.

  The set holds the first antonym of every lemma of `word` that has one, plus
  the names of all lemmas of all synsets of each of those antonyms.
  """
  first_antonyms = [
    lemma.antonyms()[0].name()
    for synset in wordnet.synsets(word)
    for lemma in synset.lemmas()
    if lemma.antonyms()
  ]

  related = [
    lm.name()
    for anto in first_antonyms
    for syn in wordnet.synsets(anto)
    for lm in syn.lemmas()
  ]

  return set(first_antonyms + related)

# NOTE(review): anto_db and my_distance are already defined, with the same
# body, in an earlier cell of this notebook; running this cell also resets
# the cache to empty. Consider keeping a single definition.
anto_db = {}

def my_distance(s1, s2):
  """
  Returns semantic distance between two strings s1 and s2, with penalty for antonyms.

  Cosine distance between the embeddings of the full strings, plus 0.25 for
  each distinct word of one string found among the antonyms of the other
  string's words (both directions). Antonym lists are cached in anto_db.
  """
  tokens = (s1.split(), s2.split())
  antonyms = ([], [])

  for words, bucket in zip(tokens, antonyms):
    for word in words:
      if word not in anto_db:
        anto_db[word] = list(get_antonyms(word))
      bucket.extend(anto_db[word])

  # Words of s1 that are antonyms of s2's words, and the other way round.
  hits = len(set(tokens[0]).intersection(antonyms[1])) + len(set(tokens[1]).intersection(antonyms[0]))

  return 0.25*hits + scipy_distance.cosine(get_embedding(s1), get_embedding(s2))

In [None]:
from scipy.spatial import distance as scipy_distance
from gensim.parsing.preprocessing import remove_stopwords
import fasttext
import fasttext.util
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# init stemmer
# NOTE(review): porter_stemmer is not referenced anywhere in the visible part of this notebook.
porter_stemmer=PorterStemmer()

# NOTE(review): `lemmatizer` and `nlp_model` are already created in the imports
# cell at the top of the notebook; this cell repeats that setup and loads the
# model file a second time.
lemmatizer = WordNetLemmatizer()

# Fetches the English fastText model unless it is already present, then loads it.
fasttext.util.download_model('en', if_exists='ignore')  # English
nlp_model = fasttext.load_model('cc.en.300.bin')

def patterner(df):
    """
    Builds the sliding windows of three consecutive labels of df['CCDOEV'].

    Returns a list of 3-tuples (label[i], label[i+1], label[i+2]); the list is
    empty when the column holds fewer than three labels.
    """
    labels = df['CCDOEV'].values
    # zip stops at the shortest input, i.e. at the last complete window.
    return list(zip(labels, labels[1:], labels[2:]))

def patterns_distance(p1, p2, distance_matrix=None):
    """
    Calculates a semantic distance between two patterns.

    The distance is the sum of the position-wise label distances. A label
    pair found in `distance_matrix` (a dict keyed by (label1, label2)) uses
    the stored value; my_distance is only called for pairs that are missing.

    Parameters
    ----------
    p1, p2 : sequences of labels (expected to be 3-tuples)
    distance_matrix : optional dict of precomputed label distances

    Returns
    -------
    The summed distance (0 for empty patterns).
    """
    if distance_matrix is None:
        distance_matrix = {}

    dist = 0
    for label1, label2 in zip(p1, p2): # should always be 3 pairs
        pair = (label1, label2)
        # Explicit membership test: dict.get(key, my_distance(...)) would
        # evaluate my_distance on every call, even when the pair is stored.
        if pair in distance_matrix:
            dist += distance_matrix[pair]
        else:
            dist += my_distance(label1, label2)
    return dist

def rebuild_labels(patterns):
    """
    Rebuilds a flat list of labels from a list of patterns.

    The result is the first element of the first pattern, then the middle
    (index 1) element of every pattern, then the last element of the last
    pattern.
    """
    rebuilt = [patterns[0][0]]
    rebuilt.extend(pattern[1] for pattern in patterns)
    rebuilt.append(patterns[-1][-1])
    return rebuilt

def homo_cleaning(df):
  """
  Cleans a SYNONYM- or HOMONYM-corrupted dataset.

  The labels in df['CCDOEV'] are cut into sliding 3-label patterns
  (patterner). Patterns occurring fewer than `thresh` times are considered
  rare and are replaced by the closest frequent pattern according to
  patterns_distance; the label column is then rebuilt from the replaced
  patterns (rebuild_labels).

  `thresh` is 0.5% of the number of patterns when the label counts have a
  coefficient of variation (std / mean) below 1, and 1% otherwise.

  Parameters
  ----------
  df : pandas.DataFrame with a 'CCDOEV' column

  Returns
  -------
  pandas.DataFrame
      A copy of `df` with the cleaned 'CCDOEV' column. Frames with fewer than
      three rows contain no pattern and are returned as an unchanged copy.
  """
  patterns = patterner(df)
  if not patterns:
    return df.copy()

  patterns_frequencies = {}
  for pattern in patterns: # Count occurences of each pattern
    patterns_frequencies[pattern] = patterns_frequencies.get(pattern, 0) + 1

  label_counts = df['CCDOEV'].value_counts().values
  ratio = label_counts.std() / label_counts.mean()

  threshold = 0.005 if ratio < 1 else 0.01
  thresh = int(len(patterns) * threshold)

  frequent_patterns = [pattern for pattern, freq in patterns_frequencies.items() if freq >= thresh]
  rare_patterns = [pattern for pattern, freq in patterns_frequencies.items() if freq < thresh]

  # Frequent patterns are kept; each rare one maps to its nearest frequent
  # pattern (the first one found in case of a tie), or to itself when there
  # is no frequent pattern to compare with.
  mapping = {pattern: pattern for pattern in frequent_patterns}
  for rare in rare_patterns:
    mapping[rare] = rare
    best_distance = np.inf
    for candidate in frequent_patterns:
      candidate_distance = patterns_distance(rare, candidate) # computed once per pair
      if candidate_distance < best_distance:
        best_distance = candidate_distance
        mapping[rare] = candidate

  df_final = df.copy()
  rebuilt_patterns = [mapping[pattern] for pattern in patterns]
  df_final["CCDOEV"] = rebuild_labels(rebuilt_patterns)

  return df_final

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pokro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# TODO: split between cleaning and scoring
def cleaning_patterns(pollution_type, reference_labels, dataset="BPIC11_f1", limit=8, distance=None):
  """
  Cleans every polluted training file with homo_cleaning and returns averaged scores.
  Also saves each cleaned dataset as CSV.

  For every percentage in the global `percentages` and every run i in
  range(limit), the file
  f"{directory_path}{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv"
  is read (`directory_path` is a notebook-level variable), cleaned and written
  to ./{dataset}/{dataset}/{dataset}_cleaned/.

  Parameters
  ----------
  pollution_type : str, used in the input and output file names
  reference_labels : clean labels the cleaned labels are scored against
  dataset : dataset name, used in file names and in the output directory
  limit : number of runs per percentage
  distance : tag appended to the output file name ("..._CLEANED-{distance}.csv").
      When None, the notebook-level variable `distance` is used if it exists
      (the previous behaviour), otherwise "patterns".

  Returns
  -------
  dict
      {percentage: {metric: mean over runs}} for the metrics in `metrics_time`.
  """
  initial_time = time.time()

  if distance is None:
    # Backward compatibility: the file name used to read a notebook-level
    # `distance` variable that was never passed in.
    distance = globals().get("distance", "patterns")

  output_dir = f"./{dataset}/{dataset}/{dataset}_cleaned"
  os.makedirs(output_dir, exist_ok=True) # to_csv does not create missing directories

  scores_matrix_cleaned = {}
  for percentage in percentages:
    scores_matrix_cleaned[percentage] = {}
    for i in range(limit):
      start_time = time.time()
      df = pd.read_csv(f"{directory_path}{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv")

      df = homo_cleaning(df)

      # Saving cleaned dataset
      df.to_csv(f"{output_dir}/{dataset}-TRAIN-{pollution_type}-{percentage}-{i}_CLEANED-{distance}.csv", index=False)

      scores = DQ_assess(df, dq) | dict(zip(dq_scorings, conf_matrix(reference_labels, df.CCDOEV)))

      elapsed = time.time() - start_time
      scores["time"] = elapsed
      scores_matrix_cleaned[percentage][i] = scores

      print(f"{percentage}: {i+1}/{limit} done in {round(elapsed/60,3)} minutes")

  print("Total time:", (round((time.time() - initial_time)/60,3)), "minutes")

  return {
    percentage: {
      metric: np.mean([scores_matrix_cleaned[percentage][i][metric] for i in range(limit)])
      for metric in metrics_time
    }
    for percentage in percentages
  }

# BPIC '11

## Overview

### Defines

In [None]:
dataset = "BPIC11_f1"

In [None]:
directory_path = f'./{dataset}/{dataset}/{dataset}_prepared/'
files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]

In [None]:
files.remove(f'{dataset}-TRAIN-CLEAN.csv')

In [None]:
df_ref = pd.read_csv(f'./{dataset}/{dataset}/{dataset}_prepared/{dataset}-TRAIN-CLEAN.csv')
ref = df_ref.CCDOEV

### Baseline

In [None]:
# Compute DQ metrics for all corrupted datasets, to use as a baseline
# dq_dict[pollution_type][percentage][i] holds the DQ metrics of run i together
# with its classification scores against the clean reference labels `ref`.
# NOTE(review): the 8 runs per setting are hard-coded here; cleaning() takes the
# same number through its `limit` parameter (default 8).
dq_dict = {}
for pollution_type in pollution_types:
  dq_dict[pollution_type] = {}
  for percentage in percentages:
    dq_dict[pollution_type][percentage] = {}
    for i in range(8):
      df = pd.read_csv(f"{directory_path}{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv")
      labels = df.CCDOEV
      dq_dict[pollution_type][percentage][i] = DQ_assess(df, dq)| dict(zip(dq_scorings, conf_matrix(ref, labels)))

In [None]:
# Average the metrics
# dq_dict_avg[pollution_type][percentage][metric] is the mean of that metric
# over the 8 runs stored in dq_dict.
dq_dict_avg = {}
for pollution_type in pollution_types:
  dq_dict_avg[pollution_type] = {}
  for percentage in percentages:
    dq_dict_avg[pollution_type][percentage] = {}
    for metric in metrics:
      dq_dict_avg[pollution_type][percentage][metric] = np.mean([dq_dict[pollution_type][percentage][i][metric] for i in range(8)])

# Scores of the clean dataset against itself, used as the reference row.
scorings = DQ_assess(df_ref, dq) | dict(zip(dq_scorings, conf_matrix(ref, ref)))

# The same `scorings` dict object is stored under every percentage.
dq_dict_avg["CLEAN"] = {percentage:scorings for percentage in percentages}

In [None]:
with open(f"./{dataset}/{dataset}/{dataset}-REF_scores_matrix_avg.pickle", "wb") as f:
  pickle.dump(dq_dict_avg, f)

## Levenshtein clustering (d=1)

In [None]:
distance = "levenshtein" # should be renamed to 'distance'
technique = "DBSCAN"
threshold = 1
scores_matrix_levenshtein1_avg = {}

### Distorted

In [None]:
pollution_type = pollution_types[0]
scores_matrix_levenshtein1_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.014 minutes
0.05: 2/8 done in 0.014 minutes
0.05: 3/8 done in 0.014 minutes
0.05: 4/8 done in 0.014 minutes
0.05: 5/8 done in 0.014 minutes
0.05: 6/8 done in 0.014 minutes
0.05: 7/8 done in 0.014 minutes
0.05: 8/8 done in 0.014 minutes
0.1: 1/8 done in 0.037 minutes
0.1: 2/8 done in 0.038 minutes
0.1: 3/8 done in 0.038 minutes
0.1: 4/8 done in 0.039 minutes
0.1: 5/8 done in 0.038 minutes
0.1: 6/8 done in 0.038 minutes
0.1: 7/8 done in 0.038 minutes
0.1: 8/8 done in 0.037 minutes
0.2: 1/8 done in 0.108 minutes
0.2: 2/8 done in 0.113 minutes
0.2: 3/8 done in 0.108 minutes
0.2: 4/8 done in 0.108 minutes
0.2: 5/8 done in 0.108 minutes
0.2: 6/8 done in 0.109 minutes
0.2: 7/8 done in 0.106 minutes
0.2: 8/8 done in 0.106 minutes
0.3: 1/8 done in 0.205 minutes
0.3: 2/8 done in 0.214 minutes
0.3: 3/8 done in 0.209 minutes
0.3: 4/8 done in 0.206 minutes
0.3: 5/8 done in 0.206 minutes
0.3: 6/8 done in 0.206 minutes
0.3: 7/8 done in 0.206 minutes
0.3: 8/8 done in 0.204 minutes


### Polluted Nonrandom

In [None]:
pollution_type = pollution_types[1]
scores_matrix_levenshtein1_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.026 minutes
0.05: 2/8 done in 0.027 minutes
0.05: 3/8 done in 0.027 minutes
0.05: 4/8 done in 0.026 minutes
0.05: 5/8 done in 0.026 minutes
0.05: 6/8 done in 0.026 minutes
0.05: 7/8 done in 0.026 minutes
0.05: 8/8 done in 0.025 minutes
0.1: 1/8 done in 0.092 minutes
0.1: 2/8 done in 0.103 minutes
0.1: 3/8 done in 0.091 minutes
0.1: 4/8 done in 0.09 minutes
0.1: 5/8 done in 0.089 minutes
0.1: 6/8 done in 0.089 minutes
0.1: 7/8 done in 0.089 minutes
0.1: 8/8 done in 0.089 minutes
0.2: 1/8 done in 0.387 minutes
0.2: 2/8 done in 0.39 minutes
0.2: 3/8 done in 0.371 minutes
0.2: 4/8 done in 0.369 minutes
0.2: 5/8 done in 0.396 minutes
0.2: 6/8 done in 0.388 minutes
0.2: 7/8 done in 0.391 minutes
0.2: 8/8 done in 0.38 minutes
0.3: 1/8 done in 0.921 minutes
0.3: 2/8 done in 0.938 minutes
0.3: 3/8 done in 0.9 minutes
0.3: 4/8 done in 0.904 minutes
0.3: 5/8 done in 0.913 minutes
0.3: 6/8 done in 0.913 minutes
0.3: 7/8 done in 0.862 minutes
0.3: 8/8 done in 0.904 minutes
0.4: 

### Polluted Random

In [None]:
pollution_type = pollution_types[2]
scores_matrix_levenshtein1_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.004 minutes
0.05: 2/8 done in 0.004 minutes
0.05: 3/8 done in 0.004 minutes
0.05: 4/8 done in 0.004 minutes
0.05: 5/8 done in 0.004 minutes
0.05: 6/8 done in 0.004 minutes
0.05: 7/8 done in 0.004 minutes
0.05: 8/8 done in 0.004 minutes
0.1: 1/8 done in 0.004 minutes
0.1: 2/8 done in 0.004 minutes
0.1: 3/8 done in 0.004 minutes
0.1: 4/8 done in 0.005 minutes
0.1: 5/8 done in 0.004 minutes
0.1: 6/8 done in 0.004 minutes
0.1: 7/8 done in 0.004 minutes
0.1: 8/8 done in 0.004 minutes
0.2: 1/8 done in 0.005 minutes
0.2: 2/8 done in 0.01 minutes
0.2: 3/8 done in 0.005 minutes
0.2: 4/8 done in 0.005 minutes
0.2: 5/8 done in 0.005 minutes
0.2: 6/8 done in 0.005 minutes
0.2: 7/8 done in 0.005 minutes
0.2: 8/8 done in 0.005 minutes
0.3: 1/8 done in 0.006 minutes
0.3: 2/8 done in 0.005 minutes
0.3: 3/8 done in 0.005 minutes
0.3: 4/8 done in 0.005 minutes
0.3: 5/8 done in 0.005 minutes
0.3: 6/8 done in 0.005 minutes
0.3: 7/8 done in 0.005 minutes
0.3: 8/8 done in 0.005 minutes
0

In [None]:
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_{distance}-{threshold}_avg.pickle", "wb") as f:
  pickle.dump(scores_matrix_levenshtein1_avg, f)

## Levenshtein clustering (d=5)

In [None]:
distance = "levenshtein"
technique = "DBSCAN"
threshold = 5
scores_matrix_levenshtein5_avg = {}

### Distorted

In [None]:
pollution_type = pollution_types[0]
scores_matrix_levenshtein5_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.014 minutes
0.05: 2/8 done in 0.014 minutes
0.05: 3/8 done in 0.015 minutes
0.05: 4/8 done in 0.014 minutes
0.05: 5/8 done in 0.014 minutes
0.05: 6/8 done in 0.014 minutes
0.05: 7/8 done in 0.015 minutes
0.05: 8/8 done in 0.014 minutes
0.1: 1/8 done in 0.037 minutes
0.1: 2/8 done in 0.038 minutes
0.1: 3/8 done in 0.039 minutes
0.1: 4/8 done in 0.039 minutes
0.1: 5/8 done in 0.039 minutes
0.1: 6/8 done in 0.039 minutes
0.1: 7/8 done in 0.038 minutes
0.1: 8/8 done in 0.037 minutes
0.2: 1/8 done in 0.11 minutes
0.2: 2/8 done in 0.115 minutes
0.2: 3/8 done in 0.108 minutes
0.2: 4/8 done in 0.11 minutes
0.2: 5/8 done in 0.111 minutes
0.2: 6/8 done in 0.11 minutes
0.2: 7/8 done in 0.106 minutes
0.2: 8/8 done in 0.104 minutes
0.3: 1/8 done in 0.202 minutes
0.3: 2/8 done in 0.213 minutes
0.3: 3/8 done in 0.213 minutes
0.3: 4/8 done in 0.21 minutes
0.3: 5/8 done in 0.207 minutes
0.3: 6/8 done in 0.211 minutes
0.3: 7/8 done in 0.202 minutes
0.3: 8/8 done in 0.205 minutes
0.4:

### Polluted Nonrandom

In [None]:
pollution_type = pollution_types[1]
scores_matrix_levenshtein5_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.026 minutes
0.05: 2/8 done in 0.026 minutes
0.05: 3/8 done in 0.028 minutes
0.05: 4/8 done in 0.027 minutes
0.05: 5/8 done in 0.028 minutes
0.05: 6/8 done in 0.028 minutes
0.05: 7/8 done in 0.026 minutes
0.05: 8/8 done in 0.026 minutes
0.1: 1/8 done in 0.09 minutes
0.1: 2/8 done in 0.093 minutes
0.1: 3/8 done in 0.089 minutes
0.1: 4/8 done in 0.091 minutes
0.1: 5/8 done in 0.094 minutes
0.1: 6/8 done in 0.094 minutes
0.1: 7/8 done in 0.095 minutes
0.1: 8/8 done in 0.089 minutes
0.2: 1/8 done in 0.372 minutes
0.2: 2/8 done in 0.377 minutes
0.2: 3/8 done in 0.373 minutes
0.2: 4/8 done in 0.376 minutes
0.2: 5/8 done in 0.382 minutes
0.2: 6/8 done in 0.373 minutes
0.2: 7/8 done in 0.388 minutes
0.2: 8/8 done in 0.372 minutes
0.3: 1/8 done in 0.873 minutes
0.3: 2/8 done in 0.897 minutes
0.3: 3/8 done in 0.908 minutes
0.3: 4/8 done in 0.91 minutes
0.3: 5/8 done in 0.906 minutes
0.3: 6/8 done in 0.902 minutes
0.3: 7/8 done in 0.914 minutes
0.3: 8/8 done in 0.905 minutes
0.

### Polluted Random

In [None]:
pollution_type = pollution_types[2]
scores_matrix_levenshtein5_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.004 minutes
0.05: 2/8 done in 0.006 minutes
0.05: 3/8 done in 0.006 minutes
0.05: 4/8 done in 0.004 minutes
0.05: 5/8 done in 0.004 minutes
0.05: 6/8 done in 0.004 minutes
0.05: 7/8 done in 0.004 minutes
0.05: 8/8 done in 0.004 minutes
0.1: 1/8 done in 0.004 minutes
0.1: 2/8 done in 0.004 minutes
0.1: 3/8 done in 0.004 minutes
0.1: 4/8 done in 0.004 minutes
0.1: 5/8 done in 0.004 minutes
0.1: 6/8 done in 0.004 minutes
0.1: 7/8 done in 0.004 minutes
0.1: 8/8 done in 0.004 minutes
0.2: 1/8 done in 0.005 minutes
0.2: 2/8 done in 0.005 minutes
0.2: 3/8 done in 0.005 minutes
0.2: 4/8 done in 0.004 minutes
0.2: 5/8 done in 0.005 minutes
0.2: 6/8 done in 0.005 minutes
0.2: 7/8 done in 0.004 minutes
0.2: 8/8 done in 0.005 minutes
0.3: 1/8 done in 0.005 minutes
0.3: 2/8 done in 0.005 minutes
0.3: 3/8 done in 0.005 minutes
0.3: 4/8 done in 0.005 minutes
0.3: 5/8 done in 0.005 minutes
0.3: 6/8 done in 0.005 minutes
0.3: 7/8 done in 0.005 minutes
0.3: 8/8 done in 0.005 minutes


In [None]:
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_{distance}-{threshold}_avg.pickle", "wb") as f:
  pickle.dump(scores_matrix_levenshtein5_avg, f)

## Dropping (0.001)

In [None]:
distance = "DROP"
technique = "DROP"
threshold = 0.001
scores_matrix_drop001_avg = {}

### Distorted

In [None]:
pollution_type = pollution_types[0]
scores_matrix_drop001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.003 minutes
0.05: 2/8 done in 0.003 minutes
0.05: 3/8 done in 0.003 minutes
0.05: 4/8 done in 0.003 minutes
0.05: 5/8 done in 0.003 minutes
0.05: 6/8 done in 0.003 minutes
0.05: 7/8 done in 0.003 minutes
0.05: 8/8 done in 0.003 minutes
0.1: 1/8 done in 0.003 minutes
0.1: 2/8 done in 0.003 minutes
0.1: 3/8 done in 0.003 minutes
0.1: 4/8 done in 0.003 minutes
0.1: 5/8 done in 0.003 minutes
0.1: 6/8 done in 0.003 minutes
0.1: 7/8 done in 0.003 minutes
0.1: 8/8 done in 0.003 minutes
0.2: 1/8 done in 0.003 minutes
0.2: 2/8 done in 0.003 minutes
0.2: 3/8 done in 0.003 minutes
0.2: 4/8 done in 0.003 minutes
0.2: 5/8 done in 0.003 minutes
0.2: 6/8 done in 0.003 minutes
0.2: 7/8 done in 0.003 minutes
0.2: 8/8 done in 0.003 minutes
0.3: 1/8 done in 0.003 minutes
0.3: 2/8 done in 0.003 minutes
0.3: 3/8 done in 0.003 minutes
0.3: 4/8 done in 0.003 minutes
0.3: 5/8 done in 0.003 minutes
0.3: 6/8 done in 0.003 minutes
0.3: 7/8 done in 0.003 minutes
0.3: 8/8 done in 0.003 minutes


### Polluted Nonrandom

In [None]:
pollution_type = pollution_types[1]
scores_matrix_drop001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.003 minutes
0.05: 2/8 done in 0.003 minutes
0.05: 3/8 done in 0.003 minutes
0.05: 4/8 done in 0.003 minutes
0.05: 5/8 done in 0.003 minutes
0.05: 6/8 done in 0.003 minutes
0.05: 7/8 done in 0.003 minutes
0.05: 8/8 done in 0.003 minutes
0.1: 1/8 done in 0.003 minutes
0.1: 2/8 done in 0.003 minutes
0.1: 3/8 done in 0.003 minutes
0.1: 4/8 done in 0.003 minutes
0.1: 5/8 done in 0.003 minutes
0.1: 6/8 done in 0.003 minutes
0.1: 7/8 done in 0.003 minutes
0.1: 8/8 done in 0.003 minutes
0.2: 1/8 done in 0.003 minutes
0.2: 2/8 done in 0.003 minutes
0.2: 3/8 done in 0.003 minutes
0.2: 4/8 done in 0.003 minutes
0.2: 5/8 done in 0.003 minutes
0.2: 6/8 done in 0.003 minutes
0.2: 7/8 done in 0.003 minutes
0.2: 8/8 done in 0.003 minutes
0.3: 1/8 done in 0.003 minutes
0.3: 2/8 done in 0.003 minutes
0.3: 3/8 done in 0.003 minutes
0.3: 4/8 done in 0.003 minutes
0.3: 5/8 done in 0.003 minutes
0.3: 6/8 done in 0.003 minutes
0.3: 7/8 done in 0.003 minutes
0.3: 8/8 done in 0.003 minutes


### Polluted Random

In [None]:
pollution_type = pollution_types[2]
scores_matrix_drop001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.003 minutes
0.05: 2/8 done in 0.003 minutes
0.05: 3/8 done in 0.003 minutes
0.05: 4/8 done in 0.003 minutes
0.05: 5/8 done in 0.003 minutes
0.05: 6/8 done in 0.003 minutes
0.05: 7/8 done in 0.003 minutes
0.05: 8/8 done in 0.003 minutes
0.1: 1/8 done in 0.003 minutes
0.1: 2/8 done in 0.003 minutes
0.1: 3/8 done in 0.003 minutes
0.1: 4/8 done in 0.003 minutes
0.1: 5/8 done in 0.003 minutes
0.1: 6/8 done in 0.003 minutes
0.1: 7/8 done in 0.003 minutes
0.1: 8/8 done in 0.003 minutes
0.2: 1/8 done in 0.003 minutes
0.2: 2/8 done in 0.003 minutes
0.2: 3/8 done in 0.003 minutes
0.2: 4/8 done in 0.003 minutes
0.2: 5/8 done in 0.003 minutes
0.2: 6/8 done in 0.003 minutes
0.2: 7/8 done in 0.003 minutes
0.2: 8/8 done in 0.003 minutes
0.3: 1/8 done in 0.003 minutes
0.3: 2/8 done in 0.003 minutes
0.3: 3/8 done in 0.003 minutes
0.3: 4/8 done in 0.003 minutes
0.3: 5/8 done in 0.003 minutes
0.3: 6/8 done in 0.003 minutes
0.3: 7/8 done in 0.003 minutes
0.3: 8/8 done in 0.003 minutes


In [None]:
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_{technique}-{threshold}_avg.pickle", "wb") as f:
  pickle.dump(scores_matrix_drop001_avg, f)

## Dropping (0.001) (delete)

In [None]:
distance = "DROP_DELETE"
technique = "DROP_DELETE"
threshold = 0.001
scores_matrix_drop_delete001_avg = {}

### Distorted

In [None]:
pollution_type = pollution_types[0]
scores_matrix_drop_delete001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.001 minutes
0.05: 2/8 done in 0.001 minutes
0.05: 3/8 done in 0.001 minutes
0.05: 4/8 done in 0.001 minutes
0.05: 5/8 done in 0.001 minutes
0.05: 6/8 done in 0.001 minutes
0.05: 7/8 done in 0.001 minutes
0.05: 8/8 done in 0.001 minutes
0.1: 1/8 done in 0.001 minutes
0.1: 2/8 done in 0.001 minutes
0.1: 3/8 done in 0.001 minutes
0.1: 4/8 done in 0.001 minutes
0.1: 5/8 done in 0.001 minutes
0.1: 6/8 done in 0.001 minutes
0.1: 7/8 done in 0.001 minutes
0.1: 8/8 done in 0.001 minutes
0.2: 1/8 done in 0.001 minutes
0.2: 2/8 done in 0.001 minutes
0.2: 3/8 done in 0.001 minutes
0.2: 4/8 done in 0.001 minutes
0.2: 5/8 done in 0.001 minutes
0.2: 6/8 done in 0.001 minutes
0.2: 7/8 done in 0.001 minutes
0.2: 8/8 done in 0.001 minutes
0.3: 1/8 done in 0.001 minutes
0.3: 2/8 done in 0.001 minutes
0.3: 3/8 done in 0.001 minutes
0.3: 4/8 done in 0.001 minutes
0.3: 5/8 done in 0.001 minutes
0.3: 6/8 done in 0.001 minutes
0.3: 7/8 done in 0.001 minutes
0.3: 8/8 done in 0.001 minutes


### Polluted Nonrandom

In [None]:
pollution_type = pollution_types[1]
scores_matrix_drop_delete001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.001 minutes
0.05: 2/8 done in 0.001 minutes
0.05: 3/8 done in 0.001 minutes
0.05: 4/8 done in 0.001 minutes
0.05: 5/8 done in 0.001 minutes
0.05: 6/8 done in 0.001 minutes
0.05: 7/8 done in 0.001 minutes
0.05: 8/8 done in 0.001 minutes
0.1: 1/8 done in 0.001 minutes
0.1: 2/8 done in 0.001 minutes
0.1: 3/8 done in 0.001 minutes
0.1: 4/8 done in 0.001 minutes
0.1: 5/8 done in 0.001 minutes
0.1: 6/8 done in 0.001 minutes
0.1: 7/8 done in 0.001 minutes
0.1: 8/8 done in 0.001 minutes
0.2: 1/8 done in 0.001 minutes
0.2: 2/8 done in 0.001 minutes
0.2: 3/8 done in 0.001 minutes
0.2: 4/8 done in 0.001 minutes
0.2: 5/8 done in 0.001 minutes
0.2: 6/8 done in 0.001 minutes
0.2: 7/8 done in 0.001 minutes
0.2: 8/8 done in 0.001 minutes
0.3: 1/8 done in 0.001 minutes
0.3: 2/8 done in 0.001 minutes
0.3: 3/8 done in 0.001 minutes
0.3: 4/8 done in 0.001 minutes
0.3: 5/8 done in 0.001 minutes
0.3: 6/8 done in 0.001 minutes
0.3: 7/8 done in 0.001 minutes
0.3: 8/8 done in 0.001 minutes


### Polluted Random

In [None]:
# Same run for the third pollution type ("Polluted Random" per the heading).
# NOTE(review): no `dataset=` is passed here, unlike the BPIC '15 cells later in the
# notebook — presumably `cleaning` falls back to a default dataset; confirm.
pollution_type = pollution_types[2]
scores_matrix_drop_delete001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold)

0.05: 1/8 done in 0.001 minutes
0.05: 2/8 done in 0.001 minutes
0.05: 3/8 done in 0.001 minutes
0.05: 4/8 done in 0.001 minutes
0.05: 5/8 done in 0.001 minutes
0.05: 6/8 done in 0.001 minutes
0.05: 7/8 done in 0.001 minutes
0.05: 8/8 done in 0.001 minutes
0.1: 1/8 done in 0.001 minutes
0.1: 2/8 done in 0.001 minutes
0.1: 3/8 done in 0.001 minutes
0.1: 4/8 done in 0.001 minutes
0.1: 5/8 done in 0.001 minutes
0.1: 6/8 done in 0.001 minutes
0.1: 7/8 done in 0.001 minutes
0.1: 8/8 done in 0.001 minutes
0.2: 1/8 done in 0.001 minutes
0.2: 2/8 done in 0.001 minutes
0.2: 3/8 done in 0.001 minutes
0.2: 4/8 done in 0.001 minutes
0.2: 5/8 done in 0.001 minutes
0.2: 6/8 done in 0.001 minutes
0.2: 7/8 done in 0.001 minutes
0.2: 8/8 done in 0.001 minutes
0.3: 1/8 done in 0.001 minutes
0.3: 2/8 done in 0.001 minutes
0.3: 3/8 done in 0.001 minutes
0.3: 4/8 done in 0.001 minutes
0.3: 5/8 done in 0.001 minutes
0.3: 6/8 done in 0.001 minutes
0.3: 7/8 done in 0.001 minutes
0.3: 8/8 done in 0.001 minutes


In [None]:
# Persist the results gathered for all three pollution types in a single pickle.
# The file name is derived from `technique` and `threshold`.
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_{technique}-{threshold}_avg.pickle", "wb") as f:
  pickle.dump(scores_matrix_drop_delete001_avg, f)

# BPIC '15

## Overview

### Defines

In [None]:
# Dataset identifier; every input and output path in this section is built from it.
dataset = "BPIC15_1_f2"

In [None]:
# Folder holding the prepared CSVs for the current dataset.
directory_path = f'./{dataset}/{dataset}/{dataset}_prepared/'

# Keep only regular files (sub-directories are skipped), in os.listdir order.
files = []
for entry in os.listdir(directory_path):
  if os.path.isfile(os.path.join(directory_path, entry)):
    files.append(entry)

In [None]:
# Exclude the clean reference file so `files` only lists polluted variants.
# Guarded so the cell can be re-run: a bare list.remove raises ValueError
# once the entry has already been removed.
if f'{dataset}-TRAIN-CLEAN.csv' in files:
  files.remove(f'{dataset}-TRAIN-CLEAN.csv')

In [None]:
# Load the clean (unpolluted) training log; its CCDOEV column serves as the
# reference labels against which every polluted/cleaned variant is compared.
df_ref = pd.read_csv(f'./{dataset}/{dataset}/{dataset}_prepared/{dataset}-TRAIN-CLEAN.csv')
ref = df_ref.CCDOEV

### Baseline

In [None]:
# Baseline: score every polluted file as-is (no cleaning applied).
# Resulting layout: _dq_dict[pollution_type][percentage][run_index] -> metric dict.
_dq_dict = {}
for pollution_type in pollution_types:
  runs_by_percentage = {}
  _dq_dict[pollution_type] = runs_by_percentage
  for percentage in percentages:
    runs = {}
    runs_by_percentage[percentage] = runs
    for i in range(8):  # 8 files (indices 0-7) per (pollution type, percentage)
      df = pd.read_csv(f"{directory_path}{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv")
      labels = df.CCDOEV
      # Metrics computed from the polluted frame alone ...
      intrinsic = DQ_assess(df, dq)
      # ... merged with the scores of its labels against the clean reference.
      extrinsic = dict(zip(dq_scorings, conf_matrix(ref, labels)))
      runs[i] = intrinsic | extrinsic

In [None]:
# Collapse the 8 runs of each (pollution type, percentage) into their per-metric mean.
_dq_dict_avg = {}
for pollution_type in pollution_types:
  _dq_dict_avg[pollution_type] = {}
  for percentage in percentages:
    runs = _dq_dict[pollution_type][percentage]
    averaged = {}
    for metric in metrics:
      averaged[metric] = np.mean([runs[i][metric] for i in range(8)])
    _dq_dict_avg[pollution_type][percentage] = averaged

# Scores of the clean log itself: the reference labels are compared with themselves.
clean_intrinsic = DQ_assess(df_ref, dq)
clean_extrinsic = dict(zip(dq_scorings, conf_matrix(ref, ref)))
scorings = clean_intrinsic | clean_extrinsic

# The same clean scores are registered under every percentage (one shared dict).
_dq_dict_avg["CLEAN"] = dict.fromkeys(percentages, scorings)

In [None]:
# Persist the averaged baseline (uncleaned) scores, including the "CLEAN" entry.
with open(f"./{dataset}/{dataset}/{dataset}-REF_scores_matrix_avg.pickle", "wb") as f:
  pickle.dump(_dq_dict_avg, f)

## Levenshtein clustering (d=1)

In [None]:
# Experiment settings forwarded to `cleaning` by the cells below.
distance = "levenshtein"
technique = "DBSCAN"
threshold = 1  # the "d=1" of the section heading
# Results of this experiment, keyed by pollution type.
_scores_matrix_levenshtein1_avg = {}

### Distorted

In [None]:
# Run the configured cleaning on the first pollution type ("Distorted" per the heading)
# and keep the result under that pollution type's key.
pollution_type = pollution_types[0]
_scores_matrix_levenshtein1_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.023 minutes
0.05: 2/8 done in 0.023 minutes
0.05: 3/8 done in 0.022 minutes
0.05: 4/8 done in 0.022 minutes
0.05: 5/8 done in 0.022 minutes
0.05: 6/8 done in 0.023 minutes
0.05: 7/8 done in 0.022 minutes
0.05: 8/8 done in 0.022 minutes
0.1: 1/8 done in 0.061 minutes
0.1: 2/8 done in 0.059 minutes
0.1: 3/8 done in 0.058 minutes
0.1: 4/8 done in 0.057 minutes
0.1: 5/8 done in 0.06 minutes
0.1: 6/8 done in 0.064 minutes
0.1: 7/8 done in 0.056 minutes
0.1: 8/8 done in 0.059 minutes
0.2: 1/8 done in 0.182 minutes
0.2: 2/8 done in 0.179 minutes
0.2: 3/8 done in 0.177 minutes
0.2: 4/8 done in 0.181 minutes
0.2: 5/8 done in 0.179 minutes
0.2: 6/8 done in 0.179 minutes
0.2: 7/8 done in 0.178 minutes
0.2: 8/8 done in 0.179 minutes
0.3: 1/8 done in 0.362 minutes
0.3: 2/8 done in 0.357 minutes
0.3: 3/8 done in 0.354 minutes
0.3: 4/8 done in 0.355 minutes
0.3: 5/8 done in 0.354 minutes
0.3: 6/8 done in 0.357 minutes
0.3: 7/8 done in 0.348 minutes
0.3: 8/8 done in 0.345 minutes
0

### Polluted Nonrandom

In [None]:
# Same run for the second pollution type ("Polluted Nonrandom" per the heading).
pollution_type = pollution_types[1]
_scores_matrix_levenshtein1_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.009 minutes
0.05: 2/8 done in 0.009 minutes
0.05: 3/8 done in 0.009 minutes
0.05: 4/8 done in 0.009 minutes
0.05: 5/8 done in 0.009 minutes
0.05: 6/8 done in 0.009 minutes
0.05: 7/8 done in 0.009 minutes
0.05: 8/8 done in 0.01 minutes
0.1: 1/8 done in 0.013 minutes
0.1: 2/8 done in 0.013 minutes
0.1: 3/8 done in 0.012 minutes
0.1: 4/8 done in 0.012 minutes
0.1: 5/8 done in 0.014 minutes
0.1: 6/8 done in 0.013 minutes
0.1: 7/8 done in 0.012 minutes
0.1: 8/8 done in 0.012 minutes
0.2: 1/8 done in 0.018 minutes
0.2: 2/8 done in 0.018 minutes
0.2: 3/8 done in 0.017 minutes
0.2: 4/8 done in 0.017 minutes
0.2: 5/8 done in 0.019 minutes
0.2: 6/8 done in 0.018 minutes
0.2: 7/8 done in 0.017 minutes
0.2: 8/8 done in 0.017 minutes
0.3: 1/8 done in 0.022 minutes
0.3: 2/8 done in 0.023 minutes
0.3: 3/8 done in 0.022 minutes
0.3: 4/8 done in 0.023 minutes
0.3: 5/8 done in 0.022 minutes
0.3: 6/8 done in 0.022 minutes
0.3: 7/8 done in 0.022 minutes
0.3: 8/8 done in 0.021 minutes
0

### Polluted Random

In [None]:
# Same run for the third pollution type ("Polluted Random" per the heading).
pollution_type = pollution_types[2]
_scores_matrix_levenshtein1_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.035 minutes
0.05: 2/8 done in 0.035 minutes
0.05: 3/8 done in 0.035 minutes
0.05: 4/8 done in 0.035 minutes
0.05: 5/8 done in 0.034 minutes
0.05: 6/8 done in 0.035 minutes
0.05: 7/8 done in 0.035 minutes
0.05: 8/8 done in 0.038 minutes
0.1: 1/8 done in 0.129 minutes
0.1: 2/8 done in 0.129 minutes
0.1: 3/8 done in 0.124 minutes
0.1: 4/8 done in 0.127 minutes
0.1: 5/8 done in 0.117 minutes
0.1: 6/8 done in 0.122 minutes
0.1: 7/8 done in 0.117 minutes
0.1: 8/8 done in 0.117 minutes
0.2: 1/8 done in 0.502 minutes
0.2: 2/8 done in 0.488 minutes
0.2: 3/8 done in 0.475 minutes
0.2: 4/8 done in 0.476 minutes
0.2: 5/8 done in 0.476 minutes
0.2: 6/8 done in 0.48 minutes
0.2: 7/8 done in 0.491 minutes
0.2: 8/8 done in 0.49 minutes
0.3: 1/8 done in 1.128 minutes
0.3: 2/8 done in 1.103 minutes
0.3: 3/8 done in 1.179 minutes
0.3: 4/8 done in 1.154 minutes
0.3: 5/8 done in 1.142 minutes
0.3: 6/8 done in 1.135 minutes
0.3: 7/8 done in 1.182 minutes
0.3: 8/8 done in 1.201 minutes
0.

In [None]:
# Persist the Levenshtein (threshold 1) results for all three pollution types.
# The file name uses `threshold`, the variable set in this experiment's config cell
# and the one the sibling experiments use; the previous `{d}` referred to a name no
# cell in this section defines, so it either failed or picked up a stale value and
# wrote to the wrong file.
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_{distance}-{threshold}_avg.pickle", "wb") as f:
  pickle.dump(_scores_matrix_levenshtein1_avg, f)

## Levenshtein clustering (d=5)

In [None]:
# Experiment settings forwarded to `cleaning` by the cells below.
distance = "levenshtein"
technique = "DBSCAN"
threshold = 5  # the "d=5" of the section heading
# Results of this experiment, keyed by pollution type.
_scores_matrix_levenshtein5_avg = {}

### Distorted

In [None]:
# Run the configured cleaning on the first pollution type ("Distorted" per the heading)
# and keep the result under that pollution type's key.
pollution_type = pollution_types[0]
_scores_matrix_levenshtein5_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.025 minutes
0.05: 2/8 done in 0.022 minutes
0.05: 3/8 done in 0.022 minutes
0.05: 4/8 done in 0.023 minutes
0.05: 5/8 done in 0.022 minutes
0.05: 6/8 done in 0.022 minutes
0.05: 7/8 done in 0.022 minutes
0.05: 8/8 done in 0.022 minutes
0.1: 1/8 done in 0.062 minutes
0.1: 2/8 done in 0.061 minutes
0.1: 3/8 done in 0.058 minutes
0.1: 4/8 done in 0.06 minutes
0.1: 5/8 done in 0.058 minutes
0.1: 6/8 done in 0.062 minutes
0.1: 7/8 done in 0.059 minutes
0.1: 8/8 done in 0.06 minutes
0.2: 1/8 done in 0.19 minutes
0.2: 2/8 done in 0.19 minutes
0.2: 3/8 done in 0.182 minutes
0.2: 4/8 done in 0.183 minutes
0.2: 5/8 done in 0.187 minutes
0.2: 6/8 done in 0.182 minutes
0.2: 7/8 done in 0.182 minutes
0.2: 8/8 done in 0.182 minutes
0.3: 1/8 done in 0.367 minutes
0.3: 2/8 done in 0.368 minutes
0.3: 3/8 done in 0.361 minutes
0.3: 4/8 done in 0.375 minutes
0.3: 5/8 done in 0.405 minutes
0.3: 6/8 done in 0.373 minutes
0.3: 7/8 done in 0.36 minutes
0.3: 8/8 done in 0.357 minutes
0.4: 

### Polluted Nonrandom

In [None]:
# Same run for the second pollution type ("Polluted Nonrandom" per the heading).
pollution_type = pollution_types[1]
_scores_matrix_levenshtein5_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.009 minutes
0.05: 2/8 done in 0.009 minutes
0.05: 3/8 done in 0.009 minutes
0.05: 4/8 done in 0.01 minutes
0.05: 5/8 done in 0.009 minutes
0.05: 6/8 done in 0.009 minutes
0.05: 7/8 done in 0.009 minutes
0.05: 8/8 done in 0.009 minutes
0.1: 1/8 done in 0.013 minutes
0.1: 2/8 done in 0.013 minutes
0.1: 3/8 done in 0.012 minutes
0.1: 4/8 done in 0.012 minutes
0.1: 5/8 done in 0.013 minutes
0.1: 6/8 done in 0.012 minutes
0.1: 7/8 done in 0.012 minutes
0.1: 8/8 done in 0.012 minutes
0.2: 1/8 done in 0.017 minutes
0.2: 2/8 done in 0.017 minutes
0.2: 3/8 done in 0.016 minutes
0.2: 4/8 done in 0.016 minutes
0.2: 5/8 done in 0.017 minutes
0.2: 6/8 done in 0.017 minutes
0.2: 7/8 done in 0.017 minutes
0.2: 8/8 done in 0.016 minutes
0.3: 1/8 done in 0.021 minutes
0.3: 2/8 done in 0.021 minutes
0.3: 3/8 done in 0.021 minutes
0.3: 4/8 done in 0.02 minutes
0.3: 5/8 done in 0.02 minutes
0.3: 6/8 done in 0.021 minutes
0.3: 7/8 done in 0.021 minutes
0.3: 8/8 done in 0.02 minutes
0.4:

### Polluted Random

In [None]:
# Same run for the third pollution type ("Polluted Random" per the heading).
pollution_type = pollution_types[2]
_scores_matrix_levenshtein5_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.036 minutes
0.05: 2/8 done in 0.036 minutes
0.05: 3/8 done in 0.037 minutes
0.05: 4/8 done in 0.037 minutes
0.05: 5/8 done in 0.037 minutes
0.05: 6/8 done in 0.037 minutes
0.05: 7/8 done in 0.037 minutes
0.05: 8/8 done in 0.037 minutes
0.1: 1/8 done in 0.126 minutes
0.1: 2/8 done in 0.129 minutes
0.1: 3/8 done in 0.128 minutes
0.1: 4/8 done in 0.126 minutes
0.1: 5/8 done in 0.129 minutes
0.1: 6/8 done in 0.124 minutes
0.1: 7/8 done in 0.143 minutes
0.1: 8/8 done in 0.133 minutes
0.2: 1/8 done in 0.553 minutes
0.2: 2/8 done in 0.543 minutes
0.2: 3/8 done in 0.517 minutes
0.2: 4/8 done in 0.537 minutes
0.2: 5/8 done in 0.51 minutes
0.2: 6/8 done in 0.567 minutes
0.2: 7/8 done in 0.556 minutes
0.2: 8/8 done in 0.544 minutes
0.3: 1/8 done in 1.301 minutes
0.3: 2/8 done in 1.218 minutes
0.3: 3/8 done in 1.201 minutes
0.3: 4/8 done in 1.289 minutes
0.3: 5/8 done in 1.298 minutes
0.3: 6/8 done in 1.228 minutes
0.3: 7/8 done in 1.198 minutes
0.3: 8/8 done in 1.16 minutes
0.

In [None]:
# Persist the Levenshtein (threshold 5) results for all three pollution types.
# The file name is derived from `distance` and `threshold`.
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_{distance}-{threshold}_avg.pickle", "wb") as f:
  pickle.dump(_scores_matrix_levenshtein5_avg, f)

## Dropping (0.001)

In [None]:
# Experiment settings forwarded to `cleaning` by the cells below.
# `distance` simply repeats the technique name here; it also ends up in the output file name.
distance = "DROP"
technique = "DROP"
threshold = 0.001  # the "0.001" of the section heading
# Results of this experiment, keyed by pollution type.
_scores_matrix_drop001_avg = {}

### Distorted

In [None]:
# Run the configured cleaning on the first pollution type ("Distorted" per the heading)
# and keep the result under that pollution type's key.
pollution_type = pollution_types[0]
_scores_matrix_drop001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.004 minutes
0.05: 2/8 done in 0.004 minutes
0.05: 3/8 done in 0.004 minutes
0.05: 4/8 done in 0.004 minutes
0.05: 5/8 done in 0.004 minutes
0.05: 6/8 done in 0.004 minutes
0.05: 7/8 done in 0.004 minutes
0.05: 8/8 done in 0.004 minutes
0.1: 1/8 done in 0.004 minutes
0.1: 2/8 done in 0.004 minutes
0.1: 3/8 done in 0.004 minutes
0.1: 4/8 done in 0.004 minutes
0.1: 5/8 done in 0.004 minutes
0.1: 6/8 done in 0.004 minutes
0.1: 7/8 done in 0.004 minutes
0.1: 8/8 done in 0.004 minutes
0.2: 1/8 done in 0.004 minutes
0.2: 2/8 done in 0.004 minutes
0.2: 3/8 done in 0.004 minutes
0.2: 4/8 done in 0.004 minutes
0.2: 5/8 done in 0.004 minutes
0.2: 6/8 done in 0.004 minutes
0.2: 7/8 done in 0.004 minutes
0.2: 8/8 done in 0.004 minutes
0.3: 1/8 done in 0.004 minutes
0.3: 2/8 done in 0.004 minutes
0.3: 3/8 done in 0.004 minutes
0.3: 4/8 done in 0.004 minutes
0.3: 5/8 done in 0.004 minutes
0.3: 6/8 done in 0.004 minutes
0.3: 7/8 done in 0.004 minutes
0.3: 8/8 done in 0.004 minutes


### Polluted Nonrandom

In [None]:
# Same run for the second pollution type ("Polluted Nonrandom" per the heading).
pollution_type = pollution_types[1]
_scores_matrix_drop001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.003 minutes
0.05: 2/8 done in 0.003 minutes
0.05: 3/8 done in 0.004 minutes
0.05: 4/8 done in 0.003 minutes
0.05: 5/8 done in 0.003 minutes
0.05: 6/8 done in 0.004 minutes
0.05: 7/8 done in 0.004 minutes
0.05: 8/8 done in 0.004 minutes
0.1: 1/8 done in 0.004 minutes
0.1: 2/8 done in 0.003 minutes
0.1: 3/8 done in 0.004 minutes
0.1: 4/8 done in 0.003 minutes
0.1: 5/8 done in 0.004 minutes
0.1: 6/8 done in 0.003 minutes
0.1: 7/8 done in 0.004 minutes
0.1: 8/8 done in 0.003 minutes
0.2: 1/8 done in 0.004 minutes
0.2: 2/8 done in 0.004 minutes
0.2: 3/8 done in 0.004 minutes
0.2: 4/8 done in 0.003 minutes
0.2: 5/8 done in 0.004 minutes
0.2: 6/8 done in 0.004 minutes
0.2: 7/8 done in 0.004 minutes
0.2: 8/8 done in 0.004 minutes
0.3: 1/8 done in 0.004 minutes
0.3: 2/8 done in 0.004 minutes
0.3: 3/8 done in 0.004 minutes
0.3: 4/8 done in 0.004 minutes
0.3: 5/8 done in 0.004 minutes
0.3: 6/8 done in 0.004 minutes
0.3: 7/8 done in 0.004 minutes
0.3: 8/8 done in 0.004 minutes


### Polluted Random

In [None]:
# Same run for the third pollution type ("Polluted Random" per the heading).
pollution_type = pollution_types[2]
_scores_matrix_drop001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.004 minutes
0.05: 2/8 done in 0.004 minutes
0.05: 3/8 done in 0.004 minutes
0.05: 4/8 done in 0.004 minutes
0.05: 5/8 done in 0.004 minutes
0.05: 6/8 done in 0.004 minutes
0.05: 7/8 done in 0.004 minutes
0.05: 8/8 done in 0.004 minutes
0.1: 1/8 done in 0.004 minutes
0.1: 2/8 done in 0.004 minutes
0.1: 3/8 done in 0.004 minutes
0.1: 4/8 done in 0.004 minutes
0.1: 5/8 done in 0.004 minutes
0.1: 6/8 done in 0.004 minutes
0.1: 7/8 done in 0.004 minutes
0.1: 8/8 done in 0.004 minutes
0.2: 1/8 done in 0.004 minutes
0.2: 2/8 done in 0.004 minutes
0.2: 3/8 done in 0.004 minutes
0.2: 4/8 done in 0.004 minutes
0.2: 5/8 done in 0.004 minutes
0.2: 6/8 done in 0.004 minutes
0.2: 7/8 done in 0.004 minutes
0.2: 8/8 done in 0.004 minutes
0.3: 1/8 done in 0.004 minutes
0.3: 2/8 done in 0.004 minutes
0.3: 3/8 done in 0.004 minutes
0.3: 4/8 done in 0.004 minutes
0.3: 5/8 done in 0.004 minutes
0.3: 6/8 done in 0.004 minutes
0.3: 7/8 done in 0.004 minutes
0.3: 8/8 done in 0.004 minutes


In [None]:
# Persist the DROP results for all three pollution types.
# The file name is derived from `distance` and `threshold`.
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_{distance}-{threshold}_avg.pickle", "wb") as f:
  pickle.dump(_scores_matrix_drop001_avg, f)

## Dropping (0.001) (delete)

In [None]:
# Experiment settings forwarded to `cleaning` by the cells below.
# `distance` simply repeats the technique name here.
distance = "DROP_DELETE"
technique = "DROP_DELETE"
threshold = 0.001  # the "0.001" of the section heading
# Results of this experiment, keyed by pollution type.
_scores_matrix_drop_delete001_avg = {}

### Distorted

In [None]:
# Run the configured cleaning on the first pollution type ("Distorted" per the heading)
# and keep the result under that pollution type's key.
pollution_type = pollution_types[0]
_scores_matrix_drop_delete001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.001 minutes
0.05: 2/8 done in 0.001 minutes
0.05: 3/8 done in 0.001 minutes
0.05: 4/8 done in 0.001 minutes
0.05: 5/8 done in 0.001 minutes
0.05: 6/8 done in 0.001 minutes
0.05: 7/8 done in 0.001 minutes
0.05: 8/8 done in 0.001 minutes
0.1: 1/8 done in 0.001 minutes
0.1: 2/8 done in 0.001 minutes
0.1: 3/8 done in 0.001 minutes
0.1: 4/8 done in 0.001 minutes
0.1: 5/8 done in 0.001 minutes
0.1: 6/8 done in 0.001 minutes
0.1: 7/8 done in 0.001 minutes
0.1: 8/8 done in 0.001 minutes
0.2: 1/8 done in 0.001 minutes
0.2: 2/8 done in 0.001 minutes
0.2: 3/8 done in 0.001 minutes
0.2: 4/8 done in 0.001 minutes
0.2: 5/8 done in 0.001 minutes
0.2: 6/8 done in 0.001 minutes
0.2: 7/8 done in 0.001 minutes
0.2: 8/8 done in 0.001 minutes
0.3: 1/8 done in 0.001 minutes
0.3: 2/8 done in 0.001 minutes
0.3: 3/8 done in 0.001 minutes
0.3: 4/8 done in 0.001 minutes
0.3: 5/8 done in 0.001 minutes
0.3: 6/8 done in 0.001 minutes
0.3: 7/8 done in 0.001 minutes
0.3: 8/8 done in 0.001 minutes


### Polluted Nonrandom

In [None]:
# Same run for the second pollution type ("Polluted Nonrandom" per the heading).
pollution_type = pollution_types[1]
_scores_matrix_drop_delete001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.001 minutes
0.05: 2/8 done in 0.001 minutes
0.05: 3/8 done in 0.001 minutes
0.05: 4/8 done in 0.001 minutes
0.05: 5/8 done in 0.001 minutes
0.05: 6/8 done in 0.001 minutes
0.05: 7/8 done in 0.001 minutes
0.05: 8/8 done in 0.001 minutes
0.1: 1/8 done in 0.001 minutes
0.1: 2/8 done in 0.001 minutes
0.1: 3/8 done in 0.001 minutes
0.1: 4/8 done in 0.001 minutes
0.1: 5/8 done in 0.001 minutes
0.1: 6/8 done in 0.001 minutes
0.1: 7/8 done in 0.001 minutes
0.1: 8/8 done in 0.001 minutes
0.2: 1/8 done in 0.001 minutes
0.2: 2/8 done in 0.001 minutes
0.2: 3/8 done in 0.001 minutes
0.2: 4/8 done in 0.001 minutes
0.2: 5/8 done in 0.001 minutes
0.2: 6/8 done in 0.001 minutes
0.2: 7/8 done in 0.001 minutes
0.2: 8/8 done in 0.001 minutes
0.3: 1/8 done in 0.001 minutes
0.3: 2/8 done in 0.001 minutes
0.3: 3/8 done in 0.001 minutes
0.3: 4/8 done in 0.001 minutes
0.3: 5/8 done in 0.001 minutes
0.3: 6/8 done in 0.001 minutes
0.3: 7/8 done in 0.001 minutes
0.3: 8/8 done in 0.001 minutes


### Polluted Random

In [None]:
# Same run for the third pollution type ("Polluted Random" per the heading).
pollution_type = pollution_types[2]
_scores_matrix_drop_delete001_avg[pollution_type] = cleaning(pollution_type, ref, technique, distance, threshold, dataset=dataset)

0.05: 1/8 done in 0.001 minutes
0.05: 2/8 done in 0.001 minutes
0.05: 3/8 done in 0.001 minutes
0.05: 4/8 done in 0.001 minutes
0.05: 5/8 done in 0.001 minutes
0.05: 6/8 done in 0.001 minutes
0.05: 7/8 done in 0.001 minutes
0.05: 8/8 done in 0.001 minutes
0.1: 1/8 done in 0.001 minutes
0.1: 2/8 done in 0.001 minutes
0.1: 3/8 done in 0.001 minutes
0.1: 4/8 done in 0.001 minutes
0.1: 5/8 done in 0.001 minutes
0.1: 6/8 done in 0.001 minutes
0.1: 7/8 done in 0.001 minutes
0.1: 8/8 done in 0.001 minutes
0.2: 1/8 done in 0.001 minutes
0.2: 2/8 done in 0.001 minutes
0.2: 3/8 done in 0.001 minutes
0.2: 4/8 done in 0.001 minutes
0.2: 5/8 done in 0.001 minutes
0.2: 6/8 done in 0.001 minutes
0.2: 7/8 done in 0.001 minutes
0.2: 8/8 done in 0.001 minutes
0.3: 1/8 done in 0.001 minutes
0.3: 2/8 done in 0.001 minutes
0.3: 3/8 done in 0.001 minutes
0.3: 4/8 done in 0.001 minutes
0.3: 5/8 done in 0.001 minutes
0.3: 6/8 done in 0.001 minutes
0.3: 7/8 done in 0.001 minutes
0.3: 8/8 done in 0.001 minutes


In [None]:
# Persist the DROP_DELETE results for all three pollution types.
# The file name is derived from `technique` and `threshold` (the sibling cells use `distance`;
# both hold "DROP_DELETE" in this experiment).
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_{technique}-{threshold}_avg.pickle", "wb") as f:
  pickle.dump(_scores_matrix_drop_delete001_avg, f)

# Credit (WIP)

In [None]:
# Pollution types available for the Credit dataset.
# NOTE(review): this rebinds the notebook-level `pollution_types` defined in the Globals
# cell, so any earlier section re-run after this cell will see the Credit list.
pollution_types = ["DISTORTED-activity", "POLLUTED.NORND-activity", "POLLUTED.RANDOM-activity", "SYNONYM", "HOMONYM"]

## Overview

### Defines

In [None]:
# Dataset identifier; every input and output path in this section is built from it.
dataset = "Credit"

In [None]:
# Folder holding the prepared CSVs for the current dataset.
directory_path = f'./{dataset}/{dataset}/{dataset}_prepared/'

# Keep only regular files (sub-directories are skipped), in os.listdir order.
files = []
for entry in os.listdir(directory_path):
  if os.path.isfile(os.path.join(directory_path, entry)):
    files.append(entry)

In [None]:
# Exclude the clean reference file so `files` only lists polluted variants.
# Guarded so the cell can be re-run: a bare list.remove raises ValueError
# once the entry has already been removed.
if f'{dataset}-TRAIN-CLEAN.csv' in files:
  files.remove(f'{dataset}-TRAIN-CLEAN.csv')

In [None]:
# Load the clean (unpolluted) training log; its CCDOEV column serves as the
# reference labels against which every polluted/cleaned variant is compared.
df_ref = pd.read_csv(f'./{dataset}/{dataset}/{dataset}_prepared/{dataset}-TRAIN-CLEAN.csv')
ref = df_ref.CCDOEV

### Baseline

In [None]:
# Baseline: score every polluted file as-is (no cleaning applied).
# Resulting layout: dq_dict[pollution_type][percentage][run_index] -> metric dict.
dq_dict = {}
for pollution_type in pollution_types:
  runs_by_percentage = {}
  dq_dict[pollution_type] = runs_by_percentage
  for percentage in percentages:
    runs = {}
    runs_by_percentage[percentage] = runs
    for i in range(8):  # 8 files (indices 0-7) per (pollution type, percentage)
      df = pd.read_csv(f"{directory_path}{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv")
      labels = df.CCDOEV
      # Metrics computed from the polluted frame alone ...
      intrinsic = DQ_assess(df, dq)
      # ... merged with the scores of its labels against the clean reference.
      extrinsic = dict(zip(dq_scorings, conf_matrix(ref, labels)))
      runs[i] = intrinsic | extrinsic

In [None]:
# Collapse the 8 runs of each (pollution type, percentage) into their per-metric mean.
dq_dict_avg = {}
for pollution_type in pollution_types:
  dq_dict_avg[pollution_type] = {}
  for percentage in percentages:
    runs = dq_dict[pollution_type][percentage]
    averaged = {}
    for metric in metrics:
      averaged[metric] = np.mean([runs[i][metric] for i in range(8)])
    dq_dict_avg[pollution_type][percentage] = averaged

# Scores of the clean log itself: the reference labels are compared with themselves.
clean_intrinsic = DQ_assess(df_ref, dq)
clean_extrinsic = dict(zip(dq_scorings, conf_matrix(ref, ref)))
scorings = clean_intrinsic | clean_extrinsic

# The same clean scores are registered under every percentage (one shared dict).
dq_dict_avg["CLEAN"] = dict.fromkeys(percentages, scorings)

In [None]:
# Persist the averaged baseline (uncleaned) scores, including the "CLEAN" entry.
with open(f"./{dataset}/{dataset}/{dataset}-REF_scores_matrix_avg.pickle", "wb") as f:
  pickle.dump(dq_dict_avg, f)

## Patterns comparisons

### DISTORTED

In [None]:
# Run the pattern-based cleaning on the DISTORTED-activity variant and persist the result.
# The computation happens BEFORE the file is opened: opening with "wb" truncates the
# target immediately, so a failure inside `cleaning_patterns` would otherwise leave an
# empty pickle and destroy any previously saved results.
patterner_scores = cleaning_patterns("DISTORTED-activity", ref, dataset=dataset)
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_PATTERNER_DISTORTED_avg.pickle", "wb") as f:
  pickle.dump(patterner_scores, f)

0.05: 1/8 done in 0.091 minutes
0.05: 2/8 done in 0.094 minutes
0.05: 3/8 done in 0.076 minutes
0.05: 4/8 done in 0.075 minutes
0.05: 5/8 done in 0.074 minutes
0.05: 6/8 done in 0.073 minutes
0.05: 7/8 done in 0.072 minutes
0.05: 8/8 done in 0.075 minutes
0.1: 1/8 done in 0.121 minutes
0.1: 2/8 done in 0.121 minutes
0.1: 3/8 done in 0.128 minutes
0.1: 4/8 done in 0.126 minutes
0.1: 5/8 done in 0.122 minutes
0.1: 6/8 done in 0.121 minutes
0.1: 7/8 done in 0.121 minutes
0.1: 8/8 done in 0.12 minutes
0.2: 1/8 done in 0.245 minutes
0.2: 2/8 done in 0.245 minutes
0.2: 3/8 done in 0.254 minutes
0.2: 4/8 done in 0.25 minutes
0.2: 5/8 done in 0.251 minutes
0.2: 6/8 done in 0.249 minutes
0.2: 7/8 done in 0.255 minutes
0.2: 8/8 done in 0.24 minutes
0.3: 1/8 done in 0.352 minutes
0.3: 2/8 done in 0.356 minutes
0.3: 3/8 done in 0.359 minutes
0.3: 4/8 done in 0.377 minutes
0.3: 5/8 done in 0.354 minutes
0.3: 6/8 done in 0.369 minutes
0.3: 7/8 done in 0.374 minutes
0.3: 8/8 done in 0.361 minutes
0.4

### SYNONYMOUS

In [None]:
# Run the pattern-based cleaning on the SYNONYM variant and persist the result.
# The computation happens BEFORE the file is opened: opening with "wb" truncates the
# target immediately, so a failure inside `cleaning_patterns` would otherwise leave an
# empty pickle and destroy any previously saved results.
patterner_scores = cleaning_patterns("SYNONYM", ref, dataset=dataset)
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_PATTERNER_SYNONYM_avg.pickle", "wb") as f:
  pickle.dump(patterner_scores, f)

0.05: 1/8 done in 0.016 minutes
0.05: 2/8 done in 0.016 minutes
0.05: 3/8 done in 0.016 minutes
0.05: 4/8 done in 0.016 minutes
0.05: 5/8 done in 0.016 minutes
0.05: 6/8 done in 0.016 minutes
0.05: 7/8 done in 0.016 minutes
0.05: 8/8 done in 0.016 minutes
0.1: 1/8 done in 0.017 minutes
0.1: 2/8 done in 0.017 minutes
0.1: 3/8 done in 0.017 minutes
0.1: 4/8 done in 0.017 minutes
0.1: 5/8 done in 0.017 minutes
0.1: 6/8 done in 0.017 minutes
0.1: 7/8 done in 0.017 minutes
0.1: 8/8 done in 0.017 minutes
0.2: 1/8 done in 0.019 minutes
0.2: 2/8 done in 0.018 minutes
0.2: 3/8 done in 0.019 minutes
0.2: 4/8 done in 0.019 minutes
0.2: 5/8 done in 0.018 minutes
0.2: 6/8 done in 0.018 minutes
0.2: 7/8 done in 0.019 minutes
0.2: 8/8 done in 0.019 minutes
0.3: 1/8 done in 0.019 minutes
0.3: 2/8 done in 0.019 minutes
0.3: 3/8 done in 0.019 minutes
0.3: 4/8 done in 0.019 minutes
0.3: 5/8 done in 0.019 minutes
0.3: 6/8 done in 0.019 minutes
0.3: 7/8 done in 0.019 minutes
0.3: 8/8 done in 0.019 minutes


### HOMONYMOUS

In [None]:
# Run the pattern-based cleaning on the HOMONYM variant and persist the result.
# The computation happens BEFORE the file is opened: opening with "wb" truncates the
# target immediately, so a failure inside `cleaning_patterns` would otherwise leave an
# empty pickle and destroy any previously saved results.
patterner_scores = cleaning_patterns("HOMONYM", ref, dataset=dataset)
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_PATTERNER_HOMONYM_avg.pickle", "wb") as f:
  pickle.dump(patterner_scores, f)

0.05: 1/8 done in 0.013 minutes
0.05: 2/8 done in 0.013 minutes
0.05: 3/8 done in 0.013 minutes
0.05: 4/8 done in 0.013 minutes
0.05: 5/8 done in 0.013 minutes
0.05: 6/8 done in 0.013 minutes
0.05: 7/8 done in 0.013 minutes
0.05: 8/8 done in 0.013 minutes
0.1: 1/8 done in 0.013 minutes
0.1: 2/8 done in 0.013 minutes
0.1: 3/8 done in 0.013 minutes
0.1: 4/8 done in 0.013 minutes
0.1: 5/8 done in 0.013 minutes
0.1: 6/8 done in 0.013 minutes
0.1: 7/8 done in 0.013 minutes
0.1: 8/8 done in 0.013 minutes
0.2: 1/8 done in 0.013 minutes
0.2: 2/8 done in 0.013 minutes
0.2: 3/8 done in 0.013 minutes
0.2: 4/8 done in 0.013 minutes
0.2: 5/8 done in 0.013 minutes
0.2: 6/8 done in 0.013 minutes
0.2: 7/8 done in 0.013 minutes
0.2: 8/8 done in 0.013 minutes
0.3: 1/8 done in 0.013 minutes
0.3: 2/8 done in 0.013 minutes
0.3: 3/8 done in 0.013 minutes
0.3: 4/8 done in 0.013 minutes
0.3: 5/8 done in 0.013 minutes
0.3: 6/8 done in 0.014 minutes
0.3: 7/8 done in 0.013 minutes
0.3: 8/8 done in 0.013 minutes


# Pub (WIP)

In [None]:
# Only the SYNONYM pollution type is processed for the Pub dataset.
# NOTE(review): this rebinds the notebook-level `pollution_types` again, so any earlier
# section re-run after this cell will see this single-entry list.
pollution_types = ["SYNONYM"]

## Overview

### Defines

In [None]:
# Dataset identifier; every input and output path in this section is built from it.
dataset = "Pub"

In [None]:
# Folder holding the prepared CSVs for the current dataset.
directory_path = f'./{dataset}/{dataset}/{dataset}_prepared/'

# Keep only regular files (sub-directories are skipped), in os.listdir order.
files = []
for entry in os.listdir(directory_path):
  if os.path.isfile(os.path.join(directory_path, entry)):
    files.append(entry)

In [None]:
# Exclude the clean reference file so `files` only lists polluted variants.
# Guarded so the cell can be re-run: a bare list.remove raises ValueError
# once the entry has already been removed.
if f'{dataset}-TRAIN-CLEAN.csv' in files:
  files.remove(f'{dataset}-TRAIN-CLEAN.csv')

In [None]:
# Load the clean (unpolluted) training log; its CCDOEV column serves as the
# reference labels against which every polluted/cleaned variant is compared.
df_ref = pd.read_csv(f'./{dataset}/{dataset}/{dataset}_prepared/{dataset}-TRAIN-CLEAN.csv')
ref = df_ref.CCDOEV

### Baseline

In [None]:
# Baseline: score every polluted file as-is (no cleaning applied).
# Resulting layout: dq_dict[pollution_type][percentage][run_index] -> metric dict.
dq_dict = {}
for pollution_type in pollution_types:
  runs_by_percentage = {}
  dq_dict[pollution_type] = runs_by_percentage
  for percentage in percentages:
    runs = {}
    runs_by_percentage[percentage] = runs
    for i in range(8):  # 8 files (indices 0-7) per (pollution type, percentage)
      df = pd.read_csv(f"{directory_path}{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv")
      labels = df.CCDOEV
      # Metrics computed from the polluted frame alone ...
      intrinsic = DQ_assess(df, dq)
      # ... merged with the scores of its labels against the clean reference.
      extrinsic = dict(zip(dq_scorings, conf_matrix(ref, labels)))
      runs[i] = intrinsic | extrinsic

In [None]:
# Collapse the 8 runs of each (pollution type, percentage) into their per-metric mean.
dq_dict_avg = {}
for pollution_type in pollution_types:
  dq_dict_avg[pollution_type] = {}
  for percentage in percentages:
    runs = dq_dict[pollution_type][percentage]
    averaged = {}
    for metric in metrics:
      averaged[metric] = np.mean([runs[i][metric] for i in range(8)])
    dq_dict_avg[pollution_type][percentage] = averaged

# Scores of the clean log itself: the reference labels are compared with themselves.
clean_intrinsic = DQ_assess(df_ref, dq)
clean_extrinsic = dict(zip(dq_scorings, conf_matrix(ref, ref)))
scorings = clean_intrinsic | clean_extrinsic

# The same clean scores are registered under every percentage (one shared dict).
dq_dict_avg["CLEAN"] = dict.fromkeys(percentages, scorings)

In [None]:
# Persist the averaged baseline (uncleaned) scores, including the "CLEAN" entry.
with open(f"./{dataset}/{dataset}/{dataset}-REF_scores_matrix_avg.pickle", "wb") as f:
  pickle.dump(dq_dict_avg, f)

## Patterns comparisons

### SYNONYMOUS

In [None]:
# Run the pattern-based cleaning on the SYNONYM variant and persist the result.
# The computation happens BEFORE the file is opened: opening with "wb" truncates the
# target immediately, so a failure inside `cleaning_patterns` would otherwise leave an
# empty pickle and destroy any previously saved results.
patterner_scores = cleaning_patterns("SYNONYM", ref, dataset=dataset)
with open(f"./{dataset}/{dataset}/{dataset}_scores_matrix_PATTERNER_SYNONYM_avg.pickle", "wb") as f:
  pickle.dump(patterner_scores, f)

0.05: 1/8 done in 0.016 minutes
0.05: 2/8 done in 0.016 minutes
0.05: 3/8 done in 0.016 minutes
0.05: 4/8 done in 0.016 minutes
0.05: 5/8 done in 0.016 minutes
0.05: 6/8 done in 0.017 minutes
0.05: 7/8 done in 0.015 minutes
0.05: 8/8 done in 0.015 minutes
0.1: 1/8 done in 0.016 minutes
0.1: 2/8 done in 0.017 minutes
0.1: 3/8 done in 0.017 minutes
0.1: 4/8 done in 0.017 minutes
0.1: 5/8 done in 0.016 minutes
0.1: 6/8 done in 0.016 minutes
0.1: 7/8 done in 0.017 minutes
0.1: 8/8 done in 0.017 minutes
0.2: 1/8 done in 0.018 minutes
0.2: 2/8 done in 0.018 minutes
0.2: 3/8 done in 0.018 minutes
0.2: 4/8 done in 0.018 minutes
0.2: 5/8 done in 0.018 minutes
0.2: 6/8 done in 0.018 minutes
0.2: 7/8 done in 0.018 minutes
0.2: 8/8 done in 0.019 minutes
0.3: 1/8 done in 0.029 minutes
0.3: 2/8 done in 0.027 minutes
0.3: 3/8 done in 0.027 minutes
0.3: 4/8 done in 0.029 minutes
0.3: 5/8 done in 0.027 minutes
0.3: 6/8 done in 0.027 minutes
0.3: 7/8 done in 0.028 minutes
0.3: 8/8 done in 0.029 minutes
