This notebook contains the code for a pair of cleaning techniques intended to improve the effectiveness of next activity prediction on polluted datasets.

# Setup

## Imports

In [None]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import time
import statistics
import pickle

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy.spatial import distance as scipy_distance

from sklearn.cluster import HDBSCAN, AgglomerativeClustering
from sklearn.metrics import pairwise_distances
!pip install Levenshtein
import Levenshtein as lev
from collections import Counter

!pip install fasttext
import fasttext
import fasttext.util
import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet

#fasttext.util.download_model('en', if_exists='ignore')
nlp_model = fasttext.load_model('cc.en.300.bin')

%matplotlib inline

## Globals

In [None]:
# Pollution-type identifiers, grouped into three named sets.
pollution_types_1 = [
    "DISTORTED",
    "POLLUTED.NORND",
    "POLLUTED.RANDOM",
]
pollution_types_2 = [
    "SYNONYM",
    "DISTORTED-activity",
    "POLLUTED.NORND-activity",
    "POLLUTED.RANDOM-activity",
]
pollution_types_3 = [
    "SYNONYM",
    "HOMONYM",
    "DISTORTED-activity",
    "POLLUTED.NORND-activity",
    "POLLUTED.RANDOM-activity",
]

# Pollution ratios; each value is interpolated into the polluted CSV file names.
percentages = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

# Keys produced by profiling() and conf_matrix() respectively.
profiling_metrics = ['UNKNOWNS', 'DISTINCT', 'AVG_DISTINCT_PER_CASE', 'UNIQUENESS']
dq_scorings = ["accuracy", "precision", "recall", "f1"]
metrics = [*profiling_metrics, *dq_scorings]
metrics_time = [*metrics, "time"]

# Number of polluted replicates per percentage processed by cleaning_experiment().
limit = 2

## Functions

In [None]:
def profiling(test_set, df, subset=None):
  """
  Computes label-profiling metrics of a training set relative to a test set.

  Args:
    test_set: DataFrame with a CCDOEV label column.
    df: training DataFrame with CCDOEV (label) and NUMPRO (grouping key,
      presumably the case identifier) columns.
    subset: not used by this function; kept for call compatibility.

  Returns:
    dict with the keys UNKNOWNS, DISTINCT, AVG_DISTINCT_PER_CASE and UNIQUENESS.
  """
  train_labels = df.CCDOEV

  # Labels that occur in the test set but never in the training set.
  test_only = set(test_set.CCDOEV.unique()) - set(train_labels.unique())

  n_distinct = train_labels.nunique()
  n_non_null = train_labels.count()
  distinct_per_group = df.groupby('NUMPRO').CCDOEV.nunique()

  return {
    'UNKNOWNS': len(test_only),
    'DISTINCT': n_distinct,
    'AVG_DISTINCT_PER_CASE': distinct_per_group.mean(),
    'UNIQUENESS': n_distinct / n_non_null,
  }

In [None]:
def conf_matrix(clean, dirty):
  """
  Scores a list of labels (`dirty`) against a reference list (`clean`).

  Returns a dict with accuracy, precision, recall and f1, each rounded to four
  decimals. Precision, recall and f1 are weighted averages; precision and
  recall are computed with zero_division=np.nan, f1 with the scorer's default.
  """
  weighted_nan = dict(average='weighted', zero_division=np.nan)

  return {
    "accuracy": round(accuracy_score(clean, dirty), 4),
    "precision": round(precision_score(clean, dirty, **weighted_nan), 4),
    "recall": round(recall_score(clean, dirty, **weighted_nan), 4),
    "f1": round(f1_score(clean, dirty, average='weighted'), 4),
  }

In [None]:
def clustering_cleaning(df, dist=lev.distance):
  """
  Performs two-step clustering on the distinct labels to identify the number
  of clusters, then replaces each label with the most common label of its
  cluster.

  Args:
    df: DataFrame with a CCDOEV label column. It is not modified.
    dist: callable taking two labels and returning their distance.

  Returns:
    A copy of `df` whose CCDOEV column holds, for every row, the label that
    occurs most often (counted over rows) within that row's cluster. Ties go to
    the label that appears first in row order.
  """
  temp = df.copy()

  labels = temp.CCDOEV.unique()
  if len(labels) < 2:
    # Nothing to merge with zero or one distinct label, and
    # AgglomerativeClustering rejects fewer than two samples.
    return temp

  distances_matrix = pairwise_distances(labels, labels, metric=dist, n_jobs=-1) # Takes the most time

  # First clustering, used only to estimate the number of clusters.
  hdbscan = HDBSCAN(min_cluster_size=2, metric="precomputed", n_jobs=-1)
  hdbscan.fit(distances_matrix)

  # Noise points are labelled -1 and therefore do not add to the count.
  num_clusters = max(hdbscan.labels_) + 1
  if num_clusters <= 0:
    num_clusters = 1

  # Second, definitive clustering
  agglo = AgglomerativeClustering(linkage='complete', metric="precomputed", n_clusters=num_clusters)
  agglo.fit(distances_matrix)

  out = dict(zip(labels, agglo.labels_))

  # Count row-level label frequencies per cluster in one pass over the column
  # (previously the whole column was rescanned once per cluster).
  cluster_counts = {}
  for label in temp.CCDOEV:
    cluster_counts.setdefault(out[label], Counter())[label] += 1

  representative = {
    cluster_id: counts.most_common(1)[0][0]
    for cluster_id, counts in cluster_counts.items()
  }

  temp.CCDOEV = temp.CCDOEV.map(out).map(representative)
  return temp

def get_embedding(sw):
  """Returns the sentence vector of `sw` (a sentence or a single word) from the loaded model."""
  return nlp_model.get_sentence_vector(sw)

def get_antonyms(word):
  """Collects WordNet antonyms of `word`, plus every lemma name of those antonyms' synsets.

  Best effort: returns an empty set when WordNet lists no antonym."""
  # First antonym of every lemma (of every synset of `word`) that has one.
  direct = [
      lemma.antonyms()[0].name()
      for synset in wordnet.synsets(word)
      for lemma in synset.lemmas()
      if lemma.antonyms()
  ]

  # Widen the set with all lemma names of the antonyms' own synsets.
  expanded = {
      lemma.name()
      for antonym in direct
      for synset in wordnet.synsets(antonym)
      for lemma in synset.lemmas()
  }

  return set(direct) | expanded

def my_nlp_distance(s1, s2):
  """
  Semantic distance between strings s1 and s2: cosine distance of their
  embeddings plus 0.25 for every word of one string found among the antonyms
  of the other string's words.
  """
  tokens1 = s1.split()
  tokens2 = s2.split()

  antonyms1 = set()
  for token in tokens1:
    antonyms1.update(get_antonyms(token))

  antonyms2 = set()
  for token in tokens2:
    antonyms2.update(get_antonyms(token))

  # Distinct words of each string that clash with the other string's antonyms.
  clashes = len(set(tokens1) & antonyms2) + len(set(tokens2) & antonyms1)
  penalty = 0.25 * clashes

  return penalty + scipy_distance.cosine(get_embedding(s1), get_embedding(s2))

def to_patterns(labels):
    """Returns the list of all consecutive label triples (sliding window of size 3, step 1)."""
    firsts, middles, lasts = labels[:-2], labels[1:-1], labels[2:]
    return [triple for triple in zip(firsts, middles, lasts)]

def to_labels(patterns):
    """Flattens a list of 3-tuples back into a label list.

    The result is the first element of the first pattern, the middle element of
    every pattern, then the last element of the last pattern."""
    head = patterns[0][0]
    tail = patterns[-1][-1]
    middles = [triple[1] for triple in patterns]
    return [head, *middles, tail]

def patterns_distance(p1, p2):
    """Distance between two 3-element patterns: the sum of my_nlp_distance
    over the strings at the same position in each pattern."""
    return sum(my_nlp_distance(p1[pos], p2[pos]) for pos in (0, 1, 2))

def sequence_cleaning(df):
  """
  Cleans a dataset by replacing rare label triples with the closest frequent one.

  Triples are built over the whole CCDOEV column in row order (no grouping is
  applied). A triple is "frequent" when its count reaches an empirical
  threshold; every other triple is mapped to the frequent triple with the
  smallest patterns_distance (first one wins on ties). The column is then
  rebuilt from the mapped triples.

  Args:
    df: DataFrame with a CCDOEV label column. It is not modified.

  Returns:
    A cleaned copy of `df`. Frames with fewer than three rows contain no triple
    and are returned as an unchanged copy.
  """
  if len(df) < 3:
    # No 3-tuple can be formed, so there is nothing to rebuild.
    return df.copy()

  patterns = to_patterns(df.CCDOEV.values)
  patterns_frequencies = Counter(patterns)

  # Empirical threshold: stricter when label counts are more dispersed.
  c = df.CCDOEV.value_counts().values
  ratio = c.std() / c.mean()
  threshold = int((0.005 if ratio < 1 else 0.01)*len(patterns)) # Empirical

  frequent_patterns = [pattern for pattern, freq in patterns_frequencies.items() if freq >= threshold]
  rare_patterns = [pattern for pattern, freq in patterns_frequencies.items() if freq < threshold]

  # Every pattern maps to itself unless a closer frequent pattern is found below.
  mapping = {pattern: pattern for pattern in patterns_frequencies}
  for rare in rare_patterns:
    best_distance = np.inf
    for candidate in frequent_patterns:
      # The distance is expensive (embeddings + WordNet): compute it once.
      candidate_distance = patterns_distance(rare, candidate)
      if candidate_distance < best_distance:
        best_distance = candidate_distance
        mapping[rare] = candidate

  rebuilt_patterns = [mapping[pattern] for pattern in patterns]

  temp = df.copy()
  temp.CCDOEV = to_labels(rebuilt_patterns)

  return temp

def cleaner(df, method, dist=lev.distance):
  """
  Cleans the labels of `df` with the requested method and prints the elapsed time.

  `method` is "clustering" (uses `dist`) or "sequences" (ignores `dist`); any
  other value returns an unmodified copy of `df`.
  """
  t0 = time.time()

  cleaned = df.copy()
  if method == "clustering":
    cleaned = clustering_cleaning(cleaned, dist=dist)
  elif method == "sequences":
    cleaned = sequence_cleaning(cleaned)

  elapsed_minutes = (time.time() - t0) / 60
  print(f"Took {round(elapsed_minutes, 3)} minutes")

  return cleaned

In [None]:
def reference_assess(dataset, pollution_type, n_runs=8):
  """
  Computes profiling and label-agreement metrics for non-cleaned datasets.

  Args:
    dataset: dataset name; used to build every input path.
    pollution_type: pollution identifier used in the polluted file names.
    n_runs: number of polluted replicates (file suffixes 0..n_runs-1) averaged
      per percentage. Defaults to 8, the value that was previously hard-coded.

  Returns:
    dict keyed by pollution percentage (0. is the clean training set). Each
    value is a dict of the profiling() and conf_matrix() metrics plus "time"
    (always 0. here, since nothing is cleaned), averaged over the replicates.

  Raises:
    ValueError: if n_runs is smaller than 1.
  """
  if n_runs < 1:
    raise ValueError("n_runs must be at least 1")

  base_dir = f"./{dataset}/{dataset}"
  prepared_dir = f"{base_dir}/{dataset}_prepared"

  test_set = pd.read_csv(f"{base_dir}/{dataset}-TEST-CLEAN.csv")

  df_ref = pd.read_csv(f"{prepared_dir}/{dataset}-TRAIN-CLEAN.csv")
  ref = df_ref.CCDOEV

  final_scores = {}

  # Clean training set, scored against itself.
  final_scores[0.] = {}
  final_scores[0.].update(profiling(test_set, df_ref))
  final_scores[0.].update(conf_matrix(ref, ref))
  final_scores[0.].update({"time":0.})

  # Uncleaned datasets
  for percentage in percentages:
    scores = {}
    for i in range(n_runs):
      df = pd.read_csv(f"{prepared_dir}/{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv")

      scores[i] = {}
      scores[i].update(profiling(test_set, df))
      scores[i].update(conf_matrix(ref, df.CCDOEV))
      scores[i].update({"time":0.})

    # Average every metric over the replicates.
    final_scores[percentage] = {key:np.average(np.array([scores[i][key] for i in range(n_runs)])) for key in scores[0]}

  return final_scores

In [None]:
def cleaning_experiment(dataset, pollution_type, method, dist=lev.distance, n_runs=None):
  """
  Runs one cleaning method on the clean and polluted training sets and scores the result.

  For the clean training set and for every percentage in `percentages`, the
  data is cleaned with cleaner(), then scored with profiling() against the test
  set and with conf_matrix() against the clean training labels. Each cleaned
  polluted frame is also written as a CSV under the dataset's `_cleaned` folder.

  Args:
    dataset: dataset name; used to build every input and output path.
    pollution_type: pollution identifier used in the polluted file names.
    method: cleaning method forwarded to cleaner().
    dist: label distance forwarded to cleaner().
    n_runs: number of polluted replicates (file suffixes 0..n_runs-1) per
      percentage. Defaults to the notebook-level `limit`.

  Returns:
    dict keyed by pollution percentage (0. is the clean training set). Each
    value holds the profiling and scoring metrics plus "time" in seconds,
    averaged over the replicates.

  Raises:
    ValueError: if the replicate count is smaller than 1.
  """
  if n_runs is None:
    n_runs = limit
  if n_runs < 1:
    raise ValueError("n_runs must be at least 1")

  final_scores = {}

  initial_time = time.time()

  base_dir = f"./{dataset}/{dataset}"
  prepared_dir = f"{base_dir}/{dataset}_prepared"
  cleaned_dir = f"{base_dir}/{dataset}_cleaned"
  # The output folder is not guaranteed to exist on a fresh checkout.
  os.makedirs(cleaned_dir, exist_ok=True)

  test_set = pd.read_csv(f"{base_dir}/{dataset}-TEST-CLEAN.csv")

  df_ref = pd.read_csv(f"{prepared_dir}/{dataset}-TRAIN-CLEAN.csv")

  # Unpolluted dataset
  start_time = time.time()
  temp = cleaner(df_ref, method, dist=dist)

  final_scores[0.] = {}
  final_scores[0.].update(profiling(test_set, temp))
  final_scores[0.].update(conf_matrix(df_ref.CCDOEV, temp.CCDOEV))
  final_scores[0.].update({"time":time.time()-start_time})

  # Polluted datasets
  for percentage in percentages:
    scores = {}
    for i in range(n_runs):
      df = pd.read_csv(f"{prepared_dir}/{dataset}-TRAIN-{pollution_type}-{percentage}-{i}.csv")
      start_time = time.time()
      temp = cleaner(df, method, dist=dist)

      # NOTE(review): the file name is tagged -HAMMING whatever `dist` is, so
      # runs with different distances overwrite each other's CSVs. The name is
      # left as is because downstream readers are not visible from here.
      temp.to_csv(f"{cleaned_dir}/{dataset}-TRAIN-{pollution_type}-{percentage}-{i}-CLEANED-{method.upper()}-HAMMING.csv", index=False) # TEMP: remove later

      scores[i] = {}
      scores[i].update(profiling(test_set, temp))
      scores[i].update(conf_matrix(df_ref.CCDOEV, temp.CCDOEV))
      # The recorded time covers cleaning, the CSV write and the scoring.
      scores[i].update({"time":time.time()-start_time})

    # Average every metric over the replicates.
    final_scores[percentage] = {key:np.average(np.array([scores[i][key] for i in range(n_runs)])) for key in scores[0]}

  print(f"Total: took {round((time.time() - initial_time)/60, 3)} minutes")

  return final_scores

In [None]:
from scipy.spatial import distance
def hamm(s1,s2):
  """
  Normalised Hamming distance between two strings of possibly different length.

  The shorter string is right-padded with "-" to the length of the longer one;
  the result is the fraction of positions whose characters differ (0.0 to 1.0).
  Two empty strings are at distance 0.0.
  """
  width = max(len(s1), len(s2))
  if width == 0:
    # Identical (empty) strings; avoids dividing by zero.
    return 0.0

  padded1 = s1.ljust(width, "-")
  padded2 = s2.ljust(width, "-")

  # Plain character comparison: same value as scipy's hamming on the two
  # character lists, without the per-call array conversion overhead.
  mismatches = sum(c1 != c2 for c1, c2 in zip(padded1, padded2))
  return mismatches / width

# BPIC '11

In [None]:
# Dataset name; interpolated into every input and output path in the cells of this section.
dataset = "BPIC11_f1"

## Reference

In [None]:
# Baseline: score the uncleaned DISTORTED training sets and pickle the result.
pollution_type = "DISTORTED"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Baseline: score the uncleaned POLLUTED.NORND training sets and pickle the result.
pollution_type = "POLLUTED.NORND"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Baseline: score the uncleaned POLLUTED.RANDOM training sets and pickle the result.
pollution_type = "POLLUTED.RANDOM"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

## Clustering

In [None]:
# Cleaning method passed to cleaning_experiment() and used in the pickle file names below.
method = "clustering"

In [None]:
# Clean the DISTORTED training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "DISTORTED"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.013 minutes
Took 0.026 minutes
Took 0.03 minutes
Took 0.023 minutes
Took 0.055 minutes
Took 0.056 minutes
Took 0.061 minutes
Took 0.173 minutes
Took 0.157 minutes
Took 0.166 minutes
Took 0.393 minutes
Took 0.366 minutes
Took 0.292 minutes
Took 0.437 minutes
Took 0.431 minutes
Took 0.443 minutes
Took 0.599 minutes
Took 0.633 minutes
Took 0.6 minutes
Total: took 5.114 minutes


In [None]:
# Clean the POLLUTED.NORND training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "POLLUTED.NORND"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.003 minutes
Took 0.009 minutes
Took 0.008 minutes
Took 0.008 minutes
Took 0.009 minutes
Took 0.019 minutes
Took 0.011 minutes
Took 0.011 minutes
Took 0.011 minutes
Took 0.011 minutes
Took 0.013 minutes
Took 0.012 minutes
Took 0.012 minutes
Took 0.016 minutes
Took 0.014 minutes
Took 0.014 minutes
Took 0.015 minutes
Took 0.014 minutes
Took 0.019 minutes
Total: took 0.387 minutes


In [None]:
# Clean the POLLUTED.RANDOM training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "POLLUTED.RANDOM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.004 minutes
Took 0.043 minutes
Took 0.041 minutes
Took 0.033 minutes
Took 0.094 minutes
Took 0.099 minutes
Took 0.105 minutes
Took 0.346 minutes
Took 0.371 minutes
Took 0.36 minutes
Took 0.856 minutes
Took 0.767 minutes
Took 0.821 minutes
Took 1.576 minutes
Took 1.674 minutes
Took 1.551 minutes
Took 2.543 minutes
Took 2.961 minutes
Took 2.482 minutes
Total: took 16.883 minutes


## Clustering Hamming

In [None]:
# Same cleaning method as before; the Hamming variant is selected through dist=hamm in the calls below.
method = "clustering"

In [None]:
# Clean the DISTORTED training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
# NOTE(review): the recorded output has 13 timing lines (1 + 6x2, i.e. limit = 2) while the
# Levenshtein cells show 19 (1 + 6x3) - `limit` was apparently changed between runs.
pollution_type = "DISTORTED"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.053 minutes
Took 0.676 minutes
Took 0.676 minutes
Took 1.543 minutes
Took 1.386 minutes
Took 3.957 minutes
Took 4.109 minutes
Took 7.229 minutes
Took 7.611 minutes
Took 11.419 minutes
Took 11.272 minutes
Took 17.636 minutes
Took 16.71 minutes
Total: took 84.369 minutes


In [None]:
# Clean the POLLUTED.NORND training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "POLLUTED.NORND"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.016 minutes
Took 0.057 minutes
Took 0.056 minutes
Took 0.075 minutes
Took 0.067 minutes
Took 0.088 minutes
Took 0.086 minutes
Took 0.101 minutes
Took 0.098 minutes
Took 0.124 minutes
Took 0.139 minutes
Took 0.118 minutes
Took 0.124 minutes
Total: took 1.237 minutes


In [None]:
# Clean the POLLUTED.RANDOM training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "POLLUTED.RANDOM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.016 minutes
Took 0.879 minutes
Took 0.872 minutes
Took 3.052 minutes
Took 3.05 minutes
Took 11.901 minutes
Took 11.455 minutes
Took 24.892 minutes
Took 24.786 minutes
Took 43.593 minutes
Took 43.362 minutes
Took 67.417 minutes
Took 67.549 minutes
Total: took 302.906 minutes


## Sequences

In [None]:
# Switch to the sequence-based cleaning method for the cells below.
method = "sequences"

In [None]:
# Clean the DISTORTED training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "DISTORTED"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.811 minutes
Took 1.058 minutes
Took 1.062 minutes
Took 1.053 minutes
Took 0.955 minutes
Took 1.11 minutes
Took 1.162 minutes
Took 0.4 minutes
Took 0.415 minutes
Took 0.408 minutes
Took 0.055 minutes
Took 0.049 minutes
Took 0.05 minutes
Took 0.068 minutes
Took 0.068 minutes
Took 0.068 minutes
Took 0.085 minutes
Took 0.089 minutes
Took 0.093 minutes
Total: took 9.221 minutes


In [None]:
# Clean the POLLUTED.NORND training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "POLLUTED.NORND"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.608 minutes
Took 0.81 minutes
Took 0.783 minutes
Took 0.767 minutes
Took 0.832 minutes
Took 0.679 minutes
Took 0.73 minutes
Took 0.136 minutes
Took 0.119 minutes
Took 0.112 minutes
Took 0.013 minutes
Took 0.012 minutes
Took 0.012 minutes
Took 0.016 minutes
Took 0.015 minutes
Took 0.013 minutes
Took 0.014 minutes
Took 0.016 minutes
Took 0.014 minutes
Total: took 5.846 minutes


In [None]:
# Clean the POLLUTED.RANDOM training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "POLLUTED.RANDOM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.76 minutes
Took 1.137 minutes
Took 1.13 minutes
Took 1.06 minutes
Took 0.997 minutes
Took 1.421 minutes
Took 1.154 minutes
Took 0.283 minutes
Took 0.305 minutes
Took 0.306 minutes
Took 0.086 minutes
Took 0.079 minutes
Took 0.077 minutes
Took 0.113 minutes
Took 0.115 minutes
Took 0.118 minutes
Took 0.157 minutes
Took 0.126 minutes
Took 0.139 minutes
Total: took 9.759 minutes


# BPIC '15

In [None]:
# Dataset name; interpolated into every input and output path in the cells of this section.
dataset = "BPIC15_1_f2"

## Reference

In [None]:
# Baseline: score the uncleaned DISTORTED training sets and pickle the result.
pollution_type = "DISTORTED"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Baseline: score the uncleaned POLLUTED.NORND training sets and pickle the result.
pollution_type = "POLLUTED.NORND"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Baseline: score the uncleaned POLLUTED.RANDOM training sets and pickle the result.
pollution_type = "POLLUTED.RANDOM"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

## Clustering

In [None]:
# Cleaning method passed to cleaning_experiment() and used in the pickle file names below.
method = "clustering"

In [None]:
# Clean the DISTORTED training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "DISTORTED"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.005 minutes
Took 0.039 minutes
Took 0.04 minutes
Took 0.038 minutes
Took 0.095 minutes
Took 0.092 minutes
Took 0.093 minutes
Took 0.256 minutes
Took 0.25 minutes
Took 0.246 minutes
Took 0.547 minutes
Took 0.519 minutes
Took 0.487 minutes
Took 0.865 minutes
Took 1.573 minutes
Took 1.556 minutes
Took 2.29 minutes
Took 2.326 minutes
Took 2.216 minutes
Total: took 13.764 minutes


In [None]:
# Clean the POLLUTED.NORND training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "POLLUTED.NORND"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.013 minutes
Took 0.045 minutes
Took 0.045 minutes
Took 0.042 minutes
Took 0.063 minutes
Took 0.058 minutes
Took 0.054 minutes
Took 0.078 minutes
Took 0.078 minutes
Took 0.074 minutes
Took 0.09 minutes
Took 0.088 minutes
Took 0.087 minutes
Took 0.1 minutes
Took 0.097 minutes
Took 0.092 minutes
Took 0.105 minutes
Took 0.108 minutes
Took 0.104 minutes
Total: took 1.801 minutes


In [None]:
# Clean the POLLUTED.RANDOM training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "POLLUTED.RANDOM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.012 minutes
Took 0.124 minutes
Took 0.098 minutes
Took 0.062 minutes
Took 0.193 minutes
Took 0.15 minutes
Took 0.155 minutes
Took 0.523 minutes
Took 0.519 minutes
Took 0.505 minutes
Took 1.051 minutes
Took 1.273 minutes
Took 1.222 minutes
Took 1.865 minutes
Took 1.844 minutes
Took 1.928 minutes
Took 3.042 minutes
Took 2.947 minutes
Took 3.474 minutes
Total: took 21.216 minutes


## Clustering Hamming

In [None]:
# Same cleaning method as before; the Hamming variant is selected through dist=hamm in the calls below.
method = "clustering"

In [None]:
# Clean the DISTORTED training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "DISTORTED"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.034 minutes
Took 0.799 minutes
Took 0.762 minutes
Took 2.199 minutes
Took 2.19 minutes
Took 6.652 minutes
Took 6.645 minutes
Took 12.739 minutes
Took 12.63 minutes
Took 19.9 minutes
Took 19.604 minutes
Took 27.84 minutes
Took 27.672 minutes
Total: took 139.766 minutes


In [None]:
# Clean the POLLUTED.NORND training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "POLLUTED.NORND"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.034 minutes
Took 0.242 minutes
Took 0.256 minutes
Took 0.372 minutes
Took 0.376 minutes
Took 0.567 minutes
Took 0.566 minutes
Took 0.705 minutes
Took 0.711 minutes
Took 0.813 minutes
Took 0.823 minutes
Took 0.93 minutes
Took 0.934 minutes
Total: took 7.424 minutes


In [None]:
# Clean the POLLUTED.RANDOM training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "POLLUTED.RANDOM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.033 minutes
Took 1.252 minutes
Took 1.263 minutes
Took 4.202 minutes
Took 4.213 minutes
Took 15.388 minutes
Took 15.399 minutes
Took 33.483 minutes
Took 33.525 minutes
Took 68.038 minutes
Took 74.61 minutes
Took 110.526 minutes
Took 98.0 minutes
Total: took 460.038 minutes


## Sequences

In [None]:
# Switch to the sequence-based cleaning method for the cells below.
method = "sequences"

In [None]:
# Clean the DISTORTED training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "DISTORTED"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.307 minutes
Took 0.347 minutes
Took 0.337 minutes
Took 0.297 minutes
Took 0.024 minutes
Took 0.022 minutes
Took 0.022 minutes
Took 0.043 minutes
Took 0.048 minutes
Took 0.056 minutes
Took 0.069 minutes
Took 0.082 minutes
Took 0.083 minutes
Took 0.113 minutes
Took 0.102 minutes
Took 0.112 minutes
Took 0.146 minutes
Took 0.164 minutes
Took 0.152 minutes
Total: took 2.737 minutes


In [None]:
# Clean the POLLUTED.NORND training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "POLLUTED.NORND"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.269 minutes
Took 0.228 minutes
Took 0.232 minutes
Took 0.227 minutes
Took 0.013 minutes
Took 0.014 minutes
Took 0.013 minutes
Took 0.022 minutes
Took 0.022 minutes
Took 0.022 minutes
Took 0.03 minutes
Took 0.03 minutes
Took 0.03 minutes
Took 0.035 minutes
Took 0.035 minutes
Took 0.035 minutes
Took 0.039 minutes
Took 0.04 minutes
Took 0.04 minutes
Total: took 1.544 minutes


In [None]:
# Clean the POLLUTED.RANDOM training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "POLLUTED.RANDOM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.257 minutes
Took 0.296 minutes
Took 0.287 minutes
Took 0.289 minutes
Took 0.022 minutes
Took 0.025 minutes
Took 0.033 minutes
Took 0.054 minutes
Took 0.052 minutes
Took 0.051 minutes
Took 0.081 minutes
Took 0.081 minutes
Took 0.084 minutes
Took 0.119 minutes
Took 0.114 minutes
Took 0.116 minutes
Took 0.158 minutes
Took 0.183 minutes
Took 0.164 minutes
Total: took 2.674 minutes


# Credit

In [None]:
# Dataset name; interpolated into every input and output path in the cells of this section.
dataset = "Credit"

## Reference

In [None]:
# Baseline: score the uncleaned DISTORTED-activity training sets and pickle the result.
pollution_type = "DISTORTED-activity"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Baseline: score the uncleaned POLLUTED.NORND-activity training sets and pickle the result.
pollution_type = "POLLUTED.NORND-activity"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Baseline: score the uncleaned POLLUTED.RANDOM-activity training sets and pickle the result.
pollution_type = "POLLUTED.RANDOM-activity"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Baseline: score the uncleaned SYNONYM training sets and pickle the result.
pollution_type = "SYNONYM"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Baseline: score the uncleaned HOMONYM training sets and pickle the result.
pollution_type = "HOMONYM"
scores = reference_assess(dataset, pollution_type)
with open(f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle", "wb") as f:
  pickle.dump(scores, f)

## Clustering

In [None]:
# Cleaning method passed to cleaning_experiment() and used in the pickle file names below.
method = "clustering"

In [None]:
# Clean the DISTORTED-activity training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "DISTORTED-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.0 minutes
Took 0.021 minutes
Took 0.019 minutes
Took 0.022 minutes
Took 0.047 minutes
Took 0.048 minutes
Took 0.044 minutes
Took 0.103 minutes
Took 0.108 minutes
Took 0.113 minutes
Took 0.174 minutes
Took 0.173 minutes
Took 0.185 minutes
Took 0.283 minutes
Took 0.251 minutes
Took 0.256 minutes
Took 0.342 minutes
Took 0.335 minutes
Took 0.351 minutes
Total: took 3.172 minutes


In [None]:
# Clean the POLLUTED.NORND-activity training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "POLLUTED.NORND-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.0 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Total: took 0.342 minutes


In [None]:
# Clean the POLLUTED.RANDOM-activity training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "POLLUTED.RANDOM-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.0 minutes
Took 0.094 minutes
Took 0.093 minutes
Took 0.095 minutes
Took 0.388 minutes
Took 0.399 minutes
Took 0.396 minutes
Took 1.61 minutes
Took 1.541 minutes
Took 1.582 minutes
Took 3.499 minutes
Took 3.274 minutes
Took 3.542 minutes
Took 6.999 minutes
Took 7.049 minutes
Took 6.426 minutes
Took 13.786 minutes
Took 12.094 minutes
Took 12.182 minutes
Total: took 75.395 minutes


In [None]:
# Clean the SYNONYM training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "SYNONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.001 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Total: took 0.353 minutes


In [None]:
# Clean the HOMONYM training sets with `method` using Levenshtein distance, then pickle the scores.
pollution_type = "HOMONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.0 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.0 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.0 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Total: took 0.284 minutes


## Clustering Hamming

In [None]:
# Same cleaning method as before; the Hamming variant is selected through dist=hamm in the calls below.
method = "clustering"

In [None]:
# Clean the DISTORTED-activity training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "DISTORTED-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.02 minutes
Took 1.893 minutes
Took 2.009 minutes
Took 3.964 minutes
Took 3.989 minutes
Took 7.898 minutes
Took 3.667 minutes
Took 11.964 minutes
Took 10.321 minutes
Took 10.077 minutes
Took 8.039 minutes
Took 12.23 minutes
Took 10.14 minutes
Total: took 86.69 minutes


In [None]:
# Clean the POLLUTED.NORND-activity training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "POLLUTED.NORND-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.001 minutes
Took 0.004 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.004 minutes
Took 0.003 minutes
Took 0.004 minutes
Total: took 0.229 minutes


In [None]:
# Clean the POLLUTED.RANDOM-activity training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "POLLUTED.RANDOM-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.004 minutes
Took 2.617 minutes
Took 2.601 minutes
Took 11.376 minutes
Took 12.418 minutes
Took 46.39 minutes
Took 41.393 minutes
Took 93.472 minutes
Took 97.41 minutes
Took 167.26 minutes
Took 164.542 minutes
Took 258.803 minutes
Took 259.795 minutes
Total: took 1158.256 minutes


In [None]:
# Clean the SYNONYM training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "SYNONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.003 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Total: took 0.177 minutes


In [None]:
# Clean the HOMONYM training sets using the padded Hamming distance; scores go to a separate -HAMMING pickle.
pollution_type = "HOMONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Total: took 0.162 minutes


## Sequences

In [None]:
# Switch to the sequence-based cleaning method for the cells below.
method = "sequences"

In [None]:
# Clean the DISTORTED-activity training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "DISTORTED-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.006 minutes
Took 1.625 minutes
Took 1.65 minutes
Took 1.618 minutes
Took 2.626 minutes
Took 2.552 minutes
Took 2.766 minutes
Took 4.583 minutes
Took 4.577 minutes
Took 4.654 minutes
Took 5.15 minutes
Took 5.18 minutes
Took 5.313 minutes
Took 5.359 minutes
Took 5.522 minutes
Took 5.282 minutes
Took 2.117 minutes
Took 1.893 minutes
Took 1.874 minutes
Total: took 64.622 minutes


In [None]:
# Clean the POLLUTED.NORND-activity training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "POLLUTED.NORND-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.003 minutes
Took 0.199 minutes
Took 0.21 minutes
Took 0.212 minutes
Took 0.318 minutes
Took 0.314 minutes
Took 0.326 minutes
Took 0.572 minutes
Took 0.547 minutes
Took 0.56 minutes
Took 0.577 minutes
Took 0.589 minutes
Took 0.582 minutes
Took 0.531 minutes
Took 0.546 minutes
Took 0.546 minutes
Took 0.104 minutes
Took 0.188 minutes
Took 0.198 minutes
Total: took 7.399 minutes


In [None]:
# Clean the POLLUTED.RANDOM-activity training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "POLLUTED.RANDOM-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.003 minutes
Took 2.743 minutes
Took 2.732 minutes
Took 2.704 minutes
Took 4.754 minutes
Took 4.995 minutes
Took 4.876 minutes
Took 8.609 minutes
Took 8.782 minutes
Took 8.751 minutes
Took 9.385 minutes
Took 9.304 minutes
Took 9.245 minutes
Took 8.851 minutes
Took 8.551 minutes
Took 8.774 minutes
Took 1.925 minutes
Took 3.18 minutes
Took 3.175 minutes
Total: took 111.61 minutes


In [None]:
# Clean the SYNONYM training sets with `method` and pickle the scores.
# `dist` is passed but cleaner() does not use it when method is "sequences".
pollution_type = "SYNONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
with open(f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle", "wb") as f:
  pickle.dump(scores, f)

Took 0.003 minutes
Took 0.116 minutes
Took 0.114 minutes
Took 0.124 minutes
Took 0.157 minutes
Took 0.149 minutes
Took 0.16 minutes
Took 0.207 minutes
Took 0.197 minutes
Took 0.21 minutes
Took 0.206 minutes
Took 0.218 minutes
Took 0.219 minutes
Took 0.349 minutes
Took 0.354 minutes
Took 0.376 minutes
Took 0.289 minutes
Took 0.319 minutes
Took 0.281 minutes
Total: took 4.32 minutes


In [None]:
# Cleaning run for the "HOMONYM" pollution type, using the `dataset` and
# `method` values set in earlier cells and passing Levenshtein distance
# (`lev.distance`) as the `dist` function. The returned scores are pickled
# under <dataset>/<dataset>/.
pollution_type = "HOMONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.003 minutes
Took 0.02 minutes
Took 0.02 minutes
Took 0.02 minutes
Took 0.023 minutes
Took 0.022 minutes
Took 0.023 minutes
Took 0.022 minutes
Took 0.022 minutes
Took 0.02 minutes
Took 0.018 minutes
Took 0.019 minutes
Took 0.017 minutes
Took 0.017 minutes
Took 0.016 minutes
Took 0.015 minutes
Took 0.015 minutes
Took 0.015 minutes
Took 0.015 minutes
Total: took 0.611 minutes


# Pub

In [None]:
# Dataset name for every cell in this section: it is passed to
# reference_assess / cleaning_experiment and also forms the output
# directory and filename of each pickle ("Pub/Pub/..._Pub_...").
dataset = "Pub"

## Reference

In [None]:
# Reference assessment (via `reference_assess`, defined earlier in the
# notebook) for the "DISTORTED-activity" pollution type on the current
# `dataset`; the returned scores are pickled as REFERENCE_<dataset>_<type>.
pollution_type = "DISTORTED-activity"
scores = reference_assess(dataset, pollution_type)
scores_path = f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Reference assessment (via `reference_assess`, defined earlier in the
# notebook) for the "POLLUTED.NORND-activity" pollution type on the current
# `dataset`; the returned scores are pickled as REFERENCE_<dataset>_<type>.
pollution_type = "POLLUTED.NORND-activity"
scores = reference_assess(dataset, pollution_type)
scores_path = f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Reference assessment (via `reference_assess`, defined earlier in the
# notebook) for the "POLLUTED.RANDOM-activity" pollution type on the current
# `dataset`; the returned scores are pickled as REFERENCE_<dataset>_<type>.
pollution_type = "POLLUTED.RANDOM-activity"
scores = reference_assess(dataset, pollution_type)
scores_path = f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

In [None]:
# Reference assessment (via `reference_assess`, defined earlier in the
# notebook) for the "SYNONYM" pollution type on the current `dataset`;
# the returned scores are pickled as REFERENCE_<dataset>_<type>.
pollution_type = "SYNONYM"
scores = reference_assess(dataset, pollution_type)
scores_path = f"{dataset}/{dataset}/REFERENCE_{dataset}_{pollution_type}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

## Clustering

In [None]:
# Cleaning method for the cells of this section: passed as the third
# argument to cleaning_experiment and embedded in each output pickle name.
method = "clustering"

In [None]:
# Cleaning run for the "DISTORTED-activity" pollution type with the current
# `dataset` and `method`, passing Levenshtein distance (`lev.distance`) as
# the `dist` function. The returned scores are pickled under
# <dataset>/<dataset>/.
pollution_type = "DISTORTED-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.0 minutes
Took 0.017 minutes
Took 0.017 minutes
Took 0.017 minutes
Took 0.039 minutes
Took 0.04 minutes
Took 0.04 minutes
Took 0.089 minutes
Took 0.09 minutes
Took 0.092 minutes
Took 0.145 minutes
Took 0.148 minutes
Took 0.149 minutes
Took 0.201 minutes
Took 0.206 minutes
Took 0.215 minutes
Took 0.263 minutes
Took 0.264 minutes
Took 0.275 minutes
Total: took 2.578 minutes


In [None]:
# Cleaning run for the "POLLUTED.NORND-activity" pollution type with the
# current `dataset` and `method`, passing Levenshtein distance
# (`lev.distance`) as the `dist` function. The returned scores are pickled
# under <dataset>/<dataset>/.
pollution_type = "POLLUTED.NORND-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.0 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Total: took 0.331 minutes


In [None]:
# Cleaning run for the "POLLUTED.RANDOM-activity" pollution type with the
# current `dataset` and `method`, passing Levenshtein distance
# (`lev.distance`) as the `dist` function. The returned scores are pickled
# under <dataset>/<dataset>/.
pollution_type = "POLLUTED.RANDOM-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.0 minutes
Took 0.108 minutes
Took 0.115 minutes
Took 0.114 minutes
Took 0.459 minutes
Took 0.537 minutes
Took 0.364 minutes
Took 1.472 minutes
Took 1.433 minutes
Took 1.438 minutes
Took 3.325 minutes
Took 3.189 minutes
Took 3.217 minutes
Took 6.261 minutes
Took 5.984 minutes
Took 6.046 minutes
Took 11.081 minutes
Took 11.319 minutes
Took 11.508 minutes
Total: took 68.295 minutes


In [None]:
# Cleaning run for the "SYNONYM" pollution type with the current `dataset`
# and `method`, passing Levenshtein distance (`lev.distance`) as the `dist`
# function. The returned scores are pickled under <dataset>/<dataset>/.
pollution_type = "SYNONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Took 0.001 minutes
Total: took 0.317 minutes


## Clustering Hamming

In [None]:
# Same method name as the preceding "Clustering" section. The runs below
# differ only by passing dist=hamm and by writing to "-HAMMING" suffixed
# pickle files, so the Levenshtein clustering results are not overwritten.
method = "clustering"

In [None]:
# Cleaning run for the "DISTORTED-activity" pollution type with the current
# `dataset` and `method`, passing `hamm` (defined earlier in the notebook;
# presumably a Hamming distance — confirm) as the `dist` function. The
# "-HAMMING" suffix keeps this output separate from the plain <method> pickle.
pollution_type = "DISTORTED-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.001 minutes
Took 0.356 minutes
Took 0.368 minutes
Took 0.851 minutes
Took 0.859 minutes
Took 1.928 minutes
Took 1.975 minutes
Took 3.189 minutes
Took 3.252 minutes
Took 4.382 minutes
Took 4.5 minutes
Took 5.769 minutes
Took 5.778 minutes
Total: took 33.363 minutes


In [None]:
# Cleaning run for the "POLLUTED.NORND-activity" pollution type with the
# current `dataset` and `method`, passing `hamm` (defined earlier in the
# notebook; presumably a Hamming distance — confirm) as the `dist` function.
# The "-HAMMING" suffix keeps this output separate from the plain <method> pickle.
pollution_type = "POLLUTED.NORND-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.001 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Took 0.003 minutes
Total: took 0.191 minutes


In [None]:
# Cleaning run for the "POLLUTED.RANDOM-activity" pollution type with the
# current `dataset` and `method`, passing `hamm` (defined earlier in the
# notebook; presumably a Hamming distance — confirm) as the `dist` function.
# The "-HAMMING" suffix keeps this output separate from the plain <method> pickle.
pollution_type = "POLLUTED.RANDOM-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.001 minutes
Took 2.725 minutes
Took 2.72 minutes
Took 13.517 minutes
Took 11.005 minutes
Took 43.782 minutes
Took 43.724 minutes
Took 99.052 minutes
Took 97.571 minutes
Took 182.527 minutes
Took 176.288 minutes
Took 276.939 minutes
Took 287.23 minutes
Total: took 1237.253 minutes


In [None]:
# Cleaning run for the "SYNONYM" pollution type with the current `dataset`
# and `method`, passing `hamm` (defined earlier in the notebook; presumably
# a Hamming distance — confirm) as the `dist` function. The "-HAMMING"
# suffix keeps this output separate from the plain <method> pickle.
pollution_type = "SYNONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=hamm)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}-HAMMING.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.002 minutes
Took 0.001 minutes
Took 0.002 minutes
Total: took 0.208 minutes


## Sequences

In [None]:
# Cleaning method for the cells of this section: passed as the third
# argument to cleaning_experiment and embedded in each output pickle name.
method = "sequences"

In [None]:
# Cleaning run for the "DISTORTED-activity" pollution type with the current
# `dataset` and `method`, passing Levenshtein distance (`lev.distance`) as
# the `dist` function. The returned scores are pickled under
# <dataset>/<dataset>/.
pollution_type = "DISTORTED-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.018 minutes
Took 1.113 minutes
Took 1.09 minutes
Took 1.109 minutes
Took 1.847 minutes
Took 1.826 minutes
Took 1.892 minutes
Took 3.415 minutes
Took 3.327 minutes
Took 3.515 minutes
Took 4.67 minutes
Took 4.406 minutes
Took 4.383 minutes
Took 5.714 minutes
Took 4.974 minutes
Took 5.278 minutes
Took 1.063 minutes
Took 1.73 minutes
Took 1.065 minutes
Total: took 52.707 minutes


In [None]:
# Cleaning run for the "POLLUTED.NORND-activity" pollution type with the
# current `dataset` and `method`, passing Levenshtein distance
# (`lev.distance`) as the `dist` function. The returned scores are pickled
# under <dataset>/<dataset>/.
pollution_type = "POLLUTED.NORND-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.015 minutes
Took 0.145 minutes
Took 0.141 minutes
Took 0.153 minutes
Took 0.221 minutes
Took 0.226 minutes
Took 0.227 minutes
Took 0.324 minutes
Took 0.331 minutes
Took 0.351 minutes
Took 0.428 minutes
Took 0.428 minutes
Took 0.438 minutes
Took 0.394 minutes
Took 0.474 minutes
Took 0.434 minutes
Took 0.635 minutes
Took 0.629 minutes
Took 0.651 minutes
Total: took 6.921 minutes


In [None]:
# Cleaning run for the "POLLUTED.RANDOM-activity" pollution type with the
# current `dataset` and `method`, passing Levenshtein distance
# (`lev.distance`) as the `dist` function. The returned scores are pickled
# under <dataset>/<dataset>/.
pollution_type = "POLLUTED.RANDOM-activity"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.015 minutes
Took 2.152 minutes
Took 2.076 minutes
Took 2.099 minutes
Took 3.95 minutes
Took 3.93 minutes
Took 3.918 minutes
Took 7.101 minutes
Took 7.015 minutes
Took 7.086 minutes
Took 8.281 minutes
Took 8.13 minutes
Took 8.23 minutes
Took 8.958 minutes
Took 8.682 minutes
Took 9.041 minutes
Took 1.85 minutes
Took 1.846 minutes
Took 1.854 minutes
Total: took 96.492 minutes


In [None]:
# Cleaning run for the "SYNONYM" pollution type with the current `dataset`
# and `method`, passing Levenshtein distance (`lev.distance`) as the `dist`
# function. The returned scores are pickled under <dataset>/<dataset>/.
pollution_type = "SYNONYM"
scores = cleaning_experiment(dataset, pollution_type, method, dist=lev.distance)
scores_path = f"{dataset}/{dataset}/CLEANING_{dataset}_{pollution_type}_{method}.pickle"
with open(scores_path, "wb") as f:
  pickle.dump(scores, f)

Took 0.015 minutes
Took 0.101 minutes
Took 0.099 minutes
Took 0.097 minutes
Took 0.134 minutes
Took 0.133 minutes
Took 0.125 minutes
Took 0.196 minutes
Took 0.185 minutes
Took 0.179 minutes
Took 0.547 minutes
Took 0.479 minutes
Took 0.501 minutes
Took 0.608 minutes
Took 0.546 minutes
Took 0.58 minutes
Took 0.606 minutes
Took 0.581 minutes
Took 0.58 minutes
Total: took 6.566 minutes
