In [26]:
from enum import Enum
import yaml

In [2]:
# Names of every vectorizer to generate pipeline parameters for.
# List order is preserved in the generated parameter lists.
ALL_VECTORIZERS = [
    # Morphological-tag based vectorizers
    "BigramMorphTagVectorizer100",
    "BigramMorphTagVectorizer370",
    # Count based vectorizers
    "CountVectorizer1000",
    "CountVectorizer5000",
    # Remaining vectorizers, kept in their original (alphabetical) positions
    "DPEBPVectorizer100Avg",
    "FullMorphTagVectorizer",
    "HerbertVectorizer",
    "SpacyMorphTagVectorizer",
    "StyloMetrix",
    # TF-IDF based vectorizers
    "TfidfVectorizer1000",
    "TfidfVectorizer5000",
]

In [3]:
class Task(Enum):
    """Kinds of evaluation that can be requested for a dataset."""

    # Selects the "evaluate_classification" parameter list.
    classification = 1
    # Selects the "evaluate_clustering" parameter list.
    clustering = 2

# One entry per dataset, as a (dataloader, datacleaner, tasks) triple.
# `tasks` lists the Task members the dataset should be evaluated on.
DATALOADERS_DATACLEANERS = [
    (
        "TweeterCyberbullying",
        "DummyDatacleaner",
        [Task.classification],
    ),
    (
        "PrusVsSienkiewicz",
        "DummyDatacleaner",
        [Task.classification],
    ),
]

In [4]:
# Per-dataloader list of vectorizers to skip when generating parameters.
# Keys are dataloader names, values are lists of vectorizer names, e.g.:
#     "TweeterCyberbullying": ["HerbertVectorizer"]
EXCLUDES = {}

In [30]:
# Build the per-stage parameter lists. Each list name becomes a top-level key
# of the generated YAML file, and each element is one parameter combination.
load = []
clean = []
vectorize = []
evaluate_classification = []
evaluate_clustering = []

for dataloader, datacleaner, tasks in DATALOADERS_DATACLEANERS:
    load.append({"dataloader": dataloader})
    clean.append({"dataloader": dataloader, "datacleaner": datacleaner})

    # Vectorizers listed in EXCLUDES for this dataloader are skipped.
    excluded = EXCLUDES.get(dataloader, [])

    for vectorizer in ALL_VECTORIZERS:
        if vectorizer in excluded:
            continue
        vectorize_params = {
            "dataloader": dataloader,
            "datacleaner": datacleaner,
            "vectorizer": vectorizer,
        }
        # Every list receives its own copy of the dict. Sharing one object
        # between lists would presumably make yaml.dump emit anchors/aliases
        # (&id001 / *id001) instead of plain mappings.
        vectorize.append(vectorize_params.copy())
        if Task.classification in tasks:
            evaluate_classification.append(vectorize_params.copy())
        if Task.clustering in tasks:
            evaluate_clustering.append(vectorize_params.copy())

params = {
    "load": load,
    "clean": clean,
    "vectorize": vectorize,
    "evaluate_classification": evaluate_classification,
    "evaluate_clustering": evaluate_clustering,
}

# Read the hand-written model sections. UTF-8 is given explicitly so the
# result does not depend on the platform's default encoding.
with open("./models.yaml", "r", encoding="utf-8") as file:
    models = yaml.safe_load(file)

# safe_load returns None for an empty document; treat that as "no extra
# sections" instead of failing on `None.keys()`.
if models is None:
    models = {}
if not isinstance(models, dict):
    raise TypeError(
        "./models.yaml must contain a mapping at the top level, "
        f"got {type(models).__name__}"
    )

# Keys from models.yaml are merged last, so a key that matches one of the
# generated stage names replaces the generated list.
params.update(models)

# sort_keys=False keeps the insertion order of `params` in the output file.
with open("./generated_params.yaml", "w", encoding="utf-8") as file:
    yaml.dump(params, file, default_flow_style=False, sort_keys=False)