In [6]:
import os
from pathlib import Path
from qwlist import Lazy, QList

In [7]:
# Directories holding the pipeline stage modules and the parameter files.
# All paths are relative, so they resolve against the current working directory.
VECTORIZERS_DIR = Path('..', 'stages', 'vectorizers')
DATACLEANERS_DIR = Path('..', 'stages', 'datacleaners')
DATALOADERS_DIR = Path('..', 'stages', 'dataloaders')
MODELS_DIR = Path('..', 'stages', 'models')
PARAMS_DIR = Path('..', 'params')

# Name of the file the generated parameters are written to.
GEN_FILE = 'gen_params2.0.yaml'

In [8]:
# Vectorizer module names that are left out of the generated parameter file.
IGNORED_VECTORIZERS = [
    'Vectorizer',
    'StyloMetrix',
    'HerbertFrozen',
    'HerbertFT',
    'RoBERTaFrozen',
    'RoBERTaFT',
]

In [9]:
# (dataloader, datacleaner) combinations that get entries in the generated file.
# Every enabled dataloader is currently paired with 'DummyDatacleaner'.
DATA_PAIRS = [
    (dataloader, 'DummyDatacleaner')
    for dataloader in (
        'Classics5Authors35Books',
        'OldNewspapers',
        # 'PAN18PL',               # disabled
        'PrusVsSienkiewicz',
        # 'PrusVsSienkiewiczAV',   # disabled
        'StarWarsFanfic',
        'TweeterCyberbullying',
        'WritingStyle',
        # Note recorded next to the following entry:
        # [WinError 267] The directory name is invalid: 'datasets_raw\\LOL24Dataset\\dramaty\\10014.txt'
        'EroticVsOthers',
        'StarWarsFanficShort',
    )
]

# Model names emitted in the `classification_models` section; parameter files
# are matched to a model by this name as a prefix.
CLASSIFICATION_MODELS = ['RandomForest', 'MLP', 'LogisticRegression']

# Model names emitted in the `clustering_models` section; parameter files
# are matched to a model by this name as a prefix.
CLUSTERIZATION_MODELS = ['DBSCAN', 'KMeans', 'AffinityPropagation']

In [10]:
# `text` accumulates the whole generated YAML document.
# `load` section: one list entry per dataloader.
text = 'load:\n' + ''.join(
    f'   - dataloader: {loader_name}\n' for loader_name, _ in DATA_PAIRS
)

# `clean` section: one list entry per (dataloader, datacleaner) pair.
text += '\nclean:\n' + ''.join(
    f'   - dataloader: {loader_name}\n'
    f'     datacleaner: {cleaner_name}\n'
    for loader_name, cleaner_name in DATA_PAIRS
)

def _list_vectorizers() -> list:
    """Return the sorted names of the vectorizers to put in the generated file.

    A vectorizer name is the name of a ``.py`` file found directly in
    ``VECTORIZERS_DIR`` with the extension removed. Dunder modules such as
    ``__init__.py`` and names listed in ``IGNORED_VECTORIZERS`` are left out.

    Raises:
        OSError: if ``VECTORIZERS_DIR`` cannot be listed.
    """
    names = (
        file[:-3]  # strip the '.py' extension
        for file in os.listdir(VECTORIZERS_DIR)
        # '__init__.py' and friends are package plumbing, not vectorizers
        if file.endswith('.py') and not file.startswith('__')
    )
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # generated file identical between runs and machines.
    return sorted(name for name in names if name not in IGNORED_VECTORIZERS)


def _vectorizer_section(section: str, vectorizer_names: list) -> str:
    """Render one YAML section with an entry for every data pair and vectorizer.

    The result starts with a blank line and the ``<section>:`` key. For each
    pair in ``DATA_PAIRS`` it contains a ``#`` banner line naming the
    dataloader, followed by one list entry per name in ``vectorizer_names``.
    """
    chunks = [f'\n{section}:\n']
    for loader, cleaner in DATA_PAIRS:
        chunks.append('#' * 50 + f' {loader}\n')
        for vec in vectorizer_names:
            chunks.append(f'  - dataloader: {loader}\n')
            chunks.append(f'    datacleaner: {cleaner}\n')
            chunks.append(f'    vectorizer: {vec}\n')
    return ''.join(chunks)


# The directory is listed once and reused: the three sections take the same entries.
vectorizers = _list_vectorizers()
for section in ('vectorize', 'evaluate_classification', 'evaluate_clustering'):
    text += _vectorizer_section(section, vectorizers)

# Names of all parameter files (``*.yaml``, extension removed) found anywhere
# under PARAMS_DIR, subdirectories included.
# The order in which os.walk reports file names depends on the file system, so
# the names are sorted to keep the generated file reproducible between runs.
params: QList[str] = QList(sorted(
    Lazy(os.walk(PARAMS_DIR))
    .flatmap(lambda rdf: rdf[2])                  # rdf is (root, dirs, files); keep the file names
    .filter(lambda file: file.endswith('.yaml'))
    .map(lambda s: s[:-5])                        # strip the '.yaml' extension
    .collect()
))

# Model sections: for every model, one entry per parameter-file name that
# starts with the model's name.
for section_name, model_names in (
    ('classification_models', CLASSIFICATION_MODELS),
    ('clustering_models', CLUSTERIZATION_MODELS),
):
    text += f'\n{section_name}:\n'
    for model in model_names:
        # The lambda reads `model` when called; the result is consumed right
        # here, inside the same loop iteration.
        for param in params.filter(lambda candidate: candidate.startswith(model)):
            text += f'  - model: {model}\n' + f'    params: {param}\n'



# Write the assembled document to GEN_FILE (resolved against the current
# working directory), replacing any existing file of that name.
Path(GEN_FILE).write_text(text, encoding='utf-8')