In [5]:
import pandas as pd
from pathlib import Path

from TELF.pipeline import BlockManager 
from TELF.pipeline.blocks import (
    DataBundle,
    LoadTermsBlock,
    TermAttributionBlock,
    VultureCleanBlock,
    SAVE_DIR_BUNDLE_KEY,
    DIR_LIST_BUNDLE_KEY,
    RESULTS_DEFAULT,
    SOURCE_DIR_BUNDLE_KEY
)

In [6]:
# Instantiate the three pipeline stages. Each constructor prints a one-line
# "needs → provides" summary, so the creation order below is kept as-is to
# leave the cell output unchanged; execution order is decided later by the
# list handed to BlockManager, not by the order here.
attribute_block = TermAttributionBlock()

terms_block = LoadTermsBlock()

# Settings passed through `init_settings`.
# NOTE(review): presumably forwarded to the underlying Vulture cleaner, with
# n_jobs=-1 meaning "use all available workers" (joblib convention) — confirm.
vulture_settings = {"n_jobs": -1, "parallel_backend": "threading"}

# NOTE(review): use_substitutions=False presumably tells the cleaning step not
# to apply the substitutions that the Terms block provides — confirm against
# the VultureCleanBlock documentation.
vulture_block = VultureCleanBlock(
    init_settings=vulture_settings,
    use_substitutions=False,
)

[Attribution] needs → (df, terms)   provides → (df, term_representation_df)
[Terms] needs → (dir)   provides → (terms, substitutions, substitutions_reverse, query)
[VultureClean] needs → (df)   provides → (df, vulture_steps)


In [7]:
# Execution order of the stages. Per the needs/provides summaries printed when
# the blocks were created, Attribution needs both `df` and `terms`, so the
# Terms block (which provides `terms`) has to run before it.
pipeline_blocks = [terms_block, vulture_block, attribute_block]

# Shared data directory, three levels above the notebook's working directory.
data_dir = Path("..", "..", "..", "data")
save_dir = Path("example_results", "term_attribution_example")

# Only the first 50 rows of the sample file are kept for this example.
sample_df = pd.read_csv(data_dir / "sample2.csv").head(50)

# Initial bundle handed to the pipeline.
# NOTE(review): SOURCE_DIR_BUNDLE_KEY is presumably the `dir` entry that the
# Terms block lists under "needs" — confirm.
bundle = DataBundle({
    SOURCE_DIR_BUNDLE_KEY: data_dir,
    SAVE_DIR_BUNDLE_KEY: save_dir,
    "df": sample_df,
})

In [8]:
# Wire the ordered list of blocks to the initial bundle.
manager = BlockManager(pipeline_blocks, databundle=bundle)
# Calling the manager runs the blocks; the progress log printed by this cell
# shows them executing in list order ([1/3] Terms, [2/3] VultureClean,
# [3/3] Attribution).
# NOTE(review): `bundle` is rebound to the call's return value (presumably the
# populated DataBundle). Re-running only this cell would therefore pass that
# result back in as the input; re-run the bundle-construction cell first for a
# clean run.
bundle = manager()

Block (tag)                        │ Needs (✓/✗) │ Provides
───────────────────────────────────────────────────────────
LoadTermsBlock (Terms)             │ dir         │ ['terms', 'substitutions', 'substitutions_reverse', 'query']
VultureCleanBlock (VultureClean)   │ df          │ ['df', 'vulture_steps']
TermAttributionBlock (Attribution) │ df, terms   │ ['df', 'term_representation_df']

▶  [1/3] Terms …
✓  [1/3] Terms finished in 1.89s
▶  [2/3] VultureClean …
✓  [2/3] VultureClean finished in 71.72s
▶  [3/3] Attribution …
✓  [3/3] Attribution finished in 0.11s
