In [1]:
from pathlib import Path
import pandas as pd
from TELF.pipeline import (
    VultureCleanBlock,
    LoadTermsBlock, 
    CheetahFilterBlock, 
    TermAttributionBlock, 
    DataBundle,
    SOURCE_DIR_BUNDLE_KEY,
    SAVE_DIR_BUNDLE_KEY
)

# ───────────────────────────────────────────────────────────────────────
#  Configuration: every path and size a reader might want to tune
# ───────────────────────────────────────────────────────────────────────
DATA_DIR = Path("..") / ".." / ".." / "data"                    # shared root for all input files
RESULTS_DIR = Path("example_results") / "cheetah_term_filter"   # handed to the bundle as its save dir
N_SAMPLE_ROWS = 50                                              # only the first rows are used to keep the demo fast

# ───────────────────────────────────────────────────────────────────────
#  1) Prepare your “bundle” with the inputs your blocks expect:
#     a source directory, a save directory and the documents DataFrame.
# ───────────────────────────────────────────────────────────────────────
bundle = DataBundle({
    SOURCE_DIR_BUNDLE_KEY: DATA_DIR,
    SAVE_DIR_BUNDLE_KEY: RESULTS_DIR,
    'df': pd.read_csv(DATA_DIR / "sample2.csv").head(N_SAMPLE_ROWS),
})

# ───────────────────────────────────────────────────────────────────────
#  2) Instantiate the blocks. Nothing is executed in this cell; each
#     constructor only prints the keys the block needs and provides.
# ───────────────────────────────────────────────────────────────────────
term_block = LoadTermsBlock(
    call_settings={
        "drop_conflicts": True,
        # NOTE(review): the source-dir key is pointed at the terms *file* here,
        # not at a directory — presumably this is how the loader is told which
        # file to read; confirm against LoadTermsBlock.
        SOURCE_DIR_BUNDLE_KEY: DATA_DIR / "sample_terms2.md",
    }
)

cheetah_block = CheetahFilterBlock(
    load_checkpoint=False,
    call_settings={
        # presumably: search both the title and the abstract — confirm against CheetahFilterBlock
        "in_title": True,
        "in_abstract": True,
        "ngram_ordered": False,
        "do_results_table": True,
    }
)

# NOTE(review): `attribute_block` is not referenced by the later cells shown in
# this notebook, which create their own tagged TermAttributionBlock instances.
# It is kept so that any code relying on the name keeps working.
attribute_block = TermAttributionBlock()

[Terms] needs → (dir)   provides → (terms, substitutions, substitutions_reverse, query)
[CheetahFilter] needs → (df, query)   provides → (df, cheetah_table)
[Attribution] needs → (df, terms)   provides → (df, term_representation_df)


In [2]:
from TELF.pipeline.block_manager import BlockManager

# ───────────────────────────────────────────────────────────────────────
#  3) Assemble the pipeline and run it against the bundle.
#     Blocks execute in list order; term attribution is computed twice,
#     once before and once after the Cheetah filter, under distinct tags
#     so both results stay addressable in the returned bundle.
# ───────────────────────────────────────────────────────────────────────
pipeline_blocks = [
    VultureCleanBlock(),
    term_block,
    TermAttributionBlock(tag='Initial_Attribution'),
    cheetah_block,
    TermAttributionBlock(tag='Final_Attribution'),
]

manager = BlockManager(pipeline_blocks, bundle)

# Calling the manager runs every block and returns the populated bundle.
result = manager()

[VultureClean] needs → (df)   provides → (df, vulture_steps)
[Initial_Attribution] needs → (df, terms)   provides → (df, term_representation_df)
[Final_Attribution] needs → (df, terms)   provides → (df, term_representation_df)


Block (tag)                                │ Needs (✓/✗) │ Provides
───────────────────────────────────────────────────────────────────
VultureCleanBlock (VultureClean)           │ df          │ ['df', 'vulture_steps']
LoadTermsBlock (Terms)                     │             │ ['terms', 'substitutions', 'substitutions_reverse', 'query']
TermAttributionBlock (Initial_Attribution) │ df, terms   │ ['df', 'term_representation_df']
CheetahFilterBlock (CheetahFilter)         │ df, query   │ ['df', 'cheetah_table']
TermAttributionBlock (Final_Attribution)   │ df, terms   │ ['df', 'term_representation_df']

▶  [1/5] VultureClean …
✓  [1/5] VultureClean finished in 68.09s
▶  [2/5] Terms …
✓  [2/5] Terms finished in 1.91s
▶  [3/5] Initial_Attribution …
✓  [3/5] Initial_Attribution finished in 0.08s
▶  [4/5] CheetahFilter …
✓  [4/5] CheetahFilter finished in 0.04s
▶  [5/5] Final_Attribution …
✓  [5/5] Final_Attribution finished in 0.04s


In [3]:
# Inspect the `terms` entry that the Terms block added to the bundle.
result.terms

[{'malware family': {'positives': ['classification'],
   'negatives': ['labeling', 'early']}},
 {'decision-making': {'positives': ['reward'], 'negatives': []}},
 {'pre-train model': {'positives': ['supervised'], 'negatives': []}},
 {'pre-train': {'positives': ['supervised'], 'negatives': []}}]

In [4]:
# ───────────────────────────────────────────────────────────────────────
#  4) Pull the CheetahFilter outputs out of the result bundle.
#     Keys are namespaced as "<block tag>.<provided key>".
# ───────────────────────────────────────────────────────────────────────
filtered_df, cheetah_table = (
    result["CheetahFilter.df"],
    result["CheetahFilter.cheetah_table"],
)

In [5]:
# Summarise the filtered frame: row count, columns, non-null counts and dtypes.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   eid                   11 non-null     object 
 1   s2id                  11 non-null     object 
 2   doi                   11 non-null     object 
 3   title                 11 non-null     object 
 4   abstract              11 non-null     object 
 5   year                  11 non-null     int64  
 6   authors               11 non-null     object 
 7   author_ids            11 non-null     object 
 8   affiliations          11 non-null     object 
 9   funding               3 non-null      object 
 10  PACs                  1 non-null      object 
 11  publication_name      11 non-null     object 
 12  subject_areas         11 non-null     object 
 13  s2_authors            11 non-null     object 
 14  s2_author_ids         11 non-null     object 
 15  citations             11 

In [6]:
# Preview the first 10 rows of the table provided by the CheetahFilter block.
cheetah_table.head(10)

Unnamed: 0,filter_type,filter_value,num_papers,included_ids,included_pos
0,query,decision-making,18,64;66;6;7;8;73;42;43;44;15;16;17;18;19;60;61;6...,"[64, 66, 6, 7, 8, 73, 42, 43, 44, 15, 16, 17, ..."
1,query,malware family,25,0;3;4;6;7;9;13;14;15;16;25;26;32;34;36;39;40;4...,"[0, 3, 4, 6, 7, 9, 13, 14, 15, 16, 25, 26, 32,..."
2,query,pre-train,5,32;4;40;25;59,"[32, 4, 40, 25, 59]"
3,query,pre-train model,5,32;4;40;25;59,"[32, 4, 40, 25, 59]"


In [None]:
# Preview the cleaned text produced by the VultureClean block.
# Only the first 10 entries are shown: dumping the whole column as a list
# floods the cell output and bloats the saved notebook.
result['VultureClean.df']['clean_title_abstract'].head(10).tolist()