In [1]:
from pathlib import Path
import pandas as pd
from TELF.pipeline import (
    VultureCleanBlock,
    LoadTermsBlock, 
    OcelotFilterBlock, 
    TermAttributionBlock, 
    DataBundle,
    SOURCE_DIR_BUNDLE_KEY,
    SAVE_DIR_BUNDLE_KEY
)

# ───────────────────────────────────────────────────────────────────────
#  1) Prepare your “bundle” with the two inputs your blocks expect:
# ───────────────────────────────────────────────────────────────────────
bundle = DataBundle({
    SOURCE_DIR_BUNDLE_KEY: Path("..") / ".." / ".." / "data" ,
    SAVE_DIR_BUNDLE_KEY: Path("example_results") / "cheetah_term_filter" ,
    'df': pd.read_csv(Path("..") / ".." / ".." / "data" / "sample2.csv").head(50),
    'term_path': Path("..") / ".." / ".." / "data" / "sample_terms3.md",
})
# ───────────────────────────────────────────────────────────────────────
#  2) Instantiate your term‐loader block and run it
# ───────────────────────────────────────────────────────────────────────
# term_block = LoadTermsBlock(
#     call_settings={
#         "drop_conflicts": True,
#         SOURCE_DIR_BUNDLE_KEY : Path("..") / ".." / ".." / "data" / "sample_terms2.md",
#     }
# )
cheetah_block = OcelotFilterBlock(
    load_checkpoint=False,
    init_settings={
        "verbose": True,
        "use_hops": True,            # if your df has a 'type' hop column
    },
    call_settings={
        "positives_mode": "any",     # per-main positives requirement
        "global_positives_mode": "any",
        "emit_nonmatches": False,    # set True to log/show fails in the explain table
    },
    id_field="eid",                   # identifier column
    text_field="text",                # text column to search; will be built if missing
    text_columns={"title": "title", "abstract": "abstract", "fallback_text": "title_abstract"},
)

attribute_block =   TermAttributionBlock()

  from .autonotebook import tqdm as notebook_tqdm


[OcelotFilter] needs → (df, term_path)   provides → (df, ocelot_table)
[Attribution] needs → (df, terms)   provides → (df, term_representation_df)


In [2]:
from TELF.pipeline.block_manager import BlockManager

manager = BlockManager(
    [
        VultureCleanBlock(),
        # term_block,
        # TermAttributionBlock(tag='Initial_Attribution'),
        cheetah_block,
        # TermAttributionBlock(tag='Final_Attribution')
    ],
    bundle
)
result = manager()

[VultureClean] needs → (df)   provides → (df, vulture_steps)


Block (tag)                      │ Needs (✓/✗)   │ Provides
───────────────────────────────────────────────────────────
VultureCleanBlock (VultureClean) │ df            │ ['df', 'vulture_steps']
OcelotFilterBlock (OcelotFilter) │ df, term_path │ ['df', 'ocelot_table']

▶  [1/2] VultureClean …
✓  [1/2] VultureClean finished in 8.55s
▶  [2/2] OcelotFilter …
✓  [2/2] OcelotFilter finished in 0.02s


In [6]:
# ───────────────────────────────────────────────────────────────────────
#  4) Grab your results:
# ───────────────────────────────────────────────────────────────────────
filtered_df    = result["OcelotFilter.df"]
cheetah_table  = result["OcelotFilter.ocelot_table"]

In [7]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   eid                   12 non-null     object 
 1   s2id                  12 non-null     object 
 2   doi                   12 non-null     object 
 3   title                 12 non-null     object 
 4   abstract              12 non-null     object 
 5   year                  12 non-null     int64  
 6   authors               12 non-null     object 
 7   author_ids            12 non-null     object 
 8   affiliations          12 non-null     object 
 9   funding               0 non-null      object 
 10  PACs                  2 non-null      object 
 11  publication_name      12 non-null     object 
 12  subject_areas         12 non-null     object 
 13  s2_authors            12 non-null     object 
 14  s2_author_ids         12 non-null     object 
 15  citations             11 

In [8]:
cheetah_table.head(10)

Unnamed: 0,index,passed,matched_main,matched_positives,matched_negatives,text
0,b0be8bea-217a-421a-9b6e-fff66f54b2cf,True,Edge Computing,Machine Learning,,How Transformers are Revolutionizing NLP Machi...
1,d891110d-415f-4a46-af6c-c92b1cc8b588,True,Self-Supervised Learning,Next Big Thing in AI,,Self-Supervised Learning: The Next Big Thing i...
2,a5024848-53bf-48d3-a367-ea684581dd4d,True,Edge Computing,Machine Learning,,Machine Learning for Edge Computing Applicatio...
3,c3860dc4-c5a3-46ff-866d-764140797a30,True,Self-Supervised Learning,Next Big Thing in AI,,Self-Supervised Learning: The Next Big Thing i...
4,8bd1dc3d-421a-441d-9fa4-21e624d24ced,True,Mystery,,,Unraveling the Mystery of Black Box AI Identif...
5,4df6a9cd-c60f-4d46-a30d-d7c9b0bf0bea,True,Mystery,,,Unraveling the Mystery of Black Box AI Explori...
6,d90ea739-5380-49e3-b4e0-b4569f43f95f,True,Self-Supervised Learning,Next Big Thing in AI,,Self-Supervised Learning: The Next Big Thing i...
7,2d90d690-3b1c-4a7b-8e01-557318499997,True,Self-Supervised Learning,Next Big Thing in AI,,Self-Supervised Learning: The Next Big Thing i...
8,77efd842-d5c5-4787-bd7d-ae5bcd48c71b,True,Mystery,,,Unraveling the Mystery of Black Box AI AI-Powe...
9,12a243c3-bf37-4dab-b908-024b9af5bcfa,True,Mystery,,,Unraveling the Mystery of Black Box AI Cyberse...


In [9]:
result['VultureClean.df'].clean_title_abstract.tolist()

['transformer revolutionize nlp automl future automate science vulnerability pose threat cybersecurity kernel trick svms enable efficient classification separable graph neural network excel process structure graph support vector machine space classification supervisory control acquisition scada serve nervous substation power grid facilitate monitor acquisition control equipment ensure smooth efficient operation substation connect device dependence scada grow risk potential malicious intrusion outage permanent damage grid dimensionality approach principal analysis pca accurate identification anomaly scada scada matrix factorization nmf strong detect anomaly wireless sensor network unsupervised approach normal expect behavior detect unseen type attack anomaly identify event deviate expect behavior approach complex multi-dimensional interaction naturally scada differently tensor decomposition powerful unsupervised machine learn complex multi-faceted activity detail scada event novelly ten