In [None]:
import os
# Number of threads each native numeric backend is allowed to use.
nnn = 1

# Apply the same cap to every backend-specific thread-count variable
# (shell equivalent: `export <NAME>=1` for each name below). This cell sits
# ahead of the pandas/TELF import cells so the values are in place before
# those libraries load.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        str(nnn),
    )
)

# Switch off tokenizer-level parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import pandas as pd
from pathlib import Path
import random

In [None]:
from TELF.pipeline.blocks import DataBundle, SAVE_DIR_BUNDLE_KEY, SOURCE_DIR_BUNDLE_KEY
from TELF.pipeline import BlockManager, RepeatLoopBlock

from TELF.pipeline.blocks import (
    DataBundle,
    ScopusBlock,
    S2Block,
    VultureCleanBlock,
    SquirrelBlock,
    AutoBunnySimpleBlock,
    CleanDuplicatesBlock,
    MergeScopusS2Block,
    LoadTermsBlock,
    PipelineSummaryBlock,
    OcelotFilterBlock,
    TermTableBlock
)

from TELF.pre_processing.Squirrel.pruners import EmbeddingPruner


In [None]:
# REPLACE THIS WITH YOUR OWN KEYS AND CACHE -- SEE example_hidden_keys.py
# `hidden_keys` is a user-supplied local module; it provides the Scopus and S2
# credentials plus the cache settings that the blocks below are configured with.
from hidden_keys import SCOPUS_KEYS, S2_KEY, SCOPUS_CACHE, S2_CACHE
# Reorder the Scopus keys in place (unseeded, so the order differs per run).
# Presumably this spreads usage across the available keys -- TODO confirm.
random.shuffle(SCOPUS_KEYS)

# Load Data

In [None]:
# Both sample inputs live in the same directory, two levels above the notebook.
data_dir = Path("..") / ".." / "data"

# DOI table that is handed to the pipeline under the "DOI.df" key.
df = pd.read_csv(data_dir / "sample_doi.csv")

# Initial bundle: the DOI frame, the directory results are saved to, and the
# path of the terms file.
bundle = DataBundle(
    {
        "DOI.df": df,
        SAVE_DIR_BUNDLE_KEY: Path("example_results") / "doi_search_results",
        "term_path": data_dir / "sample_terms3.md",
    }
)

# Build the Blocks

In [None]:
# Scopus block: declares the bundle's "DOI.df" entry as its input and is
# initialised with the API keys and cache name from hidden_keys.
scopus_init = {"keys": SCOPUS_KEYS, "name": SCOPUS_CACHE}
scopus_block = ScopusBlock(
    needs=("DOI.df",),
    init_settings=scopus_init,
    use_penguin=False,
)

In [None]:
# Duplicate-cleaning step, built with its default settings. The S2 block in the
# next cell requests "CleanDuplicates.df" -- presumably this block's output; confirm.
duplicate_cleaner_block = CleanDuplicatesBlock()

In [None]:
# S2 block: declares "CleanDuplicates.df" as its input and is initialised with
# the S2 key and cache name from hidden_keys.
s2_init = {"key": S2_KEY, "name": S2_CACHE}
s2_block = S2Block(
    needs=("CleanDuplicates.df",),
    init_settings=s2_init,
    use_penguin=False,
)

In [None]:
# Merge step for the Scopus and S2 results, built with its default settings.
merge_frames_block = MergeScopusS2Block()

In [None]:
# Vulture cleaning block: verbose, substitutions enabled, and initialised to
# run a single job on the threading backend.
vulture_init = {"n_jobs": 1, "parallel_backend": "threading"}
vulture_block = VultureCleanBlock(
    verbose=True,
    use_substitutions=True,
    init_settings=vulture_init,
)

In [None]:
# Terms loader: its call settings point the source-dir key at the sample terms file.
terms_file = Path("..") / ".." / "data" / "sample_terms3.md"
terms_block = LoadTermsBlock(
    call_settings={SOURCE_DIR_BUNDLE_KEY: terms_file},
)

In [None]:
# AutoBunny block: one hop, with its own vulture steps and substitutions turned
# off. Credentials come from hidden_keys; the cache directory is the parent
# folder of the Scopus cache path.
auto_bunny_init = {
    "s2_key": S2_KEY,
    "scopus_keys": SCOPUS_KEYS,
    "cache_dir": Path(SCOPUS_CACHE).parent,
}
auto_bunny_block = AutoBunnySimpleBlock(
    num_hops=1,
    use_vulture_steps=False,
    use_substitutions=False,
    init_settings=auto_bunny_init,
    verbose=True,
)

In [None]:
# Settings passed when the Ocelot filter is constructed.
ocelot_init = {
    "verbose": True,
    "use_hops": True,  # for frames that carry a 'type' hop column
}

# Settings passed when the Ocelot filter is called.
ocelot_call = {
    "positives_mode": "any",  # positives requirement applied per main term
    "global_positives_mode": "any",
    "emit_nonmatches": True,  # True logs/shows the failures in the explain table
}

# Columns used to build the text field.
ocelot_text_columns = {
    "title": "title",
    "abstract": "abstract",
    "fallback_text": "clean_title_abstract",
}

# Ocelot filter block; it starts fresh rather than loading a checkpoint.
ocelot_block = OcelotFilterBlock(
    load_checkpoint=False,
    init_settings=ocelot_init,
    call_settings=ocelot_call,
    id_field="eid",  # identifier column
    text_field="text",  # name of the text column to be built
    text_columns=ocelot_text_columns,
)

In [None]:
# Pipeline summary step, built with its default settings; it runs inside the repeat loop.
summary_block = PipelineSummaryBlock()

In [None]:
# Embedding-based pruner: SPECTER embeddings, a distance std factor of 5.0,
# existing embeddings are not overwritten, GPU use is requested.
emb_pruner = EmbeddingPruner(
    embedding_model="SPECTER",
    distance_std_factor=5.0,
    overwrite_embeddings=False,
    use_gpu=True,
    verbose=True,
)

# Squirrel settings: prune on the 'text' column, using 'type' as the label
# column with 0 as the reference label, running only the embedding pruner.
# NOTE(review): the key is spelled 'aggregrate_prune' -- confirm it matches the
# parameter name Squirrel expects before "correcting" it.
squirrel_init = {
    "data_column": "text",
    "label_column": "type",
    "reference_label": 0,
    "aggregrate_prune": True,
    "pipeline": [emb_pruner],
}

# The block's entry in the repeat loop is currently commented out, so this
# object is built but not executed by the pipeline as written.
squirrel_block = SquirrelBlock(
    low_count_backup=None,
    init_settings=squirrel_init,
)

In [None]:
# Steps executed, in order, on every pass of the loop.
loop_steps = [
    auto_bunny_block,
    vulture_block,
    ocelot_block,
    # squirrel_block,
    summary_block,
    TermTableBlock(use_checkpoint=False),
]

# Repeat the steps above for a fixed number of iterations.
auto_bunny_unrolled_block = RepeatLoopBlock(
    subblocks=loop_steps,
    n_iter=2,  # 5
    clone=False,  # carry state forward between iterations
    # redirect_save_dir=True,  # writes into ./TrainLoop/iter_00/, iter_01/, ...
    tag="bunny_unrolled",
    capture_output=None,  # 'file'
)

# Block Manager

In [None]:
# Top-level execution order. `vulture_block` is the same instance that also
# runs inside the repeat loop.
pipeline_blocks = [
    scopus_block,
    duplicate_cleaner_block,
    s2_block,
    merge_frames_block,
    terms_block,
    vulture_block,
    auto_bunny_unrolled_block,
    TermTableBlock(use_checkpoint=False),
]

# capture_output accepts "memory", "file" or None:
#   "file"   -> timestamped logs under results/logs/
#   "memory" -> logs reachable through manager.block_logs.keys()
manager = BlockManager(
    blocks=pipeline_blocks,
    databundle=bundle,
    progress=True,  # show which block is executing
    capture_output=None,  # 'file'
)

In [None]:
# Run the pipeline. The returned value is bound to `bundle`, replacing the
# initial input bundle built earlier in the notebook.
bundle = manager()

In [None]:
# List the keys available on the bundle after the run.
bundle.keys()

In [None]:
# Display the bundle's `df` attribute (presumably the latest DataFrame produced
# by the pipeline -- confirm against DataBundle).
bundle.df