In [None]:
import os
# Thread count written into each numerical-backend environment variable below.
nnn = 1

# Equivalent to running `export <VAR>=1` in the shell for each listed variable
# (OpenMP, OpenBLAS, MKL, vecLib and NumExpr thread settings).
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        str(nnn),
    )
)

# Set the tokenizers parallelism flag to "false".
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from pathlib import Path
import random

In [None]:
from TELF.pipeline.blocks import DataBundle, SAVE_DIR_BUNDLE_KEY
from TELF.pipeline import BlockManager

from TELF.pipeline.blocks import (
    DataBundle,
    VultureCleanBlock,
    BeaverVocabBlock,
    AutoBunnyBlock,
    SquirrelBlock,
    OrcaBlock,
    WolfBlock,
    CleanAffiliationsBlock,
    BeaverDocWordBlock,
    SemanticHNMFkBlock,
    ArticFoxBlock,
    TermAttributionBlock,
    LoadTermsBlock,
    TermAttributionBlock,
    TermSearchBlock,
    FunctionBlock
)

In [None]:
# Replace `hidden_keys` with your own module that defines the API keys
# (SCOPUS_KEYS, S2_KEY) and the cache settings (SCOPUS_CACHE, S2_CACHE).
from hidden_keys import SCOPUS_KEYS, S2_KEY, SCOPUS_CACHE, S2_CACHE
# Shuffle the key list in place so every run starts from a different ordering
# (presumably to spread usage across the keys — TODO confirm).
random.shuffle(SCOPUS_KEYS)

# Load Data

In [None]:
# Initial bundle: holds only the save-directory entry, keyed by
# SAVE_DIR_BUNDLE_KEY.
bundle = DataBundle(
    {
        SAVE_DIR_BUNDLE_KEY: Path('./example_results/term_search_results'),
    }
)

# Build the Blocks

In [None]:
# Terms are read from the sample markdown file given as the block's `path`
# call setting (path is relative to the notebook's working directory).
terms_block = LoadTermsBlock(
    call_settings={'path': '../../data/sample_terms.md'},
)

In [None]:
# One settings dict per backend key ('scopus', 's2'). Both use mode 'fs' and
# take their key(s) and cache name from the values imported from hidden_keys.
term_search_block = TermSearchBlock(
    init_settings={
        'scopus': {
            'keys': SCOPUS_KEYS,
            'mode': 'fs',
            'name': SCOPUS_CACHE,
        },
        's2': {
            'key': S2_KEY,
            'mode': 'fs',
            'name': S2_CACHE,
        },
    },
)

In [None]:
def take_head(df, N: int):
    """Return ``df.head(N)``.

    Parameters
    ----------
    df
        Any object exposing a ``head(n)`` method (presumably a pandas
        DataFrame; the code only relies on ``.head``).
    N : int
        Row count forwarded unchanged to ``df.head``.

    Returns
    -------
    Whatever ``df.head(N)`` returns.
    """
    leading_rows = df.head(N)
    return leading_rows

# Wrap take_head as a pipeline block tagged "ReduceDf". It declares "df" as
# both its `needs` and its `provides`, and passes N=5 as a call setting.
df_reduction_block = FunctionBlock(
    needs=("df",),
    provides=("df",),
    function_call=take_head,
    call_settings={"N": 5},
    tag="ReduceDf",
)

In [None]:
# Cleaning block: verbose output and substitutions enabled; initialised with a
# single job on the 'threading' parallel backend.
vulture_block = VultureCleanBlock(
    verbose=True,
    use_substitutions=True,
    init_settings={
        "n_jobs": 1,
        'parallel_backend': 'threading',
    },
)

In [None]:
# Vocabulary block: reads "VultureClean.df" and is called with min_df=2.
vocab_block = BeaverVocabBlock(
    call_settings={'min_df': 2},
    needs=("VultureClean.df",),
)

In [None]:
# AutoBunny block limited to one hop, with vulture steps enabled. It receives
# the S2 / Scopus keys and uses the parent directory of SCOPUS_CACHE as its
# cache_dir.
auto_bunny_block = AutoBunnyBlock(
    num_hops=1,
    use_vulture_steps=True,
    init_settings={
        's2_key': S2_KEY,
        'scopus_keys': SCOPUS_KEYS,
        'cache_dir': Path(SCOPUS_CACHE).parent,
    },
)

In [None]:
# TermAttributionBlock with default arguments; listed in both BlockManager
# pipelines below.
# NOTE(review): the identical assignment is repeated in a later cell, which
# rebinds this name to a fresh instance.
term_attribute_block =  TermAttributionBlock()

In [None]:
# SquirrelBlock with default arguments; listed in both BlockManager pipelines
# below.
squirrel_block = SquirrelBlock()

In [None]:
# CleanAffiliationsBlock with default arguments; listed in both BlockManager
# pipelines below.
clean_affiliations_block = CleanAffiliationsBlock()

In [None]:
# NOTE(review): duplicate cell. `term_attribute_block` is already created by
# an identical assignment a few cells earlier; this rebinding is the instance
# the pipelines below actually receive. Consider keeping only one of the two.
term_attribute_block =  TermAttributionBlock()

In [None]:
# OrcaBlock with default arguments; listed in both BlockManager pipelines
# below.
orca_block = OrcaBlock()

In [None]:
# Derived from vulture_block via .copy(), passing needs=('Orca.df',)
# (presumably an otherwise identical block that reads the Orca output instead —
# TODO confirm copy semantics). Only the second pipeline below includes it.
post_expansion_vulture_block = vulture_block.copy(needs=('Orca.df',))

In [None]:
# Two WolfBlock instances that differ only in tag and category:
# one for 'co-author', one for 'co-affiliation'.
wolf_coauthor_block = WolfBlock(
    tag="WolfAuthor",
    category='co-author',
)
wolf_coaffiliation_block = WolfBlock(
    tag="WolfAffil",
    category='co-affiliation',
)

In [None]:
# Document-word block tagged "DocWord"; it reads the cleaned dataframe and the
# vocabulary through their tag-qualified bundle keys.
matrix_block = BeaverDocWordBlock(
    tag="DocWord",
    needs=(
        "VultureClean.df",
        "BeaverVocab.vocabulary",
    ),
)

In [None]:
# Semantic HNMFk block. Inputs: the "DocWord.X" matrix, the cleaned dataframe
# and the vocabulary.
# NOTE(review): "vocabulary" is unqualified here, whereas matrix_block asks
# for "BeaverVocab.vocabulary" — confirm both resolve to the same entry.
semantic_hfactor_block = SemanticHNMFkBlock(
    needs=(
        "DocWord.X",
        "VultureClean.df",
        "vocabulary",
    ),
    init_settings={
        "depth": 1,
        "sample_thresh": 4,
        "Ks_deep_max": 6,
    },
    call_settings={
        "Ks": range(1, 16),  # candidate values 1..15
    },
)

In [None]:
# Post-processing block; the Ollama model name is passed as a call setting.
post_process_block = ArticFoxBlock(
    call_settings={"ollama_model": "llama3.2:3b-instruct-fp16"},
)

# Block Manager

In [None]:
# Optional stages that are switched off for this run. They stay as named
# placeholders so their position in the pipeline remains visible.
eagle_block = None
wolf_pack = None

# NOTE(review): with eagle_block disabled, nothing in this list visibly
# supplies the "df" that ReduceDf declares in `needs` (the second pipeline has
# term_search_block in that slot) — confirm the bundle already holds one.
manager = BlockManager(
    # Drop the disabled (None) placeholders so that only real block objects
    # are handed to the manager.
    blocks=[
        block
        for block in [
            terms_block,
            eagle_block,
            df_reduction_block,
            vulture_block,
            squirrel_block,
            clean_affiliations_block,
            orca_block,
            term_attribute_block,
            wolf_coauthor_block,
            wolf_coaffiliation_block,
            wolf_pack,
        ]
        if block is not None
    ],
    databundle=bundle,
    progress=True,  # see which block is executing
    # capture_output accepts "memory", "file" or None:
    #   "file"   -> timestamped logs under results/logs/
    #   "memory" -> logs reachable through manager.block_logs.keys()
    capture_output="file",
)

In [None]:
# Run the manager built above. Its return value rebinds `bundle`, which the
# next BlockManager cell receives as its `databundle`.
bundle = manager()

In [None]:
# Full pipeline: adds term search, vocabulary / matrix / HNMFk, AutoBunny,
# the post-expansion cleaning copy and the post-processing block to the stages
# used above. Rebinds `manager`; the current `bundle` is its starting bundle.
manager = BlockManager(
    blocks=[
        terms_block,
        term_search_block,
        df_reduction_block,
        vulture_block,
        vocab_block,
        matrix_block,
        semantic_hfactor_block,
        auto_bunny_block,
        squirrel_block,
        clean_affiliations_block,
        orca_block,
        post_expansion_vulture_block,
        term_attribute_block,
        wolf_coauthor_block,
        wolf_coaffiliation_block,
        post_process_block,
    ],
    databundle=bundle,
    progress=True,  # see which block is executing
    # capture_output accepts "memory", "file" or None:
    #   "file"   -> timestamped logs under results/logs/
    #   "memory" -> logs reachable through manager.block_logs.keys()
    capture_output="file",
)