In [None]:
import os
# Thread count written to each threading-related environment variable below.
# This cell runs before pandas and TELF are imported in the following cells.
nnn = 1
for _thread_env_var in (
    "OMP_NUM_THREADS",         # export OMP_NUM_THREADS=1
    "OPENBLAS_NUM_THREADS",    # export OPENBLAS_NUM_THREADS=1
    "MKL_NUM_THREADS",         # export MKL_NUM_THREADS=1
    "VECLIB_MAXIMUM_THREADS",  # export VECLIB_MAXIMUM_THREADS=1
    "NUMEXPR_NUM_THREADS",     # export NUMEXPR_NUM_THREADS=1
):
    os.environ[_thread_env_var] = str(nnn)

# Presumably read by the Hugging Face tokenizers library to disable its own
# parallelism -- confirm against the libraries TELF loads.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import pandas as pd
from pathlib import Path
import random

In [None]:
from TELF.pipeline.blocks import DataBundle, SAVE_DIR_BUNDLE_KEY, SOURCE_DIR_BUNDLE_KEY
from TELF.pipeline import BlockManager

from TELF.pipeline.blocks import (
    DataBundle,
    ScopusBlock,
    S2Block,
    VultureCleanBlock,
    BeaverVocabBlock,
    AutoBunnyBlock,
    SquirrelBlock,
    OrcaBlock,
    WolfBlock,
    CleanDuplicatesBlock,
    MergeScopusS2Block,
    CleanAffiliationsBlock,
    BeaverDocWordBlock,
    SemanticHNMFkBlock,
    ArticFoxBlock,
    TermAttributionBlock,
    LoadTermsBlock,
    TermAttributionBlock,
)

In [None]:
# REPLACE THIS WITH YOUR OWN KEYS AND CACHE -- SEE example_hidden_keys.py
# hidden_keys is a user-supplied module; it provides the API keys, cache
# names and Penguin settings that the block definitions below pass along.
from hidden_keys import SCOPUS_KEYS, S2_KEY, SCOPUS_CACHE, S2_CACHE, PENGUIN_SETTINGS
# Reorders SCOPUS_KEYS in place (unseeded, so the order differs per run).
# Presumably done to spread usage across the available keys -- confirm.
random.shuffle(SCOPUS_KEYS)

# Load Data

In [None]:
# Read the sample DOI table and build the initial DataBundle from it, along
# with the results directory and the path to the sample terms file.
sample_data_dir = Path("..") / ".." / "data"
df = pd.read_csv(sample_data_dir / "sample_doi.csv")

bundle = DataBundle(
    {
        "DOI.df": df,
        SAVE_DIR_BUNDLE_KEY: Path("example_results") / "doi_search_results",
        SOURCE_DIR_BUNDLE_KEY: sample_data_dir / "sample_terms.md",
    }
)

In [None]:
# Number of rows in the DataFrame read from sample_doi.csv.
len(df)

# Build the Blocks

In [None]:
# Scopus block: takes "DOI.df" from the bundle and is configured with the
# Scopus keys and cache name; Penguin is switched off for this example.
scopus_block = ScopusBlock(
    needs=("DOI.df",),
    init_settings={"keys": SCOPUS_KEYS, "name": SCOPUS_CACHE},
    penguin_settings=PENGUIN_SETTINGS,
    use_penguin=False,
)

In [None]:
# Duplicate-cleaning block built with its default settings.
duplicate_cleaner_block = CleanDuplicatesBlock()

In [None]:
# S2 block: takes "CleanDuplicates.df" from the bundle and is configured
# with the S2 key and cache name; Penguin is switched off for this example.
s2_block = S2Block(
    needs=("CleanDuplicates.df",),
    init_settings={"key": S2_KEY, "name": S2_CACHE},
    penguin_settings=PENGUIN_SETTINGS,
    use_penguin=False,
)

In [None]:
# Scopus/S2 merge block built with its default settings.
merge_frames_block = MergeScopusS2Block()

In [None]:
# Vulture cleaning block: verbose, substitutions enabled, and initialised
# with a single job on the "threading" backend.
vulture_block = VultureCleanBlock(
    verbose=True,
    use_substitutions=True,
    init_settings={"n_jobs": 1, "parallel_backend": "threading"},
)

In [None]:
# Vocabulary block: called with min_df=2 and reads "VultureClean.df".
vocab_block = BeaverVocabBlock(
    call_settings={"min_df": 2},
    needs=("VultureClean.df",),
)

In [None]:
# AutoBunny block configured for a single hop with Vulture steps enabled.
# NOTE(review): this block is commented out of the BlockManager list further
# down, so it is defined here but not part of the run as written.
auto_bunny_block = AutoBunnyBlock(
    num_hops = 1,
    use_vulture_steps=True,
    init_settings = {
        's2_key': S2_KEY,
        'scopus_keys': SCOPUS_KEYS,
        # Cache directory is the parent folder of the Scopus cache path.
        'cache_dir': Path(SCOPUS_CACHE).parent,
    },
)

In [None]:
# Term-attribution block built with its default settings.
# NOTE(review): the same name is assigned again by an identical statement a
# few cells below; one of the two cells is redundant.
term_attribute_block =  TermAttributionBlock()

In [None]:
# Squirrel block built with its default settings.
# NOTE(review): commented out of the BlockManager list further down.
squirrel_block = SquirrelBlock()

In [None]:
# Affiliation-cleaning block built with its default settings.
clean_affiliations_block = CleanAffiliationsBlock()

In [None]:
# NOTE(review): duplicate of the identical assignment a few cells above. This
# later binding is the object the BlockManager list receives; consider
# removing one of the two cells.
term_attribute_block =  TermAttributionBlock()

In [None]:
# Terms-loading block: its call settings point the source key at the sample
# terms markdown file.
terms_block = LoadTermsBlock(
    call_settings={
        SOURCE_DIR_BUNDLE_KEY: Path("..") / ".." / "data" / "sample_terms.md",
    },
)

In [None]:
# Orca block built with its default settings.
orca_block = OrcaBlock()

In [None]:
# Second Vulture block, derived from vulture_block via copy() with its input
# switched to "Orca.df". Presumably all other settings carry over -- confirm
# against the block's copy() implementation.
post_expansion_vulture_block = vulture_block.copy(needs=('Orca.df',))

In [None]:
# Two Wolf blocks that differ only in tag and category: one for the
# 'co-author' category and one for the 'co-affiliation' category.
wolf_coauthor_block = WolfBlock(tag="WolfAuthor", category='co-author')
wolf_coaffiliation_block = WolfBlock(tag="WolfAffil", category='co-affiliation')

In [None]:
# Document-word block tagged "DocWord"; it reads the cleaned frame and the
# vocabulary under their namespaced bundle keys.
matrix_block = BeaverDocWordBlock(
    tag="DocWord",
    needs=(
        "VultureClean.df",
        "BeaverVocab.vocabulary",
    ),
)

In [None]:
# Semantic HNMFk block reading the document-word matrix, the cleaned frame
# and the vocabulary from the bundle.
# NOTE(review): "vocabulary" is a bare key while the other two inputs are
# namespaced (and the DocWord block reads "BeaverVocab.vocabulary"); confirm
# the bare key resolves to the intended entry.
semantic_hfactor_block = SemanticHNMFkBlock(
    needs=("DocWord.X", "VultureClean.df", "vocabulary", ),
    init_settings={
        "depth":1, 
        "sample_thresh":4,
        "Ks_deep_max":6,
    },
    call_settings={
        # range(1, 16) covers the integers 1 through 15.
        "Ks":range(1, 16),
    }
)

In [None]:
# ArticFox block; its call settings name the Ollama model to use.
# Presumably that model must be available to a local Ollama install -- confirm.
post_process_block = ArticFoxBlock(call_settings={"ollama_model":"llama3.2:3b-instruct-fp16"})

# Block Manager

In [None]:
# Assemble the pipeline from the blocks defined above and hand it the initial
# bundle. Presumably the blocks execute in the order listed -- confirm.
# NOTE(review): scopus_block and s2_block are disabled below while
# merge_frames_block stays active; confirm its inputs are still available
# (e.g. from results of an earlier run).
manager = BlockManager(
    blocks = [
        #scopus_block, 
        duplicate_cleaner_block, 
        #s2_block, 
        merge_frames_block, 
        terms_block,
        vulture_block, 
        vocab_block,
        matrix_block,
        semantic_hfactor_block,
        post_process_block,      
        #auto_bunny_block,
        #squirrel_block,
        clean_affiliations_block,
        orca_block,
        post_expansion_vulture_block,
        term_attribute_block,
        wolf_coauthor_block,
        wolf_coaffiliation_block,  
    ],
    databundle=bundle,  
    progress   = True,          # see which block is executing
    # capture_output accepts "memory", "file" or None:
    #   "file"   -> timestamped logs under results/logs/
    #   "memory" -> logs reachable through manager.block_logs.keys()
    capture_output=None #'file',
)

In [None]:
# Run the manager and rebind `bundle` to the value it returns, replacing the
# reference to the initial bundle built earlier.
bundle = manager()

In [None]:
# Show which keys the bundle holds after the run.
bundle.keys()