In [1]:
import os
# Cap the thread pools of the numerical backends at `nnn` threads each.
# Equivalent to `export <VAR>=1` in the shell for every variable listed below.
# This runs before numpy/scipy/pandas are imported in the following cells,
# presumably so the libraries pick the limits up when they load.
nnn = 1
os.environ.update(
    {
        thread_var: str(nnn)
        for thread_var in (
            "OMP_NUM_THREADS",         # OpenMP
            "OPENBLAS_NUM_THREADS",    # OpenBLAS
            "MKL_NUM_THREADS",         # Intel MKL
            "VECLIB_MAXIMUM_THREADS",  # Apple vecLib / Accelerate
            "NUMEXPR_NUM_THREADS",     # numexpr
        )
    }
)

In [2]:
import pandas as pd
from scipy import sparse

In [3]:
from TELF.pipeline import BlockManager
from TELF.pipeline.blocks import (
    DataBundle,
    SAVE_DIR_BUNDLE_KEY,
    VultureCleanBlock,
    BeaverVocabBlock,
    BeaverDocWordBlock,
    NMFkBlock,
    HNMFkBlock,
    SemanticHNMFkBlock,
    FunctionBlock,
)

# Load Data

In [4]:
# Read the sample corpus and keep only its first 50 rows
# (presumably to keep this example quick to run).
df = pd.read_csv(
    os.path.join("..", "..", "data", "sample2.csv")
).head(50)

# Seed the pipeline bundle with the raw frame under the "Default" namespace,
# plus the directory handed to the pipeline via SAVE_DIR_BUNDLE_KEY.
bundle = DataBundle(
    {
        "Default.df": df,
        SAVE_DIR_BUNDLE_KEY: os.path.join("example_results", "simple_pipeline"),
    }
)

# Build the Blocks

In [5]:
# Cleaning stage: takes `df` and provides `df` and `vulture_steps`
# (see the summary the constructor prints below). Runs with a single job.
vulture_block = VultureCleanBlock(
    init_settings={"n_jobs": 1},
    verbose=True,
)

[VultureClean] needs → (df)   provides → (df, vulture_steps)


In [6]:
# Vocabulary stage: reads the cleaned frame through its namespaced key
# and provides `vocabulary`.
vocab_block = BeaverVocabBlock(
    needs=("VultureClean.df",),
    tag="BeaverVocab",
)

[BeaverVocab] needs → (VultureClean.df)   provides → (vocabulary)


In [7]:
# Document-word matrix stage: needs the cleaned frame and the vocabulary,
# both addressed by their namespaced keys, and provides `X`.
matrix_block = BeaverDocWordBlock(
    needs=(
        "VultureClean.df",
        "BeaverVocab.vocabulary",
    ),
    tag="DocWord",
)

[DocWord] needs → (VultureClean.df, BeaverVocab.vocabulary)   provides → (X)


In [8]:
def build_mask_from_X(X, thresh: float = 0.0):
    """Build a boolean mask of the entries of ``X`` that exceed ``thresh``.

    Parameters
    ----------
    X : scipy sparse matrix or array-like supporting ``X > thresh``
        Sparse input is converted to a dense array with ``toarray()``;
        anything else is used as-is.
    thresh : float, default 0.0
        Entries strictly greater than this value are marked ``True``.

    Returns
    -------
    tuple
        ``(mask, X)`` where ``mask`` is the element-wise result of
        ``X > thresh`` and ``X`` is the (possibly densified) matrix the
        mask was computed from. Non-sparse input is returned unchanged.
    """
    dense = X.toarray() if sparse.issparse(X) else X
    mask = dense > thresh
    return mask, dense

# Wrap build_mask_from_X as a pipeline stage. The function returns
# (mask, dense X); the two values are published as "MASK" and "X".
mask_block = FunctionBlock(
    tag="MASKCreate",
    function_call=build_mask_from_X,
    call_settings={"thresh": 0.0},  # optional kwargs forwarded to the function
    needs=("DocWord.X",),           # namespaced input
    provides=("MASK", "X"),         # generic outputs (also become the latest aliases)
)

[MASKCreate] needs → (DocWord.X)   provides → (MASK, X)


In [9]:
# NMFk stage on the dense matrix produced by MASKCreate.
# NOTE(review): the summary printed below also lists `MASK` as a need even
# though it is not passed here - presumably added for nmf_method="wnmf"; confirm.
factor_block = NMFkBlock(
    needs=("MASKCreate.X",),
    init_settings={
        "nmf_method": "wnmf",
        "n_iters": 5,
    },
    verbose=True,
)

[NMFk] needs → (MASKCreate.X, MASK)   provides → (results, model_path)


In [10]:
# Hierarchical NMFk stage; it consumes the DocWord matrix directly
# rather than the dense copy published by MASKCreate.
hfactor_block = HNMFkBlock(
    needs=("DocWord.X",),
)

[HNMFk] needs → (DocWord.X)   provides → (model, saved_path)


In [11]:
# Semantic hierarchical NMFk stage.
# Note: "vocabulary" is the only un-namespaced key used in this notebook's
# `needs`; the other blocks address it as "BeaverVocab.vocabulary".
semantic_hfactor_block = SemanticHNMFkBlock(
    init_settings={
        "depth": 1,
        "sample_thresh": 20,
    },
    needs=(
        "DocWord.X",
        "VultureClean.df",
        "vocabulary",
    ),
)

[SemanticHNMFk] needs → (DocWord.X, VultureClean.df, vocabulary)   provides → (model, model_path)


# Block Manager

In [12]:
# Assemble the pipeline: blocks are listed in the order they should run.
manager = BlockManager(
    databundle=bundle,  # optional: will dump final bundle to results/final_bundle.json
    blocks=[
        vulture_block,
        vocab_block,
        matrix_block,
        mask_block,
        factor_block,
        hfactor_block,
        semantic_hfactor_block,
    ],
    verbose=True,   # print the green/red dependency table
    progress=True,  # print which block is executing
    # capture_output accepts "memory", "file" or None:
    #   "file"   -> timestamped logs under results/logs/
    #   "memory" -> logs available via manager.block_logs.keys()
    capture_output="file",
)

Block (tag)                        │ Needs (✓/✗)                             │ Provides
───────────────────────────────────────────────────────────────────────────────────────
VultureCleanBlock (VultureClean)   │ df                                      │ ['df', 'vulture_steps']
BeaverVocabBlock (BeaverVocab)     │ VultureClean.df                         │ ['vocabulary']
BeaverDocWordBlock (DocWord)       │ VultureClean.df, BeaverVocab.vocabulary │ ['X']
FunctionBlock (MASKCreate)         │ DocWord.X                               │ ['MASK', 'X']
NMFkBlock (NMFk)                   │ MASKCreate.X, MASK                      │ ['results', 'model_path']
HNMFkBlock (HNMFk)                 │ DocWord.X                               │ ['model', 'saved_path']
SemanticHNMFkBlock (SemanticHNMFk) │ DocWord.X, VultureClean.df, vocabulary  │ ['model', 'model_path']



In [13]:
# Run all seven blocks in order. The call returns the populated bundle,
# which replaces the initial `bundle` created when the data was loaded.
bundle = manager()

▶  [1/7] VultureClean …
✓  [1/7] VultureClean finished in 1.77s
▶  [2/7] BeaverVocab …
✓  [2/7] BeaverVocab finished in 0.01s
▶  [3/7] DocWord …
✓  [3/7] DocWord finished in 0.01s
▶  [4/7] MASKCreate …
✓  [4/7] MASKCreate finished in 0.00s
▶  [5/7] NMFk …
✓  [5/7] NMFk finished in 5.02s
▶  [6/7] HNMFk …
✓  [6/7] HNMFk finished in 27.07s
▶  [7/7] SemanticHNMFk …
✓  [7/7] SemanticHNMFk finished in 2.67s


In [14]:
# The repr shows the `latest` map: for each un-namespaced key,
# the tag of the block that most recently provided it.
bundle

DataBundle(latest={'df': 'VultureClean', 'save_path': 'Init', 'result_path': 'DataBundle', 'vulture_steps': 'VultureClean', 'vocabulary': 'BeaverVocab', 'X': 'MASKCreate', 'MASK': 'MASKCreate', 'results': 'NMFk', 'model_path': 'SemanticHNMFk', 'model': 'SemanticHNMFk', 'saved_path': 'HNMFk'})

In [15]:
# Un-namespaced lookup: "df" resolves to its latest provider
# (VultureClean, according to the bundle's `latest` map).
bundle["df"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   eid                   50 non-null     object 
 1   s2id                  50 non-null     object 
 2   doi                   50 non-null     object 
 3   title                 50 non-null     object 
 4   abstract              50 non-null     object 
 5   year                  50 non-null     int64  
 6   authors               50 non-null     object 
 7   author_ids            50 non-null     object 
 8   affiliations          50 non-null     object 
 9   funding               5 non-null      object 
 10  PACs                  8 non-null      object 
 11  publication_name      50 non-null     object 
 12  subject_areas         50 non-null     object 
 13  s2_authors            50 non-null     object 
 14  s2_author_ids         50 non-null     object 
 15  citations             45 

In [16]:
# Peek at the first five vocabulary terms (latest alias of BeaverVocab's output).
bundle["vocabulary"][:5]

array(['ability', 'abstain', 'accuracy', 'accurate', 'activation'],
      dtype=object)

In [17]:
# Namespaced lookup of the same frame: "<tag>.<key>" addresses one block's output explicitly.
bundle["VultureClean.df"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   eid                   50 non-null     object 
 1   s2id                  50 non-null     object 
 2   doi                   50 non-null     object 
 3   title                 50 non-null     object 
 4   abstract              50 non-null     object 
 5   year                  50 non-null     int64  
 6   authors               50 non-null     object 
 7   author_ids            50 non-null     object 
 8   affiliations          50 non-null     object 
 9   funding               5 non-null      object 
 10  PACs                  8 non-null      object 
 11  publication_name      50 non-null     object 
 12  subject_areas         50 non-null     object 
 13  s2_authors            50 non-null     object 
 14  s2_author_ids         50 non-null     object 
 15  citations             45 

In [18]:
# The frame originally placed in the bundle under "Default.df" is still available;
# it has 19 columns versus the 20 reported for the cleaned frame.
bundle["Default.df"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   eid               50 non-null     object 
 1   s2id              50 non-null     object 
 2   doi               50 non-null     object 
 3   title             50 non-null     object 
 4   abstract          50 non-null     object 
 5   year              50 non-null     int64  
 6   authors           50 non-null     object 
 7   author_ids        50 non-null     object 
 8   affiliations      50 non-null     object 
 9   funding           5 non-null      object 
 10  PACs              8 non-null      object 
 11  publication_name  50 non-null     object 
 12  subject_areas     50 non-null     object 
 13  s2_authors        50 non-null     object 
 14  s2_author_ids     50 non-null     object 
 15  citations         45 non-null     object 
 16  references        38 non-null     object 
 17 

In [19]:
# Matrix produced by the DocWord block (a sparse CSR matrix, per the repr below).
bundle["DocWord.X"]

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 2499 stored elements and shape (163, 42)>

In [20]:
# Dense copy of the matrix, as returned by build_mask_from_X in the MASKCreate block.
bundle["MASKCreate.X"]

array([[0.05821804, 0.06360444, 0.07554273, ..., 0.        , 0.        ,
        0.04648162],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.07143849],
       [0.0785472 , 0.08581449, 0.        , ..., 0.        , 0.04863066,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.0984279 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.11483238,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.07143849]], dtype=float32)

In [21]:
# Un-namespaced "X" resolves to its latest provider, MASKCreate,
# so this shows the dense array rather than DocWord's sparse matrix.
bundle["X"]

array([[0.05821804, 0.06360444, 0.07554273, ..., 0.        , 0.        ,
        0.04648162],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.07143849],
       [0.0785472 , 0.08581449, 0.        , ..., 0.        , 0.04863066,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.0984279 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.11483238,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.07143849]], dtype=float32)