# Libraries

In [1]:
from TELF.pipeline import BlockManager
from TELF.pipeline.blocks import (
    DataBundle,
    SAVE_DIR_BUNDLE_KEY,
    SOURCE_DIR_BUNDLE_KEY,
    VultureCleanBlock,
    BeaverVocabBlock,
    BeaverDocWordBlock,
    NMFkBlock,
    HNMFkBlock,
    SemanticHNMFkBlock,
    FunctionBlock,
    ClusteringAnalyzerBlock,
    LoadDfBlock,
    LabelAnalyzerBlock
)
from pathlib import Path
import pandas as pd
import numpy as np
import scipy.sparse as ss
import pickle
import pandas as pd
import os 

# Load Data

In [2]:
# Location of the sample CSV (relative to the notebook's working directory) and
# the number of leading rows used by this example.
SAMPLE_CSV = Path("..") / ".." / ".." / "data" / "sample2.csv"
N_SAMPLE_ROWS = 50

# `nrows` stops parsing after N_SAMPLE_ROWS rows instead of loading the whole
# file and then slicing it with `.head()`.
df = pd.read_csv(SAMPLE_CSV, nrows=N_SAMPLE_ROWS)
df.info()

# The same directory is registered under both the save-dir and source-dir keys.
EXAMPLE_OUTPUT = Path("example_results") / "post_process_example"
bundle = DataBundle({
    'Default.df': df,
    SAVE_DIR_BUNDLE_KEY: EXAMPLE_OUTPUT,
    SOURCE_DIR_BUNDLE_KEY: EXAMPLE_OUTPUT,
})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   eid               50 non-null     object 
 1   s2id              50 non-null     object 
 2   doi               50 non-null     object 
 3   title             50 non-null     object 
 4   abstract          50 non-null     object 
 5   year              50 non-null     int64  
 6   authors           50 non-null     object 
 7   author_ids        50 non-null     object 
 8   affiliations      50 non-null     object 
 9   funding           5 non-null      object 
 10  PACs              8 non-null      object 
 11  publication_name  50 non-null     object 
 12  subject_areas     50 non-null     object 
 13  s2_authors        50 non-null     object 
 14  s2_author_ids     50 non-null     object 
 15  citations         45 non-null     object 
 16  references        38 non-null     object 
 17 

In [3]:
# Pre-processing stages. Per the construction printout they provide the cleaned
# `df`, the `vocabulary`, and the matrix `X` respectively.
vulture_block = VultureCleanBlock(init_settings=dict(n_jobs=1))
vocab_block = BeaverVocabBlock()
matrix_block = BeaverDocWordBlock()

# Factorization stages, both consuming `X`. The flat and hierarchical variants
# receive the same perturbation / iteration counts.
factor_block = NMFkBlock(init_settings=dict(n_perturbs=2, n_iters=2))
hfactor_block = HNMFkBlock(
    init_settings=dict(nmfk_params=dict(n_perturbs=2, n_iters=2)),
)

# One ClusteringAnalyzerBlock per analysis mode. Each gets its own tag so the
# `clusters_path` it provides is stored under a distinct namespace.
nmfk_analyzer = ClusteringAnalyzerBlock(tag="NMFAnalyzer", mode="nmf")
hnmfk_analyzer = ClusteringAnalyzerBlock(tag="HNMFAnalyzer", mode="hnmf")

# Label mode: cluster membership is read from the `cluster` column.
extracted_cluster_only_nmfk_analyzer = ClusteringAnalyzerBlock(
    tag="ClusterOnlyAnalyzer",
    mode="label",
    call_settings=dict(cluster_col="cluster"),
)

# No mode selected.
no_cluster_analyzer = ClusteringAnalyzerBlock(tag="NoClusterAnalyzer", mode=None)

# Label blocks: each one consumes the `clusters_path` published by the analyzer
# whose tag prefixes the key in `needs`.

# csv path from NMFAnalyzer
nmf_labels = LabelAnalyzerBlock(
    tag="NMFLabels",
    needs=("NMFAnalyzer.clusters_path",),
)

# list of csv paths from HNMFAnalyzer
hnmfk_labels = LabelAnalyzerBlock(
    tag="HNMFkLabels",
    needs=("HNMFAnalyzer.clusters_path",),
)

# csv path produced by the label-mode run
cluster_only_labels = LabelAnalyzerBlock(
    tag="ClusterOnlyLabels",
    needs=("ClusterOnlyAnalyzer.clusters_path",),
)

# csv path from the pass-through analyzer
no_cluster_labels = LabelAnalyzerBlock(
    tag="NoClusterLabels",
    needs=("NoClusterAnalyzer.clusters_path",),
)


# Loader restricted (non-recursively) to files matching `cluster_for_k=*.csv`
# under the HNMFk/depth_0/Root sub-path.
get_cluster_df = LoadDfBlock(
    path_extension=Path("HNMFk", "depth_0", "Root"),
    regex=r"cluster_for_k=.*\.csv",
    recursive=False,
)

[VultureClean] needs → (df)   provides → (df, vulture_steps)
[BeaverVocab] needs → (df)   provides → (vocabulary)
[BeaverDW] needs → (df, vocabulary)   provides → (X)
[NMFk] needs → (X)   provides → (nmfk_model, nmfk_model_path)
[HNMFk] needs → (X)   provides → (hnmfk_model, saved_path)
[NMFAnalyzer] needs → (df, nmfk_model, nmfk_model_path, vocabulary)   provides → (clusters_path)
[HNMFAnalyzer] needs → (df, hnmfk_model, vocabulary)   provides → (clusters_path)
[ClusterOnlyAnalyzer] needs → (df)   provides → (clusters_path)
[NoClusterAnalyzer] needs → (df)   provides → (clusters_path)
[NMFLabels] needs → (NMFAnalyzer.clusters_path)   provides → (result, label_paths)
[HNMFkLabels] needs → (HNMFAnalyzer.clusters_path)   provides → (result, label_paths)
[ClusterOnlyLabels] needs → (ClusterOnlyAnalyzer.clusters_path)   provides → (result, label_paths)
[NoClusterLabels] needs → (NoClusterAnalyzer.clusters_path)   provides → (result, label_paths)
[LoadDF] needs → (dir)   provides → (df, df_

In [4]:
# Assemble the pipeline. Blocks are listed in execution order, grouped by branch.
manager = BlockManager(
    blocks=[
        # shared pre-processing
        vulture_block, vocab_block, matrix_block,
        # flat NMFk branch
        factor_block, nmfk_analyzer, nmf_labels,
        # hierarchical HNMFk branch
        hfactor_block, hnmfk_analyzer, hnmfk_labels,
        # analysis without a clustering mode
        no_cluster_analyzer, no_cluster_labels,
        # reload a saved cluster table, then analyze it in label mode
        get_cluster_df, extracted_cluster_only_nmfk_analyzer, cluster_only_labels,
    ],
    databundle=bundle,
    verbose=True,
    progress=True,
    capture_output="file",
)


Block (tag)                                   │ Needs (✓/✗)                                 │ Provides
──────────────────────────────────────────────────────────────────────────────────────────────────────
VultureCleanBlock (VultureClean)              │ df                                          │ ['df', 'vulture_steps']
BeaverVocabBlock (BeaverVocab)                │ df                                          │ ['vocabulary']
BeaverDocWordBlock (BeaverDW)                 │ df, vocabulary                              │ ['X']
NMFkBlock (NMFk)                              │ X                                           │ ['nmfk_model', 'nmfk_model_path']
ClusteringAnalyzerBlock (NMFAnalyzer)         │ df, nmfk_model, nmfk_model_path, vocabulary │ ['clusters_path']
LabelAnalyzerBlock (NMFLabels)                │ NMFAnalyzer.clusters_path                   │ ['result', 'label_paths']
HNMFkBlock (HNMFk)                            │ X                                           │ ['hnmfk_model

In [5]:
# Calling the manager runs the configured blocks in order (see the progress log
# below); the returned value replaces the initial `bundle`.
bundle = manager()

▶  [1/14] VultureClean …
✓  [1/14] VultureClean finished in 0.01s
▶  [2/14] BeaverVocab …
✓  [2/14] BeaverVocab finished in 0.01s
▶  [3/14] BeaverDW …
✓  [3/14] BeaverDW finished in 0.01s
▶  [4/14] NMFk …
✓  [4/14] NMFk finished in 3.00s
▶  [5/14] NMFAnalyzer …
✓  [5/14] NMFAnalyzer finished in 21.28s
▶  [6/14] NMFLabels …
✓  [6/14] NMFLabels finished in 13.13s
▶  [7/14] HNMFk …
✓  [7/14] HNMFk finished in 0.00s
▶  [8/14] HNMFAnalyzer …
✓  [8/14] HNMFAnalyzer finished in 0.00s
▶  [9/14] HNMFkLabels …
✓  [9/14] HNMFkLabels finished in 17.50s
▶  [10/14] NoClusterAnalyzer …
✓  [10/14] NoClusterAnalyzer finished in 1.21s
▶  [11/14] NoClusterLabels …
✓  [11/14] NoClusterLabels finished in 0.97s
▶  [12/14] LoadDF …
✓  [12/14] LoadDF finished in 0.00s
▶  [13/14] ClusterOnlyAnalyzer …
✓  [13/14] ClusterOnlyAnalyzer finished in 13.45s
▶  [14/14] ClusterOnlyLabels …
✓  [14/14] ClusterOnlyLabels finished in 6.35s


In [6]:
# List every tag in the bundle together with the keys stored under it.
bundle.print_tags_and_keys()

'BeaverDW': ['X']
'BeaverVocab': ['vocabulary']
'ClusterOnlyAnalyzer': ['clusters_path']
'ClusterOnlyLabels': ['result', 'label_paths']
'DataBundle': ['result_path']
'Default': ['df']
'HNMFAnalyzer': ['clusters_path']
'HNMFk': ['hnmfk_model', 'saved_path']
'HNMFkLabels': ['result', 'label_paths']
'Init': ['save_path', 'dir']
'LoadDF': ['df', 'df_paths']
'NMFAnalyzer': ['clusters_path']
'NMFLabels': ['result', 'label_paths']
'NMFk': ['nmfk_model', 'nmfk_model_path']
'NoClusterAnalyzer': ['clusters_path']
'NoClusterLabels': ['result', 'label_paths']
'VultureClean': ['df', 'vulture_steps']


In [7]:
# Attribute access by tag displays a view of the keys stored under `NMFLabels`.
bundle.NMFLabels


NamespaceView(tag='NMFLabels', keys=['result', 'label_paths'])

In [8]:
# Labels generated by the NMFLabels block. In the recorded output this is a dict
# keyed by cluster CSV path, mapping cluster id -> label text.
bundle.NMFLabels.result


{'example_results/post_process_example/NMFk/cluster_for_k=20.csv': {17: 'Machine Learning for 19.0 Malware Detection Models',
  19: 'Quantum Inspired Neural Network Optimization Techniques',
  16: 'Reinforcement Learning for Robust Model Training',
  18: 'Anomaly Detection Using 18.0 Matrix Models',
  11: 'Malware Family Classification Using HNMFk Classifier Approach',
  6: 'Neural Architecture Search for Dense Matrix Optimization on GPU Clusters',
  9: 'Federated Learning for Collaborative Filtering Systems',
  13: 'Anomaly Detection',
  12: 'Malware Novelty Detection Using Hierarchical Tensor Factorization',
  8: 'Machine Learning'}}

In [11]:
# Same view as for `NMFLabels`, here for the hierarchical (HNMFk) label block.
bundle.HNMFkLabels


NamespaceView(tag='HNMFkLabels', keys=['result', 'label_paths'])