# Libraries

In [19]:
import numpy as np
import scipy.sparse as ss
import pickle
import pandas as pd
import os 

In [20]:
from TELF.pre_processing import Vulture
from TELF.pre_processing.Vulture.modules import SimpleCleaner
from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner
from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

In [21]:
from TELF.factorization.HNMFk import HNMFk

In [22]:
from TELF.pre_processing import Beaver

In [23]:
from TELF.post_processing import ArcticFox

# Load Data

In [24]:
df = pd.read_csv(os.path.join("..", "..", "data", "sample2.csv"))
df = df.head(50).reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   eid               50 non-null     object 
 1   s2id              50 non-null     object 
 2   doi               50 non-null     object 
 3   title             50 non-null     object 
 4   abstract          50 non-null     object 
 5   year              50 non-null     int64  
 6   authors           50 non-null     object 
 7   author_ids        50 non-null     object 
 8   affiliations      50 non-null     object 
 9   funding           5 non-null      object 
 10  PACs              8 non-null      object 
 11  publication_name  50 non-null     object 
 12  subject_areas     50 non-null     object 
 13  s2_authors        50 non-null     object 
 14  s2_author_ids     50 non-null     object 
 15  citations         45 non-null     object 
 16  references        38 non-null     object 
 17 

# Clean Text

In [25]:
steps = [
    RemoveNonEnglishCleaner(ascii_ratio=0.9, stopwords_ratio=0.25),
    SimpleCleaner(stop_words = STOP_WORDS,
                  stop_phrases = STOP_PHRASES,
                  order = [
                      'standardize_hyphens',
                      'isolate_frozen',
                      'remove_copyright_statement',
                      'remove_stop_phrases',
                      'make_lower_case',
                      'remove_formulas',
                      'normalize',
                      'remove_next_line',
                      'remove_email',
                      'remove_()',
                      'remove_[]',
                      'remove_special_characters',
                      'remove_nonASCII_boundary',
                      'remove_nonASCII',
                      'remove_tags',
                      'remove_stop_words',
                      'remove_standalone_numbers',
                      'remove_extra_whitespace',
                      'min_characters',
                  ]
                 ),
    LemmatizeCleaner('spacy'),
]

In [26]:
vulture = Vulture(n_jobs=1, verbose=10)
df = vulture.clean_dataframe(df=df, 
                        columns=["abstract", "title"],
                        append_to_original_df=True,
                        concat_cleaned_cols=True
                        )

[Vulture]: Cleaning 50 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module
100%|██████████| 50/50 [00:00<00:00, 287.65it/s]
100%|██████████| 1/1 [00:00<00:00,  5.60it/s]


In [27]:
df.clean_abstract_title

0     vulnerabilities pose threat cybersecurity kern...
1     cybersecurity frameworks nist guidelines risk ...
2     bias-variance tradeoff determines models abili...
3     efficient distributed implementation truncated...
4     autoencoders dimensionality reduction anomaly ...
5     ensemble bagging boosting combine multiple mod...
6     vulnerabilities pose threat cybersecurity gene...
7     preprocessing steps normalization handling mis...
8     feature engineering plays crucial role improvi...
9     federated learning ai models trained decentral...
10    principal analysis pca reducing dimensionality...
11    malware dangerous costly cyber threats nationa...
12    cybersecurity frameworks nist guidelines risk ...
13    transfer learning pre-trained models fine-tune...
14    autoencoders effectively unsupervised feature ...
15    identification family malware specimen belongs...
16    ensemble bagging boosting combine multiple mod...
17    blockchain technology enhances security tr

# Build The Vocabulary and the Document-Term Matrix

In [28]:
DATA_COLUMN = 'clean_abstract_title'
RESULTS = "result_example"
HIGHLIGHT_WORDS = ['analysis', 'tensor']
HIGHLIGHT_WEIGHTS = [2 for i in HIGHLIGHT_WORDS]
beaver = Beaver()
os.makedirs(RESULTS, exist_ok=True)
settings = {
    "dataset" : df,
    "target_column" : DATA_COLUMN,
    'highlighting': HIGHLIGHT_WORDS,
    'weights':HIGHLIGHT_WEIGHTS,
    "matrix_type" : "tfidf",
    "save_path" : RESULTS
}
X, vocabulary = beaver.documents_words(**settings)

In [29]:
X = X.T.tocsr()
X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 4968 stored elements and shape (607, 50)>

In [30]:
assert X.shape[1] == len(df)

In [31]:
vocabulary[:10]

array(['abstaining', 'accelerated', 'accuracy', 'accurate', 'achieved',
       'acquisition', 'activation', 'activity', 'addition', 'additional'],
      dtype=object)

In [32]:
len(vocabulary)

607

# Factorize with HNMFk

In [33]:
# Define the range of cluster numbers (K) to search over
Ks = np.arange(2, 10, 1)  # From 2 to 29 inclusive

# Number of perturbations and iterations to run
perts = 2  # Number of perturbed runs to estimate stability
iters = 2  # Number of iterations for each perturbation

# Small perturbation epsilon added to input data
eps = 0.025

# Initialization method for NMF (Non-negative Matrix Factorization)
init = "nnsvd"  # Nonnegative SVD initialization

# Path to save HNMFk results
HNMFK_save_path = os.path.join(RESULTS, "example_HNMFK")
name = HNMFK_save_path  # Alias for convenience

# Parameters for HNMFk (Hierarchical Nonnegative Matrix Factorization k-search)
nmfk_params = {
    "k_search_method": "bst_pre",             # Method for determining optimal k (e.g., binary search with pre-checks)
    "sill_thresh": 0.7,                       # Silhouette threshold to accept a given k
    "H_sill_thresh": 0.05,                    # Threshold for H-matrix silhouette to refine k selection
    "n_perturbs": perts,                      # Number of perturbations
    "n_iters": iters,                         # Number of iterations per perturbation
    "epsilon": eps,                           # Perturbation strength
    "n_jobs": -1,                             # Use all available CPU cores
    "init": init,                             # NMF initialization method
    "use_gpu": False,                         # Whether to use GPU acceleration
    "save_path": HNMFK_save_path,             # Directory where results will be saved
    "predict_k_method": "WH_sill",            # Method to predict k using W and H matrix silhouettes
    "predict_k": True,                        # Whether to automatically predict k
    "verbose": False,                          # Verbose output
    "nmf_verbose": False,                     # Verbose output from NMF algorithm
    "transpose": False,                       # Whether to transpose input data
    "pruned": True,                           # Whether to prune unstable clusters
    "nmf_method": "nmf_fro_mu",               # NMF solver method (Frobenius norm, multiplicative updates)
    "calculate_error": False,                 # Whether to calculate reconstruction error
    "use_consensus_stopping": 0,              # Whether to use consensus stopping (0 = off)
    "calculate_pac": False,                   # Whether to compute PAC (proportion of ambiguous clustering)
    "consensus_mat": False,                   # Whether to generate consensus matrix
    "perturb_type": "uniform",                # Type of perturbation (e.g., uniform noise)
    "perturb_multiprocessing": False,         # Use multiprocessing during perturbation
    "perturb_verbose": False,                 # Verbose output during perturbation
    "simple_plot": True                       # Whether to generate simplified summary plots
}


In [34]:
class CustomSemanticCallback:
    def __init__(self, 
                 df: pd.DataFrame, 
                 target_column=DATA_COLUMN,
                 options={'vocabulary': vocabulary},
                 matrix_type="tfidf") -> None:
        """
        Initializes the callback with a DataFrame and matrix generation settings.

        Parameters:
        - df: The full DataFrame containing the text data.
        - target_column: Column name containing the target text to vectorize (default is a global DATA_COLUMN).
        - options: Options dictionary passed to Beaver (e.g., fixed vocabulary, token settings).
        - matrix_type: Type of vectorization matrix (e.g., "tfidf", "count").
        """
        self.df = df
        self.target_column = target_column
        self.options = options
        self.matrix_type = matrix_type

    def __call__(self, original_indices: np.ndarray):
        """
        Callable interface for dynamically generating document-term matrices 
        from a subset of the DataFrame.

        Parameters:
        - original_indices: Numpy array of row indices from self.df to subset and transform.

        Returns:
        - Tuple of (X, metadata), where:
            - X is a document-term sparse matrix (CSR format).
            - metadata is a dict containing either 'vocab' or a 'stop_reason' if failed.
        """
        current_beaver = Beaver()  # Initialize a new instance of the Beaver text vectorizer

        # Extract the subset of the DataFrame using the provided indices
        current_df = self.df.iloc[original_indices].copy()

        # Construct parameters for the Beaver vectorizer
        current_beaver_matrix_settings = {
            "dataset": current_df,
            "target_column": self.target_column,
            "options": self.options,
            "highlighting": HIGHLIGHT_WORDS,     # Global list of words to highlight
            "weights": HIGHLIGHT_WEIGHTS,        # Associated weights for highlighting
            "matrix_type": self.matrix_type,     # Type of matrix to construct (e.g., TF-IDF)
            "save_path": None                    # No file output; matrix is returned
        }

        try:
            # Attempt to generate the document-word matrix
            current_X, vocab = current_beaver.documents_words(**current_beaver_matrix_settings)
            
            # Transpose to get documents as rows (CSR format is efficient for row slicing)
            current_X = current_X.T.tocsr()
            
            return current_X, {'vocab': vocab}

        except:
            # On failure, return a 1x1 matrix to signal a stopping condition for downstream tasks
            csr_matrix = ss.csr_matrix([[1]])
            return csr_matrix, {'stop_reason': "documents_words couldn't make matrix"}

In [35]:
# Parameters for initializing and training the HNMFk model
hnmfk_params = {
    "n_nodes": 1,  # Number of root nodes to begin with (can grow as depth increases)
    
    # List of NMF parameters for the top-level (depth=0); can use different sets for different nodes
    "nmfk_params": [nmfk_params],  
    
    # Callable that generates a document-term matrix from a subset of the DataFrame (dynamic input for each node)
    "generate_X_callback": CustomSemanticCallback(df=df, options={'vocabulary': vocabulary}),
    
    "cluster_on": "H",  # Which factor matrix to use for clustering (H = document-topic)
    
    "depth": 1,  # Depth of the hierarchy; e.g., 2 means root + one layer of children
    
    "sample_thresh": 10,  # Minimum number of samples required to split/cluster a node further
    
    "K2": False,  # If True, forces all subclusters to use k=2; here we allow varying k
    
    # Range of K to try for deeper layers (children nodes)
    "Ks_deep_min": 1,
    "Ks_deep_max": 20,
    "Ks_deep_step": 1,
    
    "experiment_name": name,  # Folder/identifier for saving results and checkpoints
}

# Instantiate the HNMFk model with the above parameters
model = HNMFk(**hnmfk_params)

# Fit the model on matrix X using the specified range of Ks
# - from_checkpoint: load previously saved progress if available
# - save_checkpoint: periodically save progress for recovery or inspection
model.fit(X, Ks, from_checkpoint=False, save_checkpoint=True)

# Traverse and collect all nodes created in the hierarchical model
all_nodes = model.traverse_nodes()
print(len(all_nodes))  # Output the total number of nodes (clusters at all levels)

# Save the full trained model to a pickle file for reuse or inspection
with open(os.path.join('result_example', 'HNMFK_highlight.pkl'), 'wb') as output_file:
    pickle.dump(model, output_file)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Done
10


# Post-process with Arctic Fox

In [None]:
model.keys()

In [36]:
# Load a pre-trained HNMFk model from disk
model = HNMFk(experiment_name=os.path.join("result_example", "example_HNMFK"))
model.load_model()  # Loads model from the provided experiment_name path

# Initialize ArcticFox pipeline
# - model: the hierarchical clustering model (HNMFk)
# - embedding_model: name of the sentence embedding model used for label generation
# - clean_cols_name: column in the DataFrame containing the cleaned text input
pipeline = ArcticFox(
    model=model,
    embedding_model="SCINCL",        # Example: SCINCL embedding model fine-tuned for scientific text
    clean_cols_name=DATA_COLUMN      # The text column used for label generation and analysis
)

# Run the full ArcticFox pipeline:
# This handles hierarchical cluster labeling, statistics generation, data collection, and label propagation
pipeline.run_full_pipeline(
    vocab=vocabulary,                # Vocabulary used to guide or filter cluster content
    data_df=df,                      # Original dataset (same used in HNMFk)
    ollama_model="llama3.2:3b-instruct-fp16",  # Language model used for semantic label generation
    label_clusters=True,             # Enable automatic labeling of clusters
    generate_stats=True,             # Generate cluster-level statistics
    process_parents=True,            # Propagate labels or stats upward through the hierarchy
    skip_completed=True,             # Skip processing of nodes already labeled/stored
    label_criteria={                 # Rules to filter generated labels
        "minimum words": 2,
        "maximum words": 6
    },
    label_info={                     # Additional metadata to associate with generated labels
        "source": "Science"
    },
    number_of_labels=5               # Number of candidate labels to generate per node
)


Loading saved object state from checkpoint...
Step 1: Post-processing W/H matrix and cluster data...
Step 2: Labeling clusters with LLM...
/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_0
Using device: cuda:0


100%|██████████| 2/2 [00:00<00:00, 56.98it/s]
100%|██████████| 1/1 [00:21<00:00, 21.97s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 1/1 [00:02<00:00,  2.77s/it]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_1
Using device: cuda:0


100%|██████████| 1/1 [00:00<00:00, 153.63it/s]
100%|██████████| 1/1 [00:01<00:00,  1.96s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 1/1 [00:01<00:00,  1.40s/it]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_2
Using device: cuda:0


100%|██████████| 4/4 [00:00<00:00, 182.27it/s]
100%|██████████| 1/1 [00:02<00:00,  2.41s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 1/1 [00:01<00:00,  1.35s/it]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_3
Using device: cuda:0


100%|██████████| 4/4 [00:00<00:00, 169.22it/s]
100%|██████████| 1/1 [00:05<00:00,  5.40s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 1/1 [00:01<00:00,  1.47s/it]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_4
Using device: cuda:0


100%|██████████| 7/7 [00:00<00:00, 166.37it/s]
100%|██████████| 1/1 [00:03<00:00,  3.41s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 1/1 [00:01<00:00,  1.86s/it]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_5
Using device: cuda:0


100%|██████████| 5/5 [00:00<00:00, 130.69it/s]
100%|██████████| 1/1 [00:03<00:00,  3.94s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 1/1 [00:03<00:00,  3.78s/it]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_6
Using device: cuda:0


100%|██████████| 7/7 [00:00<00:00, 149.37it/s]
100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 1/1 [00:01<00:00,  1.36s/it]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_7
Using device: cuda:0


100%|██████████| 7/7 [00:00<00:00, 137.13it/s]
100%|██████████| 1/1 [00:04<00:00,  4.20s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 1/1 [00:01<00:00,  1.38s/it]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_1/Root_8
Using device: cuda:0


100%|██████████| 13/13 [00:00<00:00, 141.13it/s]
100%|██████████| 12/12 [00:42<00:00,  3.57s/it]
  0%|          | 0/12 [00:00<?, ?it/s]

Skipping cluster_id=0 because it's not in centers.
Skipping cluster_id=1 because it's not in centers.
Skipping cluster_id=2 because it's not in centers.
Skipping cluster_id=3 because it's not in centers.
Skipping cluster_id=4 because it's not in centers.
Skipping cluster_id=5 because it's not in centers.
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 58%|█████▊    | 7/12 [00:01<00:01,  4.95it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 67%|██████▋   | 8/12 [00:02<00:01,  2.48it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 75%|███████▌  | 9/12 [00:04<00:01,  1.52it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 83%|████████▎ | 10/12 [00:05<00:01,  1.21it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 92%|█████████▏| 11/12 [00:07<00:01,  1.00s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 12/12 [00:08<00:00,  1.35it/s]


/projects/SLIC/ryan/telf_internal/examples/ArcticFox/result_example/example_HNMFK/depth_0/Root
Using device: cuda:0


100%|██████████| 50/50 [00:00<00:00, 154.82it/s]
100%|██████████| 9/9 [00:25<00:00,  2.88s/it]
  0%|          | 0/9 [00:00<?, ?it/s]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 11%|█         | 1/9 [00:01<00:13,  1.64s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 22%|██▏       | 2/9 [00:03<00:11,  1.66s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 33%|███▎      | 3/9 [00:04<00:09,  1.60s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 44%|████▍     | 4/9 [00:06<00:07,  1.55s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 56%|█████▌    | 5/9 [00:07<00:06,  1.51s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 67%|██████▋   | 6/9 [00:09<00:04,  1.49s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 78%|███████▊  | 7/9 [00:10<00:02,  1.49s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


 89%|████████▉ | 8/9 [00:12<00:01,  1.48s/it]

Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu
Using device: cpu


100%|██████████| 9/9 [00:13<00:00,  1.51s/it]

Step 3: Generating Peacock visual stats...

10 out of 10 nodes processed.



