In [None]:
# Creating Ai Pipeline

In [8]:
# Loading devices dictionary
from deviceMatching.getDict import * 
from deviceMatching.proccessing import load_sentence_transformer, encode_text
device_dictionary = load_devices_dictionary_easy()
from sentence_transformers import SentenceTransformer, util


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = load_sentence_transformer(model_name)

# Sample device log text
device_log = "Error 404: Resource not found during boot sequence."

# Encode the text to get a sentence embedding
embedding = encode_text(device_log, model)

print("Embedding shape:", embedding.shape)  # Should be something like (384,)
print("Embedding vector:", embedding)

Embedding shape: (384,)
Embedding vector: [-6.37365580e-02  1.53851174e-02 -2.25298647e-02 -4.91679162e-02
  7.62720332e-02  2.67352583e-03 -8.31667893e-03  1.00508314e-02
  5.59103824e-02 -5.46157099e-02  4.02013734e-02  4.17462550e-02
  1.68936141e-02  6.40096366e-02 -3.56010832e-02  3.08003020e-03
  2.26955116e-02  1.99567620e-02  8.37309733e-02  3.54791433e-02
  7.04875216e-02  6.60389289e-02  7.95279536e-03 -4.05321755e-02
  4.82406802e-02  1.60532026e-03 -5.28415777e-02  2.73375325e-02
 -7.22867064e-03 -3.67305614e-02  1.58938300e-02 -1.32654272e-02
 -1.15982601e-02 -5.16949147e-02  1.06573857e-01  6.96046576e-02
  2.90551018e-02 -1.05756298e-01 -1.89150833e-02  7.37699643e-02
  4.12815027e-02  4.00578007e-02  5.82456440e-02 -2.13031583e-02
 -5.08161820e-03 -5.19382814e-03  7.85274357e-02 -7.98774809e-02
  1.60789732e-02  1.38386255e-02 -7.89972842e-02 -2.31923889e-02
  2.62691304e-02 -8.20356458e-02  1.24809304e-02 -2.83956379e-02
  2.60540517e-03  7.77313020e-03  3.89551148e-02

In [9]:
# Define two example strings (e.g., two device logs)
string1 = "Device failed to boot due to error code 404."
string2 = "Error 404 encountered during startup; device did not boot."

# Encode the strings into embeddings (as PyTorch tensors)
embedding1 = model.encode(string1, convert_to_tensor=True)
embedding2 = model.encode(string2, convert_to_tensor=True)

# Compute the cosine similarity between the two embeddings
cosine_similarity = util.cos_sim(embedding1, embedding2)

# Print the similarity score
print("Cosine Similarity:", cosine_similarity.item())

Cosine Similarity: 0.9118934869766235


In [11]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, util

def precompute_corpus_embeddings(corpus: list[str], model: SentenceTransformer, batch_size: int = 32, multi_process: bool = False) -> torch.Tensor:
    """
    Precompute embeddings for a list of texts (corpus) using the provided SentenceTransformer model.
    
    Parameters:
        corpus (list[str]): List of input texts.
        model (SentenceTransformer): A loaded SentenceTransformer model.
        batch_size (int): Batch size for encoding (default: 32).
        multi_process (bool): If True, use multi-process encoding; otherwise, use the regular encode().
        
    Returns:
        torch.Tensor: A tensor of shape (num_texts, embedding_dim) containing the embeddings.
    """
    if multi_process:
        # Start a pool of processes (by default, uses all available GPUs/CPUs as specified)
        pool = model.start_multi_process_pool()
        # Use the multi-process encoding function.
        # Note: encode_multi_process returns a NumPy array if convert_to_numpy is True,
        # so we set convert_to_numpy=False to get a tensor.
        corpus_embeddings = model.encode_multi_process(corpus, pool, batch_size=batch_size, convert_to_numpy=False)
        model.stop_multi_process_pool(pool)
        # Ensure the result is a torch.Tensor
        corpus_embeddings = torch.tensor(corpus_embeddings)
    else:
        # Regular encoding: the built-in encode() function is optimized with batching and GPU acceleration.
        corpus_embeddings = model.encode(corpus, batch_size=batch_size, convert_to_tensor=True)
    
    return corpus_embeddings


TypeError: 'type' object is not subscriptable

In [23]:
from typing import List, Tuple
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, util

def precompute_corpus_embeddings(corpus: List[str],
                                 model: SentenceTransformer,
                                 batch_size: int = 32,
                                 multi_process: bool = False) -> torch.Tensor:
    """
    Precompute embeddings for a list of texts (corpus) using the provided SentenceTransformer model.
    
    This function is designed to efficiently process large datasets.
    - If multi_process is True, it splits the corpus into chunks and processes them in parallel using multiple processes.
    - Otherwise, it uses the built-in encode() method with batching.
    
    Parameters:
        corpus (List[str]): The list of texts to encode.
        model (SentenceTransformer): A loaded SentenceTransformer model.
        batch_size (int): The number of texts to process per batch.
        multi_process (bool): Whether to use multi-process encoding (useful for very large datasets).
        
    Returns:
        torch.Tensor: A tensor of shape (num_texts, embedding_dim) containing the embeddings.
    """
    if multi_process:
        # For very large datasets, you can leverage multiple processes.
        pool = model.start_multi_process_pool()
        # encode_multi_process processes chunks in parallel.
        corpus_embeddings = model.encode_multi_process(corpus, pool,
                                                       batch_size=batch_size,
                                                       convert_to_numpy=False)
        model.stop_multi_process_pool(pool)
        # Convert to torch.Tensor if not already
        corpus_embeddings = torch.tensor(corpus_embeddings)
    else:
        # Regular encode() uses efficient batching and GPU acceleration if available.
        corpus_embeddings = model.encode(corpus, batch_size=batch_size, convert_to_tensor=True)
    
    return corpus_embeddings


def normalize_embeddings(embeddings: torch.Tensor) -> torch.Tensor:
    """
    Normalize each embedding vector to have unit length (L2 normalization).
    
    Normalization is important because when embeddings are normalized,
    the cosine similarity between them is equivalent to their dot product.
    
    Parameters:
        embeddings (torch.Tensor): A tensor of shape (N, embedding_dim).
    
    Returns:
        torch.Tensor: A tensor of the same shape with each row normalized.
    """
    return torch.nn.functional.normalize(embeddings, p=2, dim=1)

def match_query_to_corpus(query: str,
                          corpus: List[str],
                          model: SentenceTransformer,
                          corpus_embeddings: torch.Tensor) -> Tuple[str, float]:
    """
    Encode a query string and find the most similar entry in the precomputed corpus.
    
    The function encodes the query using the model, normalizes the query embedding (to match the normalized corpus embeddings),
    computes cosine similarity with all corpus embeddings, and returns the best matching corpus text along with its score.
    
    Parameters:
        query (str): The query text (e.g., a device log).
        corpus (List[str]): The list of corpus texts (used for returning the actual text).
        model (SentenceTransformer): The model used for encoding.
        corpus_embeddings (torch.Tensor): Precomputed (and normalized) embeddings for the corpus.
    
    Returns:
        Tuple[str, float]: The best matching text from the corpus and its cosine similarity score.
    """
    # Encode the query text.
    query_embedding = model.encode(query, convert_to_tensor=True)
    
    # Normalize the query embedding for a consistent cosine similarity computation.
    query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=0)
    
    # Compute cosine similarity between the query and all corpus embeddings.
    cosine_scores = util.cos_sim(query_embedding, corpus_embeddings)
    
    # Find the index with the highest similarity.
    best_idx = int(torch.argmax(cosine_scores))
    best_score = cosine_scores[0][best_idx].item()
    
    return corpus[best_idx], best_score


In [19]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Example corpus: this can be as large as 10k+ entries.
corpus = [
    "Device failed to boot due to error code 404.",
    "System booted normally with no issues.",
    "Warning: Low disk space detected on device.",
    "Error 404 encountered during startup; device did not boot."
]

corpus_embeddings = precompute_corpus_embeddings(corpus, model, batch_size=32, multi_process=False)


In [24]:
norm_corpus_embeddings = normalize_embeddings(corpus_embeddings)


In [26]:
# New query text (device log) to match against the corpus
query = "Error 404: Resource not found during boot sequence."

# Find the best matching corpus entry for the query.
best_match, score = match_query_to_corpus(query, corpus, model, norm_corpus_embeddings)

print("Best matching corpus entry:", best_match)
print("Cosine similarity score:", score)

Best matching corpus entry: Error 404 encountered during startup; device did not boot.
Cosine similarity score: 0.7473016381263733
