In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Reference texts for the recommendation demo; each record's label is the
# zero-based position of its text in the tuple below.
data_for_recommendation = [
    {"text": phrase, "label": position}
    for position, phrase in enumerate((
        "how to maintain health",
        "how to protect health",
        "factors affecting health",
        "bad factors for health",
        "good factors for health",
        "preventing shutdown in health",
        "optimizing health performance",
        "extending lifespan in health",
        "enhancing health efficiency",
        "key parameters to monitor in health",
    ))
]

In [2]:
### Conceptual similarity

In [None]:
# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
df = pd.DataFrame(data_for_recommendation)

# Embed the texts. encode() is documented to take a str or a list of str, so
# hand it a plain list instead of the pandas Series: integer indexing on a
# Series is a label lookup and only coincides with positions while the frame
# keeps its default RangeIndex.
embeddings = model.encode(df['text'].tolist(), convert_to_tensor=True)

In [None]:
# Similarity helper: score one query against a set of pre-computed embeddings
def calculate_similarity(query, embeddings):
    """Score ``query`` against ``embeddings``.

    The query is encoded to a tensor with the module-level ``model`` and
    compared with ``embeddings`` through ``util.pytorch_cos_sim``.

    Args:
        query: Text to encode (presumably a single string; confirm with callers).
        embeddings: Pre-computed embeddings to compare against.

    Returns:
        The first row of the similarity matrix returned by
        ``util.pytorch_cos_sim``.
    """
    encoded_query = model.encode(query, convert_to_tensor=True)
    similarity_matrix = util.pytorch_cos_sim(encoded_query, embeddings)
    return similarity_matrix[0]

In [None]:
# Example: find the label of the text most similar to a given query
def input_text_model(input_text):
    """Return the label of the dataset text most similar to ``input_text``.

    Digit characters are removed from the input, the remaining text is scored
    against the module-level ``embeddings`` with ``calculate_similarity``, and
    the ``label`` of the best-scoring row of ``df`` is returned.

    Args:
        input_text (str): Free-text query.

    Returns:
        The ``label`` value (as a native Python object) of the row of ``df``
        with the highest similarity score.
    """
    # Strip digits before embedding the query.
    query = ''.join(ch for ch in input_text if not ch.isdigit())
    query_similarity = calculate_similarity(query, embeddings)

    # Only the single best match is used, so take the argmax directly instead
    # of sorting every score and slicing a fixed-size top-N window.
    best_position = query_similarity.argmax().item()

    # The score index is a row *position*, so look it up with .iloc; .loc
    # would treat it as an index label and only agree on a default RangeIndex.
    # .tolist() keeps the return value a native Python object.
    return df['label'].iloc[[best_position]].tolist()[0]

In [None]:
# Run the matcher defined above on a sample question and show the label.
label = input_text_model("Tips for preserving health?")
print(label)

In [None]:
#### Cosine similarity

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np


# Instantiate the bert-base-uncased checkpoint with a two-label
# classification head, together with its matching tokenizer.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Pre-tokenize every recommendation text once, as a single padded and
# truncated batch of PyTorch tensors.
tokenized_data_recommendation = tokenizer(
    [entry["text"] for entry in data_for_recommendation],
    return_tensors="pt",
    truncation=True,
    padding=True,
)

def _mean_pooled_embeddings(encoded_inputs):
    """Return one mean-pooled last-layer vector per sequence as a NumPy array."""
    with torch.no_grad():
        outputs = model(**encoded_inputs, output_hidden_states=True)
    last_hidden = outputs.hidden_states[-1]

    # Average over real tokens only: padded positions are zeroed out through
    # the attention mask and excluded from the token count.
    mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
    summed = (last_hidden * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).cpu().numpy()


def input_text_model(input_text):
    """Predict the label for ``input_text`` by nearest-neighbour lookup.

    The input and every text in ``tokenized_data_recommendation`` are embedded
    as mean-pooled last-layer hidden states of the module-level ``model``; the
    label of the dataset entry with the highest cosine similarity is returned.

    The classification logits are deliberately not used as embeddings: they
    have only ``num_labels`` dimensions and come from a head that is not part
    of the pre-trained ``bert-base-uncased`` weights, so similarities computed
    on them carry no semantic signal.

    Args:
        input_text (str): Sentence to classify.

    Returns:
        The ``"label"`` of the most similar entry in
        ``data_for_recommendation``.
    """
    # Tokenize the input sentence
    tokenized_input = tokenizer(
        input_text,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    # Embed the query and the reference texts the same way, without tracking
    # gradients for either forward pass.
    input_embeddings = _mean_pooled_embeddings(tokenized_input)
    embeddings_recommendation = _mean_pooled_embeddings(tokenized_data_recommendation)

    # Calculate cosine similarity
    similarity_recommendation = cosine_similarity(input_embeddings, embeddings_recommendation)

    # Pick the reference text with the highest cosine similarity
    most_similar_index_recommendation = int(np.argmax(similarity_recommendation))
    return data_for_recommendation[most_similar_index_recommendation]["label"]

# Sample usage: classify one question with the BERT-based matcher.
label = input_text_model("what is the good kiln temperature?")
print(label)


In [None]:
The two code snippets above perform similar tasks: each predicts a label (category) for an input text. However, they use different models and libraries for text embedding and similarity calculation.

First Code Snippet (the BERT cell under "Cosine similarity", which appears second in this notebook):

Uses the BERT model (BertForSequenceClassification) from the Hugging Face Transformers library.
Tokenizes the input text and encodes it using the BERT tokenizer.
Calculates the embeddings for both the input text and the texts in the dataset using the BERT model.
Measures similarity between the input embeddings and the dataset embeddings using cosine similarity.
Determines the label or category based on the highest cosine similarity score.
Relies on the cosine_similarity function from scikit-learn for similarity calculation.
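Second Code Snippet (the Sentence Transformer cell under "Conceptual similarity", which appears first in this notebook):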

Uses the Sentence Transformer library.
Loads a pre-trained Sentence Transformer model (paraphrase-MiniLM-L6-v2).
Embeds both the input text and the texts in the dataset using the Sentence Transformer model.
Calculates similarity scores between the input embedding and the dataset embeddings using cosine similarity.
Determines the label or category from the single most similar text.
Uses functions provided by the Sentence Transformer library (encode and pytorch_cos_sim) for embedding and similarity calculation.
In summary, both snippets achieve similar functionality but use different libraries and models for text embedding and similarity calculation. The choice between them depends on factors such as model preference, performance requirements, and ease of integration with existing codebases.