In [2]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Candidate texts for the recommender; each text's label is its position in this list.
data_for_recommendation = [
    {"text": sentence, "label": position}
    for position, sentence in enumerate([
        "how to maintain health",
        "how to protect health",
        "factors affecting health",
        "bad factors for health",
        "good factors for health",
        "preventing shutdown in health",
        "optimizing health performance",
        "extending lifespan in health",
        "enhancing health efficiency",
        "key parameters to monitor in health",
    ])
]

### Conceptual similarity

In [24]:
# Load a pre-trained sentence-embedding model (the 'paraphrase-MiniLM-L6-v2' checkpoint)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
df = pd.DataFrame(data_for_recommendation)

# Embed every candidate text once, up front.
# encode() is documented to take a string or a list of strings, so pass a plain
# Python list instead of relying on how a pandas Series behaves when the
# library iterates and integer-indexes it.
embeddings = model.encode(df['text'].tolist(), convert_to_tensor=True)

In [25]:
# Cosine similarity between a query string and the pre-computed text embeddings
def calculate_similarity(query, embeddings):
    """Score ``query`` against ``embeddings`` using cosine similarity.

    Args:
        query: Text to encode with the module-level ``model``.
        embeddings: Pre-computed embeddings to compare the query against.

    Returns:
        The first row of the cosine-similarity result, i.e. the scores of
        ``query`` versus ``embeddings``.
    """
    encoded_query = model.encode(query, convert_to_tensor=True)
    similarity_matrix = util.pytorch_cos_sim(encoded_query, embeddings)
    # Only one query is encoded, so only the first row is of interest.
    return similarity_matrix[0]

In [26]:
# Example: find the most similar stored text for a given query
def input_text_model(input_text):
    """Return the label of the stored text most similar to ``input_text``.

    Digit characters are stripped from the input, the remainder is scored
    against the pre-computed ``embeddings`` via ``calculate_similarity``, and
    the ``label`` of the highest-scoring row of ``df`` is returned.

    Args:
        input_text (str): Free-form query text.

    Returns:
        The ``label`` value of the best-matching row of ``df``.
    """
    # Drop digit characters before encoding the query.
    query = ''.join(ch for ch in input_text if not ch.isdigit())
    query_similarity = calculate_similarity(query, embeddings)

    # Only the single best match is returned, so take the arg-max directly
    # instead of sorting every score and slicing a top-n window.
    best_position = int(query_similarity.argmax())

    # best_position is a position in the score tensor, not an index label, so
    # look it up with .iloc; label-based .loc is only equivalent while df keeps
    # its default 0..n-1 index. tolist() hands back a plain Python value.
    return df['label'].iloc[[best_position]].tolist()[0]

In [27]:
# Run the sentence-embedding matcher on a sample query and print the resulting label
label = input_text_model("Tips for preserving health?")
print(label)

0


### Cosine similarity

In [30]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

In [31]:
# Tokenizer and model are both loaded from the same "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# NOTE: the checkpoint carries no weights for the 2-label classification head
# (classifier.weight / classifier.bias); they are newly initialised at load time,
# which is what the warning printed by this cell reports.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Batch-tokenize every recommendation text (used by the recommendation function only).
# Padding and truncation are enabled and PyTorch tensors are requested.
tokenized_data_recommendation = tokenizer(
    [entry["text"] for entry in data_for_recommendation],
    return_tensors="pt",
    truncation=True,
    padding=True,
)


In [33]:
def _mean_pooled_embeddings(encoded_inputs):
    """Embed a tokenized batch by mean-pooling the encoder's last hidden states.

    Only the pre-trained encoder body (``model.base_model``) is run. The
    classification head of the module-level ``model`` is skipped on purpose:
    its logits are just ``num_labels`` numbers from a head that has not been
    trained, so they are not usable as sentence embeddings.

    Args:
        encoded_inputs: Tokenizer output containing an ``attention_mask``.

    Returns:
        A NumPy array with one pooled vector per input sequence.
    """
    with torch.no_grad():
        token_states = model.base_model(**encoded_inputs).last_hidden_state
        # Zero out padding positions so they do not dilute the average.
        mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.detach().cpu().numpy()


def input_text_model(input_text):
    """Predict the label for ``input_text`` by nearest cosine similarity.

    The input and every text in ``data_for_recommendation`` are embedded with
    the module-level BERT ``model`` (mean-pooled encoder states), and the label
    of the recommendation text with the highest cosine similarity is returned.

    Args:
        input_text (str): Free-form query text.

    Returns:
        The ``"label"`` value of the most similar ``data_for_recommendation``
        entry.
    """
    # Tokenize the input sentence
    tokenized_input = tokenizer(
        input_text,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    # Embed the query and the candidate texts the same way, without tracking
    # gradients for either forward pass.
    input_embeddings = _mean_pooled_embeddings(tokenized_input)
    embeddings_recommendation = _mean_pooled_embeddings(tokenized_data_recommendation)

    # Cosine similarity of the single query row against every candidate
    similarity_recommendation = cosine_similarity(input_embeddings, embeddings_recommendation)

    # With one query row, the flat arg-max is the index of the best candidate
    most_similar_index_recommendation = int(np.argmax(similarity_recommendation))
    return data_for_recommendation[most_similar_index_recommendation]["label"]

In [34]:
# Sample usage: classify one query with the BERT-based matcher and print its label
label = input_text_model("Tips for preserving health?")
print(label)

0
