In [4]:
# !pip install transformers torch

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

In [6]:
# # Load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
# model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

In [7]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/nli-roberta-large')

In [8]:
def get_sentence_embedding(sentence, tokenizer, model):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state  # Shape: (1, sequence_length, hidden_size)
    attention_mask = inputs['attention_mask']  # Shape: (1, sequence_length)
    mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()  # Shape: (1, sequence_length, hidden_size)
    sum_embeddings = torch.sum(embeddings * mask_expanded, dim=1)  # Shape: (1, hidden_size)
    sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)  # Shape: (1, hidden_size)
    mean_pooled_embeddings = sum_embeddings / sum_mask  # Shape: (1, hidden_size)
    return mean_pooled_embeddings.squeeze(0)  # Shape: (hidden_size,)

In [9]:
# Predefined list of sentences
sentences = [
    "This sentence has a similar theme.",
    "An entirely different topic is discussed here.",
    "Here's another sentence that's somewhat related.",
    "This is the base sentence for comparison.",
    "Random words football tree monitor."
]


In [10]:
# List of 25 passages from different fields
passages = [
    'For example, I would like to enhance a new website with no code tools, such that, users not can spend more time on thinking compared to more on writing code',
    'A new medical field, to discover monkey DNA, with human research, can lead a new generation, to do this i can verify wih high end hosipitals']
  


In [11]:
# Precompute embeddings and store in a dictionary
sentence_embeddings = {}
for sentence in sentences:
    sentence_embeddings[sentence] = get_sentence_embedding(sentence, tokenizer, model)


In [12]:
# Function to find most similar sentence
def find_most_similar(new_sentence, sentence_embeddings):
    new_embedding = get_sentence_embedding(new_sentence, tokenizer, model)  # Shape: (hidden_size,)
    highest_similarity = -1
    most_similar_sentence = ""
    
    for sentence, embedding in sentence_embeddings.items():
        # Both embeddings are 1D tensors of shape (hidden_size,)
        similarity = torch.nn.functional.cosine_similarity(new_embedding, embedding, dim=0).item()
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_sentence = sentence
            
    return most_similar_sentence, highest_similarity

In [13]:
# Example new sentence
new_sentence = "Check this sentence for similarity."
most_similar, similarity_score = find_most_similar(new_sentence, sentence_embeddings)
print(f"The most similar sentence to '{new_sentence}' is '{most_similar}' with a similarity of {similarity_score:.2f}")


The most similar sentence to 'Check this sentence for similarity.' is 'This is the base sentence for comparison.' with a similarity of 0.68


In [14]:
# from transformers import AutoTokenizer, AutoModel
# import torch

# # Load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
# model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

# def get_sentence_embedding(sentence, tokenizer, model):
#     inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     embeddings = outputs.last_hidden_state  # Shape: (1, sequence_length, hidden_size)
#     attention_mask = inputs['attention_mask']  # Shape: (1, sequence_length)
#     mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()  # Shape: (1, sequence_length, hidden_size)
#     sum_embeddings = torch.sum(embeddings * mask_expanded, dim=1)  # Shape: (1, hidden_size)
#     sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)  # Shape: (1, hidden_size)
#     mean_pooled_embeddings = sum_embeddings / sum_mask  # Shape: (1, hidden_size)
#     return mean_pooled_embeddings.squeeze(0)  # Shape: (hidden_size,)

# # Predefined list of sentences
# sentences = [
#     "This sentence has a similar theme.",
#     "An entirely different topic is discussed here.",
#     "Here's another sentence that's somewhat related.",
#     "This is the base sentence for comparison.",
#     "Random words football tree monitor."
# ]

# # Precompute embeddings and store in a dictionary
# sentence_embeddings = {}
# for sentence in sentences:
#     sentence_embeddings[sentence] = get_sentence_embedding(sentence, tokenizer, model)

# # Function to find most similar sentence
# def find_most_similar(new_sentence, sentence_embeddings):
#     new_embedding = get_sentence_embedding(new_sentence, tokenizer, model)  # Shape: (hidden_size,)
#     highest_similarity = -1
#     most_similar_sentence = ""
    
#     for sentence, embedding in sentence_embeddings.items():
#         # Both embeddings are 1D tensors of shape (hidden_size,)
#         similarity = torch.nn.functional.cosine_similarity(new_embedding, embedding, dim=0).item()
#         if similarity > highest_similarity:
#             highest_similarity = similarity
#             most_similar_sentence = sentence
            
#     return most_similar_sentence, highest_similarity

# # Example new sentence
# new_sentence = "Check this sentence for similarity."
# most_similar, similarity_score = find_most_similar(new_sentence, sentence_embeddings)
# print(f"The most similar sentence to '{new_sentence}' is '{most_similar}' with a similarity of {similarity_score:.2f}")


In [15]:
# Precompute embeddings and store in a dictionary
passage_embeddings = {}
for passage in passages:
    passage_embeddings[passage] = get_sentence_embedding(passage, tokenizer, model)

In [16]:
# Function to find the top N most similar passages
def find_top_n_similar(new_text, passage_embeddings, top_n=3):
    new_embedding = get_sentence_embedding(new_text, tokenizer, model)  # Shape: (hidden_size,)
    similarities = {}
    for passage, embedding in passage_embeddings.items():
        similarity = torch.nn.functional.cosine_similarity(new_embedding, embedding, dim=0).item()
        similarities[passage] = similarity
    # Sort the passages based on similarity scores in descending order
    sorted_passages = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return sorted_passages[:top_n]

In [17]:
# Example new text
new_text = "Flowers has great smell, where fragances of top brands puts billions of money, my idea is to extract a new smell and make billions which satisfies humans rich smell sense flavours"

# Find the top 3 most similar passages
top_similar_passages = find_top_n_similar(new_text, passage_embeddings, top_n=3)

# Display the results
print(f"Top 3 passages similar to:\n'{new_text}'\n")
for i, (passage, similarity_score) in enumerate(top_similar_passages, start=1):
    formatted_similarity = f"{similarity_score * 100:.2f}%"
    print(f"{i}. Similarity: {formatted_similarity}\n   Passage: {passage}\n")

Top 3 passages similar to:
'Flowers has great smell, where fragances of top brands puts billions of money, my idea is to extract a new smell and make billions which satisfies humans rich smell sense flavours'

1. Similarity: 82.80%
   Passage: A new medical field, to discover monkey DNA, with human research, can lead a new generation, to do this i can verify wih high end hosipitals

2. Similarity: 79.84%
   Passage: For example, I would like to enhance a new website with no code tools, such that, users not can spend more time on thinking compared to more on writing code



In [18]:
passages = [
    # Astronomy
    "The Milky Way galaxy contains hundreds of billions of stars, including our sun. It is a barred spiral galaxy with distinct arms, where new stars are constantly being formed from clouds of gas and dust. Astronomers believe that at the center of our galaxy lies a supermassive black hole, which exerts a strong gravitational pull on surrounding stars.",
    
    # Medicine
    "Vaccination is a critical tool in public health, providing immunity against infectious diseases. Vaccines work by stimulating the immune system to recognize and fight pathogens like viruses and bacteria. Herd immunity can be achieved when a high percentage of the population is vaccinated, helping protect those who are unable to receive vaccines themselves.",
    
    # Artificial Intelligence
    "Artificial Intelligence (AI) refers to the simulation of human intelligence in machines programmed to think and learn. Machine learning, a subset of AI, involves feeding algorithms vast amounts of data to identify patterns and make decisions. Applications of AI are vast, ranging from natural language processing to autonomous vehicles.",
    
    # Zoology
    "Elephants are the largest land animals on Earth, known for their intelligence and strong social bonds. They communicate through a range of sounds, including low-frequency rumbles that can travel long distances. Elephants also exhibit behaviors such as mourning their dead, showcasing a level of emotional complexity rarely seen in other species.",
    
    # Economics
    "Inflation is an economic phenomenon characterized by a general rise in prices over time. It can reduce the purchasing power of money, affecting both consumers and businesses. Central banks use monetary policy tools such as interest rate adjustments to manage inflation levels, aiming for a balance between economic growth and price stability."
]


In [20]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load a more advanced pre-trained model

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# 
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
# model = AutoModel.from_pretrained('sentence-transformers/nli-roberta-large')

def get_sentence_embedding(sentence, tokenizer, model):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=256)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use mean pooling
    embeddings = outputs.last_hidden_state
    attention_mask = inputs['attention_mask']
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask
    summed = torch.sum(masked_embeddings, dim=1)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    mean_pooled = summed / counts
    # Normalize the embeddings
    normalized_embeddings = torch.nn.functional.normalize(mean_pooled, p=2, dim=1)
    return normalized_embeddings.squeeze(0)

# Precompute embeddings
passage_embeddings = {}
for passage in passages:
    passage_embeddings[passage] = get_sentence_embedding(passage, tokenizer, model)

# Similarity function
def find_top_n_similar(new_text, passage_embeddings, top_n=3, min_similarity_threshold=0.):
    new_embedding = get_sentence_embedding(new_text, tokenizer, model)
    similarities = {}
    for passage, embedding in passage_embeddings.items():
        similarity = torch.nn.functional.cosine_similarity(new_embedding.unsqueeze(0), embedding.unsqueeze(0), dim=1).item()
        if similarity > min_similarity_threshold:
            similarities[passage] = similarity
    sorted_passages = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return sorted_passages[:top_n]

# Example new text
new_text = ''' Renewable energy sources, such as solar and wind, are becoming increasingly important in the global effort to reduce carbon emissions and combat climate change
 Investments in green technology are expected to grow as countries aim for sustainable development and energy independence '''



# Find top similar passages
top_similar_passages = find_top_n_similar(new_text, passage_embeddings)

# Display results
for i, (passage, similarity_score) in enumerate(top_similar_passages, start=1):
    formatted_similarity = f"{similarity_score * 100:.2f}%"
    print(f"{i}. Similarity: {formatted_similarity}\n   Passage: {passage}\n")


1. Similarity: 90.40%
   Passage: Elephants are the largest land animals on Earth, known for their intelligence and strong social bonds. They communicate through a range of sounds, including low-frequency rumbles that can travel long distances. Elephants also exhibit behaviors such as mourning their dead, showcasing a level of emotional complexity rarely seen in other species.

2. Similarity: 87.90%
   Passage: Artificial Intelligence (AI) refers to the simulation of human intelligence in machines programmed to think and learn. Machine learning, a subset of AI, involves feeding algorithms vast amounts of data to identify patterns and make decisions. Applications of AI are vast, ranging from natural language processing to autonomous vehicles.

3. Similarity: 84.40%
   Passage: Inflation is an economic phenomenon characterized by a general rise in prices over time. It can reduce the purchasing power of money, affecting both consumers and businesses. Central banks use monetary policy tool