In [41]:
import os
import psutil
import numpy as np
from time import time 

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
model = SentenceTransformer('all-MiniLM-L6-v2')

def filter_similar_captions(prompt_samples, threshold=0.85):
    embeddings = model.encode(prompt_samples, convert_to_numpy=True)
    similarity_matrix = cosine_similarity(embeddings)
    
    seen = set()
    result = [] 
    
    for i, row in enumerate(similarity_matrix):
        similar_indices = {j for j, sim in enumerate(row) if sim > threshold}
        
        if frozenset(similar_indices) not in seen:
            seen.add(frozenset(similar_indices))
            result.append(prompt_samples[i])
            
    return result

In [30]:
sample_captions = [
    "the cat is sitting on the couch",
    "there is a cat on the couch", 
    "the ginger cat is sitting on top of the purple couch", 
    "cat on couch",
    "there is a cat on the couch ",
    "There is a cat on the Couch", 
    "The apple is red", 
    "There is a cat that is sitting on the couch"
] 

sample_captions_2 = [
    "Design a 404 page with an illustration themed around space", 
    "404 page with space themed illustration", 
    "Space-themed 404 page design", 
    "Design a beautiful 404 page design, with black and white aesthetic, including an illustration based on space"
] 

filtered_captions = filter_similar_captions(sample_captions, threshold=0.85)
print("Filtered Captions:", filtered_captions)

filtered_captions_2 = filter_similar_captions(sample_captions_2, threshold=0.85)
print("Filtered Captions:", filtered_captions_2)

Filtered Captions: ['the cat is sitting on the couch', 'there is a cat on the couch', 'the ginger cat is sitting on top of the purple couch', 'cat on couch', 'The apple is red']
Filtered Captions: ['Design a 404 page with an illustration themed around space', '404 page with space themed illustration', 'Design a beautiful 404 page design, with black and white aesthetic, including an illustration based on space']


#### Test SentenceTransformer efficiency 

In [37]:
# Testing time taken 
start = time() 
embedding = model.encode(sample_captions[0], convert_to_numpy=True)
print("Time taken to generate one embedding: ", time() - start)

start_batch = time() 
embeddings = model.encode(sample_captions, convert_to_numpy=True)
print(f"Time taken to generate {len(sample_captions)} embeddings: ", time() - start_batch)

Time taken to generate one embedding:  0.013347387313842773
Time taken to generate 8 embeddings:  0.034555673599243164


In [40]:
# Testing memory consumed
def get_embeddings():
    return model.encode(sample_captions, convert_to_numpy=True)

process = psutil.Process(os.getpid())

mem_before = process.memory_info().rss
get_embeddings()

print(f"Memory used: {process.memory_info().rss - mem_before} bytes")

Memory used: 4096 bytes
