In [196]:
import os
import psutil
import numpy as np
from time import time 

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

#### SentenceTransformer

In [240]:
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embeddings_st(prompt_samples):
    return model.encode(prompt_samples, convert_to_numpy=True)

def filter_similar_captions(prompt_samples, get_embeddings, threshold=0.85):
    embeddings = get_embeddings(prompt_samples) 
    similarity_matrix = cosine_similarity(embeddings)
    
    seen = set()
    result = [] 
    
    for i, row in enumerate(similarity_matrix):
        similar_indices = {j for j, sim in enumerate(row) if sim > threshold}
        
        if frozenset(similar_indices) not in seen:
            seen.add(frozenset(similar_indices))
            result.append(prompt_samples[i])
            
    return result

In [241]:
test_cases = [
    [
        "the cat is sitting on the couch",
        "there is a cat on the couch", 
        "the ginger cat is sitting on top of the purple couch", 
        "cat on couch",
        "there is a cat on the couch ",
        "There is a cat on the Couch", 
        "The apple is red", 
        "There is a cat that is sitting on the couch"
    ],
    [
        "Design a 404 page with an illustration themed around space", 
        "404 page with space themed illustration", 
        "Space-themed 404 page design", 
        "Design a beautiful 404 page design, with black and white aesthetic, including an illustration based on space"
    ],
    [
        "404 page design with space theme",
        "About us page",
        "Landing page design with a fruit theme with images of apples, bananas, and other fruits",
        "Space themed 404 page design", 
        "About us page design", 
        "Landing page design with serene theme with images of beautiful landscapes and buildings" 
    ],
    [
        "404 page design with astronauts and planets",
        "Design a 404 page design with a space theme", 
        "Space themed 404 UI design with astronauts and planets", 
        "Create an attractive 404 page design with a space theme, including illustrations of astronauts and planets",
        "Design space themed 404 page design" 
    ]
]

for test in test_cases: 
    filtered_captions = filter_similar_captions(test, get_embeddings_st)
    print("Test Captions: \n", test)
    print()
    print("Filtered Captions: \n", filtered_captions)
    print("Number of captions filtered: ", len(test) - len(filtered_captions))
    print() 
    print()


Test Captions: 
 ['the cat is sitting on the couch', 'there is a cat on the couch', 'the ginger cat is sitting on top of the purple couch', 'cat on couch', 'there is a cat on the couch ', 'There is a cat on the Couch', 'The apple is red', 'There is a cat that is sitting on the couch']

Filtered Captions: 
 ['the cat is sitting on the couch', 'there is a cat on the couch', 'the ginger cat is sitting on top of the purple couch', 'cat on couch', 'The apple is red']
Number of captions filtered:  3


Test Captions: 
 ['Design a 404 page with an illustration themed around space', '404 page with space themed illustration', 'Space-themed 404 page design', 'Design a beautiful 404 page design, with black and white aesthetic, including an illustration based on space']

Filtered Captions: 
 ['Design a 404 page with an illustration themed around space', '404 page with space themed illustration', 'Design a beautiful 404 page design, with black and white aesthetic, including an illustration based on 

#### Test SentenceTransformer efficiency 

In [37]:
# Testing time taken 
start = time() 
embedding = model.encode(sample_captions[0], convert_to_numpy=True)
print("Time taken to generate one embedding: ", time() - start)

start_batch = time() 
embeddings = model.encode(sample_captions, convert_to_numpy=True)
print(f"Time taken to generate {len(sample_captions)} embeddings: ", time() - start_batch)

Time taken to generate one embedding:  0.013347387313842773
Time taken to generate 8 embeddings:  0.034555673599243164


In [40]:
# Testing memory consumed
process = psutil.Process(os.getpid())

mem_before = process.memory_info().rss
get_embeddings(prompt_samples)

print(f"Memory used: {process.memory_info().rss - mem_before} bytes")

Memory used: 4096 bytes


#### Fine-tuned GPT2 embedder 

In [237]:
import torch
from transformers import GPT2Tokenizer, GPT2Model 

In [244]:
# Load model and tokenizer 
model = GPT2Model.from_pretrained(os.path.join("embedding", "model"))
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [245]:
def get_embeddings_gpt2(prompt_samples): 
    embeddings = []
    for i in prompt_samples:
        input_id = tokenizer.encode(i, return_tensors='pt')
        
        with torch.no_grad():
            output = model(input_id, output_hidden_states=True)
            hidden_states = output.hidden_states
            last_hidden_state = hidden_states[-1]
            embedding = last_hidden_state.mean(dim=1).squeeze()
            embeddings.append(embedding)

    return embeddings 

In [248]:
test_cases = [
    [
        "the cat is sitting on the couch",
        "there is a cat on the couch", 
        "the ginger cat is sitting on top of the purple couch", 
        "cat on couch",
        "there is a cat on the couch ",
        "There is a cat on the Couch", 
        "The apple is red", 
        "There is a cat that is sitting on the couch"
    ],
    [
        "Design a 404 page with an illustration themed around space", 
        "404 page with space themed illustration", 
        "Space-themed 404 page design", 
        "Design a beautiful 404 page design, with black and white aesthetic, including an illustration based on space"
    ],
    [
        "404 page design with space theme",
        "About us page",
        "Landing page design with a fruit theme with images of apples, bananas, and other fruits",
        "Space themed 404 page design", 
        "About us page design", 
        "Landing page design with serene theme with images of beautiful landscapes and buildings" 
    ],
    [
        "404 page design with astronauts and planets",
        "Design a 404 page design with a space theme", 
        "Space themed 404 UI design with astronauts and planets", 
        "Create an attractive 404 page design with a space theme, including illustrations of astronauts and planets",
        "Design space themed 404 page design" 
    ]
]

for test in test_cases: 
    filtered_captions = filter_similar_captions(test, get_embeddings_gpt2, threshold=0.995)
    print("Test Captions: \n", test)
    print()
    print("Filtered Captions: \n", filtered_captions)
    print("Number of captions filtered: ", len(test) - len(filtered_captions))
    print() 
    print()


Test Captions: 
 ['the cat is sitting on the couch', 'there is a cat on the couch', 'the ginger cat is sitting on top of the purple couch', 'cat on couch', 'there is a cat on the couch ', 'There is a cat on the Couch', 'The apple is red', 'There is a cat that is sitting on the couch']

Filtered Captions: 
 ['the cat is sitting on the couch', 'cat on couch', 'There is a cat that is sitting on the couch']
Number of captions filtered:  5


Test Captions: 
 ['Design a 404 page with an illustration themed around space', '404 page with space themed illustration', 'Space-themed 404 page design', 'Design a beautiful 404 page design, with black and white aesthetic, including an illustration based on space']

Filtered Captions: 
 ['Design a 404 page with an illustration themed around space']
Number of captions filtered:  3


Test Captions: 
 ['404 page design with space theme', 'About us page', 'Landing page design with a fruit theme with images of apples, bananas, and other fruits', 'Space them

In [256]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def filter_similar_captions(captions, embeddings, distance_threshold=1.0):
    embeddings_array = np.array([emb.numpy() for emb in embeddings])
    
    distances = euclidean_distances(embeddings_array)
    
    mask = np.ones_like(distances, dtype=bool)
    np.fill_diagonal(mask, 0)  
    similar_pairs = distances < distance_threshold
    
    unique_captions = []
    used_indices = set()
    
    for i in range(len(captions)):
        if i not in used_indices:
            unique_captions.append(captions[i])
            used_indices.update(np.where(similar_pairs[i])[0])
    
    return unique_captions

for test in test_cases: 
    embeddings = get_embeddings_gpt2(test) 
    filtered_captions = filter_similar_captions(test, embeddings, distance_threshold=10.0)
    print("Test Captions: \n", test)
    print()
    print("Filtered Captions: \n", filtered_captions)
    print("Number of captions filtered: ", len(test) - len(filtered_captions))
    print() 
    print()

Test Captions: 
 ['the cat is sitting on the couch', 'there is a cat on the couch', 'the ginger cat is sitting on top of the purple couch', 'cat on couch', 'there is a cat on the couch ', 'There is a cat on the Couch', 'The apple is red', 'There is a cat that is sitting on the couch']

Filtered Captions: 
 ['the cat is sitting on the couch', 'the ginger cat is sitting on top of the purple couch', 'cat on couch', 'There is a cat on the Couch', 'The apple is red', 'There is a cat that is sitting on the couch']
Number of captions filtered:  2


Test Captions: 
 ['Design a 404 page with an illustration themed around space', '404 page with space themed illustration', 'Space-themed 404 page design', 'Design a beautiful 404 page design, with black and white aesthetic, including an illustration based on space']

Filtered Captions: 
 ['Design a 404 page with an illustration themed around space', '404 page with space themed illustration', 'Space-themed 404 page design', 'Design a beautiful 404 p

#### Test Fine-tuned GPT2 efficiency

In [257]:
sample_captions = [
    "the cat is sitting on the couch",
    "there is a cat on the couch", 
    "the ginger cat is sitting on top of the purple couch", 
    "cat on couch",
    "there is a cat on the couch ",
    "There is a cat on the Couch", 
    "The apple is red", 
    "There is a cat that is sitting on the couch"
]

In [258]:
# Testing time taken 
start = time() 
embedding = get_embeddings_gpt2(sample_captions[0])
print("Time taken to generate one embedding: ", time() - start)

start_batch = time() 
embeddings = get_embeddings_gpt2(sample_captions)
print(f"Time taken to generate {len(sample_captions)} embeddings: ", time() - start_batch)

Time taken to generate one embedding:  0.935096025466919
Time taken to generate 8 embeddings:  0.3150668144226074


In [259]:
# Testing memory consumed
process = psutil.Process(os.getpid())

mem_before = process.memory_info().rss
embeddings = get_embeddings_gpt2(prompt_samples)

print(f"Memory used: {process.memory_info().rss - mem_before} bytes")

Memory used: 1470464 bytes
