In [1]:
import pandas as pd

# Read the prompt table, keep only the first row seen for each distinct
# `prompt` value, and cap the working set at the first 100 remaining rows.
df = (
    pd.read_csv('../data/diffusion_prompts.csv')
    .drop_duplicates(subset='prompt', keep='first')
    .iloc[:100]
)

# Preview the first three rows.
df.head(3)

Unnamed: 0.1,Unnamed: 0,id,prompt,url,width,height,source_site
0,0,00000d0e-45cb-47b6-9f72-6a481e940d78,"man waking up, dark and still room, cinematic ...",https://krea-prod-v1-generations.s3.us-east-1....,512,512,stablediffusionweb.com
1,1,00001a8f-993f-4d69-8fd2-f7d69dc1e8ef,Yate con familia feliz navegando por el mar ca...,https://image.lexica.art/full_jpg/00001a8f-993...,640,640,lexica.art
2,2,00002cfc-8170-4a93-a1f8-aa5681cb5f71,"Many friendly alien race individuals. fantasy,...",https://image.lexica.art/full_jpg/00002cfc-817...,512,768,lexica.art


In [23]:
# Sanity check: display the (rows, columns) dimensions of the working frame.
df.shape

(100, 7)

In [24]:
from langchain.schema import Document

# Wrap each row of `df` in a Document: the prompt text is the page content,
# while the row's URL and source site travel along as metadata.
documents = [
    Document(
        page_content=prompt,
        metadata={'url': url, 'source_site': source_site},
    )
    for prompt, url, source_site in zip(df['prompt'], df['url'], df['source_site'])
]

In [25]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

from utils.Embeddings import EmbeddingModel

# Instantiate the embedding model imported from utils.Embeddings and use it
# to build a FAISS vector store over the prepared documents.
embeddings = EmbeddingModel()
db = FAISS.from_documents(documents=documents, embedding=embeddings)

# Print the underlying index's entry count (`ntotal`).
print(db.index.ntotal)

100


In [26]:
# Free-text query to look up in the vector store.
query = "man"

# Run a similarity search and print the text of the first returned document.
docs = db.similarity_search(query=query)
top_match = docs[0]
print(top_match.page_content)

Caucasian male, European male, wearing gold ornaments, looking into camera, photo realistic


In [27]:
# Expose the vector store through its retriever interface and run the
# previously defined `query` through it.
retriever = db.as_retriever()
docs = retriever.invoke(query)

# Print the whole first Document (content plus metadata), not just its text.
first_hit = docs[0]
print(first_hit)

page_content='Caucasian male, European male, wearing gold ornaments, looking into camera, photo realistic' metadata={'url': 'https://image.lexica.art/full_jpg/8c0f02fb-eb82-4c2d-939d-69ab0eb89967', 'source_site': 'lexica.art'}


In [28]:
# Write the vector store to disk under ../data/prompt_index.
db.save_local(folder_path="../data/prompt_index")