# Add posts and find similar ones


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
from pathlib import Path

from be.data.utils import get_resource
from be.data.vector_db import VectorDB
from be.social.insta.loader import InstaLoader
from langchain_core.documents import Document
from loguru import logger as lg
from py3langid.langid import MODEL_FILE, LanguageIdentifier
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from be.social.insta.structures import PostIg

In [None]:
# Folder with the downloaded posts; the loading loop further down treats each
# entry name in it as a post shortcode.
posts_fol = get_resource("ig_fol") / "posts"
# Directory handed to VectorDB as its persist_directory.
chroma_fol = get_resource("chroma_fol") / "vector"

In [None]:
# Sentence-transformers embedding model ("all-MiniLM-L6-v2"); it is passed to
# VectorDB below as its embedding_function.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Vector store wrapper, persisted under chroma_fol and embedding documents
# with embedding_function.
vb = VectorDB(
    persist_directory=str(chroma_fol),
    embedding_function=embedding_function,
)

In [None]:
# Loader used below to fetch posts by shortcode.
# NOTE(review): the meaning of the empty-string argument is not visible
# here — confirm against InstaLoader's constructor.
il = InstaLoader("")

In [None]:
def post_to_hashtag_docs(
    post: PostIg, *, max_fallback_words: int = 5
) -> list[Document]:
    """Convert a post into one ``Document`` per hashtag.

    When the post has no caption hashtags, the first ``max_fallback_words``
    whitespace-separated words of the caption are used as stand-ins.

    Args:
        post: Post to convert. Its ``caption_hashtags``, ``caption`` and
            ``shortcode`` attributes are read.
        max_fallback_words: How many leading caption words to use when the
            post has no hashtags. Defaults to 5.

    Returns:
        One document per hashtag (or fallback word), each carrying the post
        shortcode and ``"source": "caption_hashtags"`` as metadata. The list
        is empty when the post has neither hashtags nor caption text.
    """
    if post.caption_hashtags:
        chs = post.caption_hashtags
    else:
        # A missing caption is treated like an empty one instead of failing
        # on ``None.split()``.
        chs = (post.caption or "").split()[:max_fallback_words]
    # Every document gets its own metadata dict so that editing one
    # document's metadata later cannot leak into its siblings.
    return [
        Document(
            page_content=ch,
            metadata={"shortcode": post.shortcode, "source": "caption_hashtags"},
        )
        for ch in chs
    ]

In [None]:
# Accumulators for the documents built from each post.
doc_captions = []
doc_hashtags = []

# Materialise the listing so the total is available for the progress log.
posts_fol_iter = list(posts_fol.iterdir())
for ip, post_fol in enumerate(posts_fol_iter):
    # the entry name is used as the post shortcode
    shortcode = post_fol.name
    lg.info(f"{ip}/{len(posts_fol_iter)} {post_fol} {shortcode}")

    # load the post
    ps = il.load_post(shortcode)

    # metadata attached to the caption document
    doc_meta = {"shortcode": shortcode, "source": "caption"}
    # one document holding the whole caption
    doc_captions.append(Document(page_content=ps.caption, metadata=doc_meta))
    # one document per hashtag (or fallback caption word)
    doc_hashtags.extend(post_to_hashtag_docs(ps))

    # NOTE(review): this break stops after the first entry, so only one post
    # is loaded; presumably a debugging limit — remove it to load every post.
    # `shortcode` stays bound afterwards and is reused by the final query.
    break

In [None]:
# Peek at the first hashtag document (raises IndexError if none were built).
doc_hashtags[0]

In [None]:
# Store the caption documents and display what add_documents returned
# (presumably the ids of the stored documents — confirm against VectorDB).
new_ids = vb.add_documents(doc_captions)
new_ids

In [None]:
# Store the hashtag documents; note this overwrites new_ids from the
# caption cell.
new_ids = vb.add_documents(doc_hashtags)
new_ids

In [None]:
# Hard-coded id looked up in the next cell.
# NOTE(review): presumably the id of a previously stored document — confirm.
hash_ = "5271646f9e574f0b5a5dcdf41c4e707aeff8d3b444a34e29dea277a7cd6ece05"

In [None]:
# Look up the entry stored under hash_.
# NOTE(review): include=[] presumably asks the store to return no payload
# fields beyond the ids — confirm against VectorDB.get.
# Earlier probe with a dummy id, kept for reference:
# vb.get(ids="aaa")
vb.get(ids=hash_, include=[])

In [None]:
# Flat two-key filter, kept for reference only; the active query below
# expresses the same two conditions with an explicit "$and".
# NOTE(review): presumably the backing store rejects a where-dict with more
# than one top-level key — confirm against VectorDB / the store's docs.
# vb.get(
#     where={
#         "shortcode": shortcode,
#         "source": "caption_hashtags",
#     }
# )

# Fetch the hashtag documents of one post; `shortcode` is the value left
# bound by the loading loop above.
vb.get(
    where={
        "$and": [
            {"shortcode": shortcode},
            {"source": "caption_hashtags"},
        ]
    }
)