# Add posts and find similar ones


In [None]:
# Enable IPython's autoreload extension so edited modules are re-imported
# before each cell runs (mode 2 covers all modules).
%load_ext autoreload
%autoreload 2

In [None]:
import json
from pathlib import Path

from be.data.utils import get_resource
from be.data.vector_db import VectorDB
from be.social.insta.loader import InstaLoader
from langchain_core.documents import Document
from loguru import logger as lg
from py3langid.langid import MODEL_FILE, LanguageIdentifier
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from be.social.insta.structures import PostIg

In [None]:
# folder whose entries are iterated below, one entry per post
posts_fol = get_resource("ig_fol") / "posts"
# folder handed to the vector DB as its persist directory
chroma_fol = get_resource("chroma_fol") / "vector"

In [None]:
# create the open-source embedding function (sentence-transformers model
# "all-MiniLM-L6-v2"); it is handed to the vector DB below
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# vector DB used to store and search the documents; configured with the
# persist directory and the embedding function defined above
vb = VectorDB(
    persist_directory=str(chroma_fol),
    embedding_function=embedding_function,
)

In [None]:
# loader used below to read posts by shortcode via `il.load_post`
# NOTE(review): the meaning of the empty-string argument is not visible here — confirm
il = InstaLoader("")

In [None]:
def post_to_hashtag_docs(post: PostIg) -> list[Document]:
    """Convert a post into one ``Document`` per unique hashtag.

    When the post has no caption hashtags, the first five whitespace-separated
    words of the caption are used as stand-in hashtags.

    Args:
        post: Post providing ``caption_hashtags``, ``caption`` and ``shortcode``.

    Returns:
        One document per distinct hashtag, in first-seen order. Each document
        gets its own metadata dict holding the post ``shortcode`` and
        ``"source": "caption_hashtags"``. The list is empty when the post has
        neither hashtags nor caption text.
    """
    if post.caption_hashtags:
        chs = post.caption_hashtags
    else:
        # a missing caption is treated like an empty one
        chs = (post.caption or "").split()[:5]
    # dict.fromkeys de-duplicates while keeping first-seen order; going through
    # a set would make the order of the documents change from run to run
    unique_chs = list(dict.fromkeys(chs))
    # build a fresh metadata dict per document so they never share state
    return [
        Document(
            ch,
            metadata={"shortcode": post.shortcode, "source": "caption_hashtags"},
        )
        for ch in unique_chs
    ]

In [None]:
# Build the two document collections that go into the vector DB:
# one document per caption and one document per hashtag.
doc_captions = []
doc_hashtags = []

posts_fol_iter = list(posts_fol.iterdir())
for ip, post_fol in enumerate(posts_fol_iter):
    # the entry name is used as the post shortcode
    shortcode = post_fol.name
    ps = il.load_post(shortcode)

    # the whole caption becomes a single document tagged with its origin
    doc_meta = {"shortcode": shortcode, "source": "caption"}
    caption_doc = Document(page_content=ps.caption, metadata=doc_meta)
    doc_captions.append(caption_doc)

    # the hashtags become one document each
    doc_hashtags += post_to_hashtag_docs(ps)

lg.info(f"Loaded {len(doc_captions)} captions and {len(doc_hashtags)} hashtags.")

In [None]:
# peek at the first hashtag document to check its content and metadata
doc_hashtags[0]

In [None]:
# add the caption documents to the vector DB
new_ids = vb.add_documents(doc_captions)
# show how many ids the call returned
len(new_ids)

In [None]:
# add the hashtag documents to the vector DB
new_ids = vb.add_documents(doc_hashtags)
# show how many ids the call returned
len(new_ids)

In [None]:
# id (64 hex characters) used for the lookup by id below
# NOTE(review): presumably the id of a document already stored in the DB — confirm how ids are derived
hash_ = "5271646f9e574f0b5a5dcdf41c4e707aeff8d3b444a34e29dea277a7cd6ece05"

In [None]:
# vb.get(ids="aaa")
# look up a document by id, passing an empty `include` list
# NOTE(review): presumably this returns only the ids, i.e. an existence check — confirm
vb.get(ids=hash_, include=[])

In [None]:
# Fetch the hashtag documents of the post whose shortcode is currently held
# in `shortcode`.
#
# Flat form of the filter, kept for reference:
# vb.get(
#     where={
#         "shortcode": shortcode,
#         "source": "caption_hashtags",
#     }
# )
# NOTE(review): presumably the flat form is rejected by the store and the
# conditions have to be combined with an explicit "$and" — confirm

vb.get(
    where={
        "$and": [
            {"shortcode": shortcode},
            {"source": "caption_hashtags"},
        ]
    }
)

## Find similar recipes


In [None]:
# free-text request used as the query in the searches below
query = "I would like to eat some eggplant based food."

### Using caption


In [None]:
# similarity search restricted to the caption documents
similar = vb.search(
    query,
    search_type="similarity",
    filter={"source": "caption"},
)

In [None]:
# show the documents returned by the caption search
similar

In [None]:
# same caption search, but each hit comes paired with its score
# NOTE(review): the score is presumably a distance (lower = closer) — confirm for this VectorDB
similar_with_score = vb.similarity_search_with_score(
    query,
    filter={"source": "caption"},
)
similar_with_score

### Using hashtags


In [None]:
# show the post currently held in `ps`; it is used as the reference post below
ps

In [None]:
# Test of the negative filter: search the hashtag documents while excluding
# the reference post, so its shortcode should not appear in the results.
shortcode = ps.shortcode

similar_with_score = vb.similarity_search_with_score(
    query,
    filter={
        "$and": [
            {"source": "caption_hashtags"},
            {"shortcode": {"$ne": shortcode}},
        ],
    },
)
similar_with_score

In [None]:
# should be similar to these hashtags
query_chs = ps.caption_hashtags
query_chs

In [None]:
from collections import Counter

# only hashtag documents, excluding those of the reference post
cond = {
    "$and": [
        {"source": "caption_hashtags"},
        {"shortcode": {"$ne": shortcode}},
    ],
}

# accumulated similarity per shortcode over all the query hashtags
counter = Counter()

for ch in query_chs:
    similar_with_score = vb.similarity_search_with_score(ch, filter=cond)
    lg.debug(f"{ch}")
    # Several hashtag documents of the same post can match one query hashtag.
    # Keep the best similarity per post: a plain dict comprehension would let
    # the last hit silently overwrite the earlier ones.
    upd = {}
    for sws in similar_with_score:
        lg.debug(sws)
        doc, score = sws
        post_code = doc.metadata["shortcode"]
        # NOTE(review): assumes the score is a distance, so that 1 - score
        # grows with similarity — confirm for this VectorDB
        similarity = 1 - score
        upd[post_code] = max(similarity, upd.get(post_code, similarity))
    counter.update(upd)

In [None]:
# five shortcodes with the highest accumulated score
counter.most_common(5)

In [None]:
# load the five top-ranked posts and log the caption of the second one
# NOTE(review): index 1 raises IndexError when fewer than two posts were found
ps_similar = [il.load_post(shortcode) for shortcode, _ in counter.most_common(5)]
lg.debug(ps_similar[1].caption)

### Using both


In [None]:
# Example multi-line queries. Each assignment overwrites the previous one,
# so only the last query is in effect when this cell has run.
long_query = """I would like to eat some eggplant based food.
I am looking for a recipe that is easy to make and delicious.
I would like to eat some high protein food."""

long_query = """I would like to eat some fish based food.
I am looking for a recipe that is easy to make and delicious.
I would like to eat some food that is low in sugar."""


long_query = """I would like to eat some meat based dish.
I am looking for a recipe that is easy to make and delicious.
I am not a vegetarian."""
# Chickpea and Sweet Potato Curry <- this is a bad match

In [None]:
# Run the similarity search on chunks of the query rather than on the whole text.
#
# Ways to split the query into chunks:
#   - split by new line (used here)
#   - split by sentence
#   - split by punctuation

# One chunk per non-blank line. Blank lines (e.g. from a trailing newline in
# a triple-quoted string) are dropped so that an empty string is never used
# as a search query.
query_chunks = [line.strip() for line in long_query.splitlines() if line.strip()]
query_chunks

In [None]:
# Rank the posts by how well their captions match the chunks of the query.
cond = {"source": "caption"}

# accumulated similarity per shortcode over all the query chunks
counter = Counter()

for chunk in query_chunks:
    similar_with_score = vb.similarity_search_with_score(chunk, filter=cond)
    lg.debug(f"{chunk}")
    for doc, score in similar_with_score:
        caption_begin = doc.page_content[:100].replace("\n", "")
        lg.debug(f"{score:.2f} {caption_begin}... ")
    # Accumulate 1 - score, as the hashtag search does: adding the raw score
    # and then calling most_common() would rank the worst matches first.
    # NOTE(review): assumes the score is a distance (lower = closer) — confirm
    upd = {doc.metadata["shortcode"]: 1 - score for doc, score in similar_with_score}
    counter.update(upd)

cmc = counter.most_common(5)
lg.info(cmc)
ps_similar = [il.load_post(shortcode) for shortcode, _ in cmc]
# log the caption of the second-ranked post, when there is one
if len(ps_similar) > 1:
    lg.info(ps_similar[1].caption)

In [None]:
# five shortcodes with the highest accumulated score
counter.most_common(5)

In [None]:
# The trick is probably in the negative filter:
# how can we recognize that an adjective is a negative one,
# e.g. "high protein" vs "low protein"?

# Also, we should probably normalize the results when using hashtags.