In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

In [4]:
# Toy corpus used throughout the notebook: one dict per post holding the
# author ("user"), the free-text body ("content") and a list of topic "tags".
posts = [
    dict(
        user="Ahmed",
        content="Modern GPTs are tuned through RLHF to behave like humans prefer.",
        tags=["AI", "RLHF", "Ethics"],
    ),
    dict(
        user="Sara",
        content="I love experimenting with transformers for NLP tasks.",
        tags=["NLP", "Transformers", "Deep Learning"],
    ),
    dict(
        user="Omar",
        content="The new AI safety techniques are fascinating.",
        tags=["AI", "Safety", "Research"],
    ),
]

In [14]:
# Sentence-embedding model, selected by its model name.
client = SentenceTransformer("BAAI/bge-base-en-v1.5")

# One string per post. The tags are folded into the text so that they are
# embedded together with the content.
texts = [
    f"Tags: {', '.join(post['tags'])} | Content: {post['content']}"
    for post in posts
]

# Encode every text; row i of `embeddings` corresponds to posts[i].
embeddings = client.encode(texts)

# Keep each post's content and tags next to its embedding (stored as a plain
# Python list per row).
df = pd.DataFrame(
    dict(
        content=[post["content"] for post in posts],
        tags=[post["tags"] for post in posts],
        embedding=embeddings.tolist(),
    )
)

print(df.head())

                                             content  \
0  Modern GPTs are tuned through RLHF to behave l...   
1  I love experimenting with transformers for NLP...   
2      The new AI safety techniques are fascinating.   

                                 tags  \
0                  [AI, RLHF, Ethics]   
1  [NLP, Transformers, Deep Learning]   
2              [AI, Safety, Research]   

                                           embedding  
0  [0.028965046629309654, -0.007574356626719236, ...  
1  [0.0029659478459507227, -0.059286728501319885,...  
2  [0.020918486639857292, 0.029506340622901917, -...  


In [18]:
# `util` is also imported in the notebook's first cell; the import is kept
# here so this cell does not depend on that cell having been run.
from sentence_transformers import util

# Sanity check: cosine similarity between the embeddings of the first two
# posts (rows 0 and 1 of `embeddings`).
sim = util.cos_sim(*embeddings[:2])
print("Similarity between post 1 and 2:", float(sim))

Similarity between post 1 and 2: 0.5734729766845703


In [26]:
def search_posts(query, top_k=3, *, encoder=None, corpus=None):
    """Return the posts most similar to ``query``, best match first.

    The query is embedded and compared, by cosine similarity, against the
    vectors stored in the ``"embedding"`` column of the corpus. The ranking
    is computed with plain NumPy, so the function only needs an encoder and
    a DataFrame of posts.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int, default 3
        Maximum number of hits to return. Values larger than the corpus are
        clamped to the corpus size; values <= 0 yield an empty list.
    encoder : object with ``encode(list_of_str)``, optional
        Model used to embed the query. Defaults to the notebook-level
        ``client``.
    corpus : pandas.DataFrame, optional
        Frame with ``"content"``, ``"tags"`` and ``"embedding"`` columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    list of dict
        One ``{"content", "tags", "similarity"}`` dict per hit, ordered by
        descending cosine similarity.
    """
    # Fall back to the notebook globals only when no override is supplied.
    encoder = client if encoder is None else encoder
    corpus = df if corpus is None else corpus

    # Never ask for more hits than there are posts.
    top_k = min(top_k, len(corpus))
    if top_k <= 0:
        return []

    corpus_vecs = np.asarray(corpus["embedding"].tolist(), dtype="float32")
    query_vec = np.asarray(encoder.encode([query]), dtype="float32")[0]

    # L2-normalise both sides so the dot product is the cosine similarity.
    # The floor on the norm avoids dividing by zero for an all-zero vector.
    tiny = 1e-12
    corpus_norms = np.linalg.norm(corpus_vecs, axis=1, keepdims=True)
    corpus_vecs = corpus_vecs / np.maximum(corpus_norms, tiny)
    query_vec = query_vec / max(float(np.linalg.norm(query_vec)), tiny)

    scores = corpus_vecs @ query_vec
    # Stable sort on the negated scores: highest first, ties keep corpus order.
    ranked = np.argsort(-scores, kind="stable")[:top_k]

    return [
        {
            "content": corpus["content"].iloc[pos],
            "tags": corpus["tags"].iloc[pos],
            "similarity": float(scores[pos]),
        }
        for pos in ranked
    ]

# Demo query: fetch the best matches (default top_k) and print one hit per line.
results = search_posts("AI and safety research")
for hit in results:
    print(hit)


{'content': 'The new AI safety techniques are fascinating.', 'tags': ['AI', 'Safety', 'Research'], 'similarity': 0.8342117071151733}
{'content': 'I love experimenting with transformers for NLP tasks.', 'tags': ['NLP', 'Transformers', 'Deep Learning'], 'similarity': 0.595565676689148}
{'content': 'Modern GPTs are tuned through RLHF to behave like humans prefer.', 'tags': ['AI', 'RLHF', 'Ethics'], 'similarity': 0.568821370601654}
