In [1]:
!pip install torch torchvision ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git

  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git 'C:\Users\singh\AppData\Local\Temp\pip-req-build-pxmjn_vy'



  Cloning https://github.com/openai/CLIP.git to c:\users\singh\appdata\local\temp\pip-req-build-pxmjn_vy
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


In [None]:
import os
import uuid

import pandas as pd
from groq import Groq
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, Filter, MatchValue
from qdrant_client.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer

In [6]:
# =========================
# SETUP
# =========================

# Text-embedding model used for every vector stored or queried in this notebook.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
# Qdrant client in local in-memory mode (no external server).
qdrant = QdrantClient(":memory:")

# Read the Groq key from the environment instead of writing a value into the
# notebook: a hard-coded assignment both leaks the secret when the notebook is
# shared and silently overwrites a correctly configured key.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment that launches "
        "Jupyter (or set it with %env) before running this cell."
    )
llm = Groq(api_key=groq_api_key)

# Names of the two Qdrant collections used throughout the notebook.
TEXT_COLLECTION = "study_text"
IMAGE_COLLECTION = "study_images"

In [7]:
# =========================
# COLLECTIONS
# =========================

# Both collections hold vectors produced by `encoder`, so take the vector size
# from the model itself instead of hard-coding it; the collections then stay
# consistent if the embedding model is swapped.
VECTOR_SIZE = encoder.get_sentence_embedding_dimension()

existing = [c.name for c in qdrant.get_collections().collections]

# Create each collection only when it is missing so the cell can be re-run.
for collection_name in (TEXT_COLLECTION, IMAGE_COLLECTION):
    if collection_name not in existing:
        qdrant.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
        )

print("‚úÖ All collections ready")

‚úÖ All collections ready


In [8]:
# =========================
# IMAGE INGESTION
# =========================

df_img = pd.read_csv("query.csv")

# A row without a compound label can be neither embedded nor named
# (calling .lower() on a missing value raises), so skip such rows.
image_rows = df_img.dropna(subset=["compoundLabel"])
image_labels = image_rows["compoundLabel"].astype(str).tolist()
image_refs = image_rows["image"].tolist()

# Encode all captions in one batched call instead of one model call per row.
image_vectors = encoder.encode(
    [f"chemical structure of {label}" for label in image_labels]
)

img_points = [
    PointStruct(
        # Deterministic id derived from the row content: re-running this cell
        # overwrites the same points instead of appending duplicates.
        id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{label}|{image_ref}")),
        vector=vector.tolist(),
        payload={
            "compound_name": label.lower(),
            "image": image_ref
        }
    )
    for label, image_ref, vector in zip(image_labels, image_refs, image_vectors)
]

# Nothing to send when the CSV yields no usable rows.
if img_points:
    qdrant.upsert(IMAGE_COLLECTION, img_points)
print(f"‚úÖ {len(img_points)} images ingested")

‚úÖ 986 images ingested


In [9]:
# =========================
# TEXT INGESTION
# =========================

df_text = pd.read_csv("chemistry3.csv")

# Missing CSV cells are read as NaN, and NaN is truthy, so a plain
# `not value` test does not skip them. Drop nulls explicitly, then drop
# blank strings as well.
text_rows = df_text.dropna(subset=["compoundLabel", "article"])
text_rows = text_rows[
    text_rows["compoundLabel"].astype(str).str.strip().ne("")
    & text_rows["article"].astype(str).str.strip().ne("")
]
text_labels = text_rows["compoundLabel"].astype(str).tolist()
articles = text_rows["article"].astype(str).tolist()

# Encode all articles in one batched call instead of one model call per row.
article_vectors = encoder.encode(articles)

txt_points = [
    PointStruct(
        # Deterministic id derived from the row content: re-running this cell
        # overwrites the same points instead of appending duplicates.
        id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{label}|{article}")),
        vector=vector.tolist(),
        payload={
            "compound_name": label.lower(),
            "features": article
        }
    )
    for label, article, vector in zip(text_labels, articles, article_vectors)
]

# Nothing to send when the CSV yields no usable rows.
if txt_points:
    qdrant.upsert(TEXT_COLLECTION, txt_points)
print(f"‚úÖ {len(txt_points)} text entries ingested")

‚úÖ 818 text entries ingested


In [10]:
# =========================
# MEMORY
# =========================

# Rolling log of the most recent user queries, oldest first.
USER_MEMORY = []
# Upper bound on how many entries USER_MEMORY keeps.
MAX_MEMORY = 5


def store_user_memory(q):
    """Record a query in the rolling memory.

    The text is lower-cased and stripped before storing. A query identical to
    the most recent entry is ignored, and the oldest entry is discarded once
    the log grows past MAX_MEMORY.
    """
    normalized = q.lower().strip()
    repeats_latest = bool(USER_MEMORY) and USER_MEMORY[-1] == normalized
    if not repeats_latest:
        USER_MEMORY.append(normalized)
        if len(USER_MEMORY) > MAX_MEMORY:
            # Mutate in place so every reference to the list sees the change.
            del USER_MEMORY[0]


def get_user_memory():
    """Return a new list of the remembered queries, most recent first."""
    return list(reversed(USER_MEMORY))

In [27]:
# =========================
# SEARCH FUNCTIONS
# =========================

def search_by_text(query, top_k=1):
    """Find the text entries closest to ``query`` in TEXT_COLLECTION.

    Args:
        query: Free-text string to embed and search with.
        top_k: Maximum number of hits to request.

    Returns:
        A list of dicts with keys ``compound``, ``text`` and ``score``, one
        per hit, in the order the search returned them.
    """
    query_vector = encoder.encode(query).tolist()
    response = qdrant.query_points(
        TEXT_COLLECTION,
        query=query_vector,
        limit=top_k,
        with_payload=True,
    )

    results = []
    for point in response.points:
        stored = point.payload
        results.append({
            "compound": stored["compound_name"],
            "text": stored["features"],
            "score": point.score,
        })
    return results


def retrieve_images(query, top_k=3):
    """Return up to ``top_k`` image entries, one per compound, for ``query``.

    The search requests ``top_k * 3`` hits because several hits can share a
    compound name; only the highest-scoring hit per compound is kept.

    Args:
        query: Free-text string to embed and search with.
        top_k: Maximum number of distinct compounds to return.

    Returns:
        A list of dicts with keys ``compound``, ``image`` and ``score``
        (rounded to 3 decimals), ordered by first appearance in the hits.
    """
    hits = qdrant.query_points(
        IMAGE_COLLECTION,
        query=encoder.encode(query).tolist(),
        limit=top_k * 3,
        with_payload=True
    ).points

    best = {}        # compound name -> entry handed back to the caller
    raw_scores = {}  # compound name -> unrounded score of that entry
    for h in hits:
        name = h.payload["compound_name"]
        # Compare against the unrounded score: comparing with the rounded
        # value stored in the entry lets a slightly lower-scoring duplicate
        # replace the better hit (e.g. 0.6542 > round(0.6544, 3) == 0.654).
        if name not in raw_scores or h.score > raw_scores[name]:
            raw_scores[name] = h.score
            best[name] = {
                "compound": name,
                "image": h.payload["image"],
                "score": round(h.score, 3)
            }

    return list(best.values())[:top_k]


def filter_image_text_intersection(images):
    """Keep only the image entries whose compound also has a text entry.

    For each image entry, TEXT_COLLECTION is searched with the compound name,
    restricted by a payload filter to points whose ``compound_name`` equals
    that name. Without the filter the nearest text hit is accepted whatever
    compound it describes, so an image could be paired with the description
    of a different compound.

    Args:
        images: Iterable of dicts with keys ``compound``, ``image`` and
            ``score`` (the shape produced by ``retrieve_images``).

    Returns:
        A list of dicts with keys ``compound``, ``image``, ``image_score``,
        ``text`` and ``text_score`` for the entries that have matching text.
    """
    valid = []
    for img in images:
        compound = img["compound"]
        same_compound = Filter(
            must=[
                FieldCondition(
                    key="compound_name",
                    match=MatchValue(value=compound),
                )
            ]
        )
        hits = qdrant.query_points(
            TEXT_COLLECTION,
            query=encoder.encode(compound).tolist(),
            query_filter=same_compound,
            limit=1,
            with_payload=True
        ).points

        if hits:
            valid.append({
                "compound": compound,
                "image": img["image"],
                "image_score": img["score"],
                "text": hits[0].payload["features"],
                "text_score": hits[0].score
            })

    return valid


def choose_best_compound(valid):
    """Pick the candidate with the largest ``image_score + text_score``.

    On a tie the earliest candidate wins; an empty ``valid`` raises
    ``ValueError`` (from ``max``).
    """
    def combined_score(candidate):
        return candidate["image_score"] + candidate["text_score"]

    return max(valid, key=combined_score)


def recommend_related_compounds(compound, top_k=3):
    """Suggest up to ``top_k`` other compounds related to ``compound``.

    TEXT_COLLECTION is searched with the embedded ``compound`` string; a few
    extra hits (``top_k + 3``) are requested so that the compound itself and
    repeated names can be dropped.

    Args:
        compound: Compound name or free-text query to base suggestions on.
        top_k: Maximum number of names to return; values <= 0 yield ``[]``.

    Returns:
        A list of distinct compound names, excluding ``compound`` itself
        (compared case-insensitively, ignoring surrounding whitespace).
    """
    # An equality check on the result length never fires for top_k <= 0,
    # so handle that case up front.
    if top_k <= 0:
        return []

    hits = qdrant.query_points(
        TEXT_COLLECTION,
        query=encoder.encode(compound).tolist(),
        limit=top_k + 3,
        with_payload=True
    ).points

    # Stored names are lower-cased at ingestion, so normalise the input the
    # same way before excluding the compound from its own recommendations.
    target = compound.lower().strip()

    out, seen = [], set()
    for h in hits:
        name = h.payload["compound_name"]
        if name.lower().strip() == target or name in seen:
            continue
        seen.add(name)
        out.append(name)
        if len(out) >= top_k:
            break

    return out


In [29]:
# Score the top text-search hit must exceed for the query to be resolved to a
# known compound; at or below it the query cell falls back to a general LLM answer.
SIMILARITY_THRESHOLD = 0.65


In [31]:
# =========================
# RAG
# =========================

def rag_answer(query, chosen, memory):
    """Ask the LLM to answer ``query`` using only the chosen compound's text.

    Args:
        query: The user's question.
        chosen: Dict providing ``compound`` (name) and ``text`` (description).
        memory: Iterable of strings with earlier user queries; joined with
            ", " into the prompt.

    Returns:
        The content of the first completion choice returned by the model.
    """
    compound_name = chosen['compound']
    description = chosen['text']
    history = ', '.join(memory)

    grounded_prompt = f"""
You are a chemistry assistant.

Answer ONLY using the information below.

Compound: {compound_name}
Description:
{description}

User History:
{history}

Question:
{query}
"""
    conversation = [{"role": "user", "content": grounded_prompt}]
    completion = llm.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [49]:
query = "what is aldehyde?"

store_user_memory(f"user asked about: {query}")
memory = get_user_memory()

print("üß† MEMORY USED:", memory)


def general_chemistry_answer(question):
    """Ask the LLM for a general explanation of ``question``, with no retrieved context."""
    response = llm.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": f"Explain {question} in general chemistry terms."}]
    )
    return response.choices[0].message.content


# ==============================
# COMPOUND RESOLUTION
# ==============================

# Treat the query as naming a known compound only when the best text hit
# scores above the similarity threshold.
text_hits = search_by_text(query, top_k=1)

if text_hits and text_hits[0]["score"] > SIMILARITY_THRESHOLD:
    resolved_compound = text_hits[0]["compound"]
else:
    resolved_compound = None

print("Resolved compound:", resolved_compound)

# ==============================
# ANSWER GENERATION
# ==============================

# Image candidates exist only for a resolved compound; `valid` keeps the
# candidates that also have a text entry.
images = retrieve_images(resolved_compound, top_k=3) if resolved_compound else []
valid = filter_image_text_intersection(images) if resolved_compound else []

if valid:
    chosen = choose_best_compound(valid)
    answer = rag_answer(query, chosen, memory)
    base_for_recommendation = chosen["compound"]
else:
    # Single fallback for both "nothing resolved" and "resolved, but no
    # image/text pair": answer generally and show no images.
    answer = general_chemistry_answer(query)
    images = []
    base_for_recommendation = query


# ==============================
# RECOMMENDATIONS (run on every path)
# ==============================

recs = recommend_related_compounds(base_for_recommendation)

if recs:
    answer += "\n\nüìå Recommended Next Topics:\n"
    answer += "".join(f"- {r}\n" for r in recs)

print("\nüß† FINAL ANSWER:\n", answer)

if images:
    print("\nüñºÔ∏è TOP IMAGE MATCHES:")
    for img in images:
        print(f"- {img['compound']} | score={img['score']}")
        print(f"  Image: {img['image']}")


üß† MEMORY USED: ['user asked about: what is aldehyde?', 'user asked about: what is water?', 'user asked about: what is ibuprofen?', 'user asked about: what is nitrogen?', 'user asked about: what is amino acids?']
Resolved compound: None

üß† FINAL ANSWER:
 In general chemistry terms, an aldehyde is a type of organic compound that contains a functional group called a carbonyl group (-C=O) at the end of a carbon chain. The general structure of an aldehyde is R-CHO, where R is a hydrocarbon group (such as an alkyl or aryl group) and CHO is the aldehyde functional group.

The key characteristics of an aldehyde are:

1. The carbonyl group (-C=O) is bonded to a hydrogen atom (H) and a carbon atom (R).
2. The carbonyl group is located at the end of the carbon chain, meaning that there is no other carbon atom attached to the carbonyl carbon.
3. The aldehyde functional group is polar, meaning that it has a partial positive charge on the carbon atom and a partial negative charge on the oxygen