In [1]:
!pip install -q chromadb openai python-dotenv

In [2]:
!pip install pypdf



In [3]:
import os
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from transformers import pipeline
from google.colab import userdata
from openai import OpenAI


In [4]:
# Copy the key returned by userdata.get("CHATGPTKEY") into this process's environment
os.environ.update(CHATGPTKEY=userdata.get("CHATGPTKEY"))

# Build the module-level OpenAI client from that environment variable
client = OpenAI(api_key=os.getenv("CHATGPTKEY"))

In [5]:
def openai_chat_completion(prompt: str, model="gpt-4o-mini", system_prompt=None):
    """Send a single-turn prompt to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model passed to the API.
        system_prompt: Optional text for a system message. When None or empty,
            no system message is sent.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or an empty string when the response carries no text content.
    """
    # Every message must carry a "content" field, so the system message is
    # only included when there is actual text to put in it.
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.3,
        max_tokens=512,
    )
    # message.content can be None; avoid calling .strip() on it.
    content = response.choices[0].message.content
    return (content or "").strip()

In [124]:
def load_multiple_pdfs(folder_path: str, chunk_size: int = 2000, chunk_overlap: int = 20):
    """Read every PDF in a folder and split each page's text into chunks.

    Args:
        folder_path: Directory whose PDF files are loaded (not recursive).
        chunk_size: `chunk_size` passed to the text splitter.
        chunk_overlap: `chunk_overlap` passed to the text splitter.

    Returns:
        A pair `(chunks, metadata)` of equal-length lists. `metadata[i]` is a
        dict with the `source` file name and 1-based `page` number that
        `chunks[i]` came from. Pages with no extractable text are skipped.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # os.listdir returns names in arbitrary order; sorting keeps the chunk
    # order stable between runs. The extension is compared case-insensitively
    # so files such as "REPORT.PDF" are not silently skipped.
    file_list = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )

    all_chunks, metadata_list = [], []
    for file_name in file_list:
        reader = PdfReader(os.path.join(folder_path, file_name))
        for page_num, page in enumerate(reader.pages, start=1):
            raw_text = page.extract_text()
            if not raw_text:
                continue
            for chunk in splitter.split_text(raw_text):
                all_chunks.append(chunk)
                metadata_list.append({"source": file_name, "page": page_num})
    return all_chunks, metadata_list


In [107]:
def create_vector_db_with_metadata(docs, metadata, collection_name="multi-pdf-collection"):
    """Store text chunks and their metadata in a Chroma collection.

    Args:
        docs: Text chunks to index.
        metadata: One metadata dict per chunk, in the same order as `docs`.
        collection_name: Name of the collection to create or reuse.

    Returns:
        The Chroma collection holding the chunks.

    Raises:
        ValueError: If `docs` and `metadata` differ in length.
    """
    docs = list(docs)
    metadata = list(metadata)
    if len(docs) != len(metadata):
        raise ValueError(
            f"docs and metadata must have the same length "
            f"(got {len(docs)} and {len(metadata)})"
        )

    embedding_function = SentenceTransformerEmbeddingFunction()
    # Named chroma_client so it is not confused with the module-level OpenAI `client`.
    chroma_client = chromadb.Client()
    collection = chroma_client.create_collection(
        name=collection_name, embedding_function=embedding_function, get_or_create=True
    )

    # Nothing to index: return the (possibly pre-existing) collection untouched.
    if not docs:
        return collection

    # Ids are positional, and the collection may already exist (get_or_create),
    # so upsert is used to overwrite entries that share an id instead of
    # colliding with them. Ids beyond len(docs) from an earlier, larger load
    # are not removed here.
    ids = [str(i) for i in range(len(docs))]
    collection.upsert(ids=ids, documents=docs, metadatas=metadata)
    return collection

In [108]:
def augment_query(query: str):
    """Ask the LLM about `query` and return its reply as a list of cleaned lines.

    The reply is split on newlines; leading/trailing bullet characters
    ("-", "•"), spaces and other whitespace are removed from each line, and
    lines that end up empty are dropped.

    NOTE(review): the prompt asks the model for a learning roadmap rather than
    for alternative phrasings of the query, yet the notebook uses the returned
    lines as extra retrieval queries. Confirm that this is the intended prompt.

    Args:
        query: The user's question, interpolated at the end of the prompt.

    Returns:
        A list of non-empty strings, one per meaningful line of the reply.
    """
    prompt = f"""You are a dedicated Problem Solving and Competitive Programming Assistant. Your goal is to help users become better problem solvers and competitive programmers. When a user asks you for a roadmap or guidance on a specific topic, your task is to:

Identify their current skill level and goals (if not provided, ask them briefly).

Provide a structured, personalized learning roadmap for the topic or goal they mention.

Include key resources, recommended problems, and platforms (e.g., Codeforces, LeetCode, AtCoder).

Highlight common weaknesses related to the topic and how to overcome them.

Suggest daily or weekly practice strategies to build consistency and long-term improvement.

Always focus on guiding the user toward their next steps with clarity, encouragement, and technical accuracy.

 :That is the user's query: {query}"""
    output = openai_chat_completion(prompt)
    # Filter AFTER stripping bullets: a line such as "---" or "-" is non-blank
    # before cleaning but empty afterwards, and must not be returned as "".
    cleaned = (line.strip("-• ").strip() for line in output.split("\n"))
    return [line for line in cleaned if line]

In [109]:
def retrieve_documents_with_metadata(collection, queries, n_results=5):
    """Query the collection with several texts and merge the hits without duplicates.

    Args:
        collection: Object exposing `query(query_texts=..., n_results=..., include=...)`
            that returns a dict with per-query `documents` and `metadatas` lists.
        queries: Query texts; each one is searched with the same `n_results`.
        n_results: Number of hits requested per query.

    Returns:
        A list of dicts with keys `content`, `source` and `page`, in the order
        the hits were returned, keeping only the first occurrence of each
        distinct `content`. `source` falls back to "unknown" and `page` to "?"
        when the metadata is missing. An empty list is returned when `queries`
        is empty.
    """
    # Nothing to search for: skip the backend call entirely.
    if not queries:
        return []

    results = collection.query(
        query_texts=queries, n_results=n_results, include=["documents", "metadatas"]
    )

    seen, unique = set(), []
    for docs, metas in zip(results["documents"], results["metadatas"]):
        for doc, meta in zip(docs, metas):
            if doc in seen:
                continue
            seen.add(doc)
            # A metadata entry may be None; treat it as empty so the
            # fallbacks below apply instead of raising AttributeError.
            meta = meta or {}
            unique.append({
                "content": doc,
                "source": meta.get("source", "unknown"),
                "page": meta.get("page", "?"),
            })
    return unique


In [110]:
def generate_answer_with_memory(query, retrieved_docs, history=None):
    """Build one prompt from chat history, retrieved context and the question, then ask the LLM.

    Args:
        query: The question placed after the context.
        retrieved_docs: Dicts with `source`, `page` and `content` keys; each
            becomes one "[source, page N] content" paragraph of the context.
        history: Optional sequence of earlier turns. Even positions are
            labelled "User", odd positions "Assistant".

    Returns:
        Whatever `openai_chat_completion` returns for the assembled prompt.
    """
    context_blocks = []
    for entry in retrieved_docs:
        context_blocks.append(
            f"[{entry['source']}, page {entry['page']}] {entry['content']}"
        )
    context = "\n\n".join(context_blocks)

    # Speaker label is chosen by the turn's position: 0 -> User, 1 -> Assistant, ...
    speakers = ("User", "Assistant")
    transcript = "".join(
        f"{speakers[idx % 2]}: {message}\n"
        for idx, message in enumerate(history or [])
    )

    full_prompt = f"{transcript}\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
    return openai_chat_completion(full_prompt)

In [131]:
# Load and chunk every PDF found in the data folder.
chunks, metadata = load_multiple_pdfs(folder_path="/content/data")




In [133]:
# Index the chunks together with their source/page metadata.
collection = create_vector_db_with_metadata(docs=chunks, metadata=metadata)


In [134]:
# No earlier conversation turns yet.
history = []

user_query = "what is the objective of norhan's project"

# Cleaned lines of the LLM reply, used later as additional retrieval queries.
augmented = augment_query(query=user_query)


In [135]:
# Search with the original question first, followed by the augmented lines.
queries = [user_query, *augmented]



In [136]:
# Fetch the de-duplicated hits for all queries.
retrieved = retrieve_documents_with_metadata(collection=collection, queries=queries)


In [137]:
# Answer the question from the retrieved context (and any prior turns).
answer = generate_answer_with_memory(
    query=user_query,
    retrieved_docs=retrieved,
    history=history,
)
print("🤖 Answer:\n", answer)

🤖 Answer:
 The objective of Norhan's project is to develop a system that assists users, particularly women, in identifying their body types and providing personalized clothing recommendations based on their individual characteristics. The project aims to enhance the shopping experience by offering suggestions for styles that suit different body types, taking into account various factors such as skin color, accessories, and specific needs (e.g., for pregnant women or individuals of varying heights). Additionally, the project seeks to minimize user effort in taking measurements by enabling the system to extract measurements from a scanned picture of the user. Ultimately, the goal is to help users visualize how clothes will look on them by incorporating features like a 3D model of their body wearing selected clothing items. The project also intends to expand its capabilities to include recommendations for men and children.
