In [None]:
# Step 1: Install RAG prototype dependencies
# Run this cell in your Jupyter notebook. It may take 1-3 minutes.

import sys
!{sys.executable} -m pip install --upgrade pip

# Core libs
!{sys.executable} -m pip install "langchain>=0.0.200" langchain-openai langchain-neo4j neo4j neo4j-driver

# Document loading and PDF parsing
!{sys.executable} -m pip install pypdf langchain-io

# Optional: community loaders (if needed)
!{sys.executable} -m pip install langchain-community

# Embeddings & FAISS (cpu)
!{sys.executable} -m pip install openai faiss-cpu

# Small convenience libs
!{sys.executable} -m pip install python-dotenv

print("Install commands finished. Now running quick import tests...")




In [6]:
# Step 1b: Quick import test to confirm everything installed
#
# Each required top-level module is imported in turn. A failure of any kind is
# recorded (module name -> error text) instead of aborting, so a single run
# reports every missing or broken dependency at once.
errors = {}
for _module_name in ("langchain", "neo4j", "pypdf", "openai", "faiss"):
    try:
        # Bind the module under its own name, exactly as `import <name>` would.
        globals()[_module_name] = __import__(_module_name)
    except Exception as exc:
        errors[_module_name] = str(exc)

print(
    "All imports OK ✅ - environment ready for RAG prototype."
    if not errors
    else f"IMPORT ERRORS: {errors}"
)


All imports OK ✅ - environment ready for RAG prototype.


In [8]:
# Step 4 — Test Neo4j Aura connection
#
# Credentials are taken from the environment (NEO4J_URI / NEO4J_USER /
# NEO4J_PASSWORD) and, when a value is not set, from an interactive prompt.
# They are deliberately NOT written into the notebook: notebook source and
# outputs get shared and committed, which leaks any literal secret.
# Any password that was ever saved in a notebook should be rotated.

import os
import traceback
from getpass import getpass

from neo4j import GraphDatabase, basic_auth

NEO4J_URI = os.environ.get("NEO4J_URI") or input("Neo4j URI (neo4j+s://...): ").strip()
NEO4J_USER = os.environ.get("NEO4J_USER") or "neo4j"
# getpass() keeps the typed password out of the cell output.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

driver = None
try:
    # IMPORTANT: removed encrypted=True for Aura
    driver = GraphDatabase.driver(
        NEO4J_URI,
        auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD)
    )

    with driver.session() as session:
        # Read check: ask the server to describe itself.
        info = session.run(
            "CALL dbms.components() YIELD name, versions, edition RETURN name, versions, edition"
        ).single()

        print("DB Info:", dict(info))

        # Test writing
        session.run("CREATE (t:TestNode {created: datetime()})")
        count = session.run("MATCH (t:TestNode) RETURN count(t) AS c").single().get("c")
        print("TestNode count:", count)

        # Cleanup (removes every :TestNode, not only the one created above)
        session.run("MATCH (t:TestNode) DETACH DELETE t")
        print("Cleanup complete.")

    print("\nConnected to Neo4j Aura successfully! ✅")
except Exception:
    # Best-effort diagnostic cell: report the failure but do not stop the notebook.
    print("❌ Connection failed")
    traceback.print_exc()
finally:
    # `driver` stays None when GraphDatabase.driver() itself raised.
    if driver:
        driver.close()


  warn(


DB Info: {'name': 'Neo4j Kernel', 'versions': ['5.27-aura'], 'edition': 'enterprise'}
TestNode count: 1
Cleanup complete.

Connected to Neo4j Aura successfully! ✅


In [9]:
import os

# Show what sits in the working directory: a header line followed by one
# " - <name>" line per directory entry.
print(
    "Files in current directory:",
    *(f" - {entry}" for entry in os.listdir(os.curdir)),
    sep="\n",
)


Files in current directory:
 - .ipynb_checkpoints
 - Rag Chat Bot.ipynb
 - Web Dev using AI Course Content.pdf


In [11]:
# Create the local .env file WITHOUT storing any secret in the notebook.
#
# The values are typed at run time (getpass hides the secret ones) and written
# straight to .env, so neither the cell source nor its output contains them.
# Keep .env out of version control, and rotate any key or password that was
# previously saved inside a notebook.
from getpass import getpass
from pathlib import Path

env_values = {
    "OPENAI_API_KEY": getpass("OpenAI API key: ").strip(),
    "NEO4J_URI": input("Neo4j URI (neo4j+s://...): ").strip(),
    "NEO4J_USER": input("Neo4j user [neo4j]: ").strip() or "neo4j",
    "NEO4J_PASSWORD": getpass("Neo4j password: ").strip(),
}

# Refuse to write a half-filled file: later cells require all four values.
missing_keys = [key for key, value in env_values.items() if not value]
if missing_keys:
    raise ValueError(f"No value entered for: {', '.join(missing_keys)}")

env_path = Path(".env")
action = "Overwriting" if env_path.exists() else "Writing"
env_path.write_text(
    "".join(f"{key}={value}\n" for key, value in env_values.items()),
    encoding="utf-8",
)
print(f"{action} {env_path}")


Overwriting .env


In [12]:
# Load the variables from .env into the process environment, then report
# whether the OpenAI key is present (the key itself is never printed) and
# which Neo4j URI was picked up.
import os

from dotenv import load_dotenv

load_dotenv()

print(f"Loaded OPENAI_API_KEY: {'OPENAI_API_KEY' in os.environ}")
print(f"Neo4j URI: {os.environ.get('NEO4J_URI')}")


Loaded OPENAI_API_KEY: True
Neo4j URI: neo4j+s://d333bb5f.databases.neo4j.io


In [15]:
!pip install langchain-text-splitters




In [16]:
# Ingest PDF -> chunks -> embeddings -> FAISS -> Neo4j
#
# Pipeline:
#   1. read every page of PDF_FILENAME and join the page texts,
#   2. split the text into overlapping chunks,
#   3. embed the chunks with OpenAI and store the vectors in a flat L2 FAISS
#      index (row i of the index corresponds to entry i of metadata.json),
#   4. upsert one :DocChunk node per chunk into Neo4j.
#
# The cell is safe to re-run: chunk ids are derived from (PDF name, chunk
# index) and nodes are merged on (source_pdf, chunk_index), so a second run
# updates the existing nodes instead of adding duplicates.
import os, uuid, json, traceback
from pathlib import Path
import numpy as np
from neo4j import GraphDatabase, basic_auth

# config
PDF_FILENAME = "Web Dev using AI Course Content.pdf"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200
EMBEDDING_MODEL = "text-embedding-3-small"
FAISS_DIR = "faiss_store"
BATCH = 50

# creds
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

if not (OPENAI_API_KEY and NEO4J_URI and NEO4J_USER and NEO4J_PASSWORD):
    raise RuntimeError("Missing credentials. Run load_dotenv().")

# Fail early with a clear message instead of deep inside the PDF loader.
if not Path(PDF_FILENAME).is_file():
    raise FileNotFoundError(f"PDF not found: {Path(PDF_FILENAME).resolve()}")

# --- IMPORTS (updated for LC 0.1+) ---
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import faiss

print("Loading PDF...")
loader = PyPDFLoader(PDF_FILENAME)
pages = loader.load()
full_text = "\n\n".join([p.page_content for p in pages])
print(f"Extracted {len(pages)} pages; total chars: {len(full_text)}")

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)
chunks = splitter.split_text(full_text)
print(f"Split into {len(chunks)} chunks")

# Without this guard an empty extraction would surface later as an
# IndexError on vectors[0].
if not chunks:
    raise RuntimeError(f"No text could be extracted from {PDF_FILENAME!r}; nothing to ingest.")

# (text, metadata) pairs. uuid5 gives the same chunk_id for the same PDF name
# and chunk index on every run, which keeps re-ingestion idempotent.
docs = [
    (txt, {
        "chunk_index": i,
        "chunk_id": str(uuid.uuid5(uuid.NAMESPACE_URL, f"{PDF_FILENAME}#chunk-{i}")),
        "source_pdf": PDF_FILENAME
    })
    for i, txt in enumerate(chunks)
]

print("Creating embeddings (OpenAI)...")
# OPENAI_API_KEY was read from os.environ above, so the client finds it there.
emb = OpenAIEmbeddings(model=EMBEDDING_MODEL)
vectors = emb.embed_documents([t for t,_ in docs])
dim = len(vectors[0])
print(f"Created {len(vectors)} embeddings (dim={dim})")

# build FAISS
xb = np.array(vectors).astype("float32")
index = faiss.IndexFlatL2(dim)
index.add(xb)

os.makedirs(FAISS_DIR, exist_ok=True)
faiss.write_index(index, os.path.join(FAISS_DIR, "index.faiss"))

# metadata.json is positional: entry i describes row i of the FAISS index.
with open(os.path.join(FAISS_DIR, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump([m for _,m in docs], f, indent=2)

print("FAISS index saved.")

# upload to Neo4j
# One UNWIND query per batch instead of one query per chunk.
UPSERT_CHUNKS_QUERY = """
    UNWIND $rows AS row
    MERGE (c:DocChunk {source_pdf: row.source_pdf, chunk_index: row.chunk_index})
    SET c.chunk_id = row.chunk_id,
        c.text_preview = row.text_preview
"""

driver = GraphDatabase.driver(NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD))
created = 0
neo4j_ok = False

print("Writing chunks to Neo4j...")
try:
    with driver.session() as session:
        for i in range(0, len(docs), BATCH):
            rows = [
                {**meta, "text_preview": txt[:2000]}
                for txt, meta in docs[i:i+BATCH]
            ]
            # The context manager closes the transaction if the batch fails.
            with session.begin_transaction() as tx:
                tx.run(UPSERT_CHUNKS_QUERY, rows=rows)
                tx.commit()
            created += len(rows)
    print(f"Created/updated {created} nodes in Neo4j.")
    neo4j_ok = True
except Exception:
    print("Neo4j write error:")
    traceback.print_exc()
finally:
    driver.close()

# Only claim success when the Neo4j upload actually went through.
if neo4j_ok:
    print("Ingestion complete! ✅")
else:
    print("Ingestion incomplete ❌ - FAISS index was saved, but the Neo4j upload failed (see traceback above).")


Loading PDF...
Extracted 132 pages; total chars: 170517
Split into 294 chunks
Creating embeddings (OpenAI)...
Created 294 embeddings (dim=1536)
FAISS index saved.
Writing chunks to Neo4j...
Created/updated 294 nodes in Neo4j.
Ingestion complete! ✅


In [21]:
# STEP 6c — Natural Chatbot Answer (no citations, smooth response)
#
# Flow: embed the user's question, take the TOP_K nearest chunks from the
# FAISS index, fetch their text from Neo4j, and ask the chat model to answer
# using only that text.
import os, json, numpy as np
import faiss
from neo4j import GraphDatabase, basic_auth
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings

FAISS_DIR = "faiss_store"
TOP_K = 5
EMBED_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-3.5-turbo"

# load FAISS + metadata
# metadata_list is positional: entry i describes row i of the FAISS index.
index = faiss.read_index(os.path.join(FAISS_DIR, "index.faiss"))
with open(os.path.join(FAISS_DIR, "metadata.json"), "r", encoding="utf-8") as f:
    metadata_list = json.load(f)

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# embedder
emb = OpenAIEmbeddings(model=EMBED_MODEL)

query = input("Ask anything based on the course:\n").strip()
if not query:
    # Stop here rather than sending an empty string to the embeddings API.
    raise ValueError("Please type a question before running this cell.")
qvec = emb.embed_query(query)
qvec_np = np.array(qvec).astype("float32").reshape(1, -1)

D, I = index.search(qvec_np, TOP_K)
# FAISS fills missing neighbours with -1 when the index holds fewer than
# TOP_K vectors; unfiltered, -1 would silently select the LAST chunk.
indices = [idx for idx in I[0].tolist() if 0 <= idx < len(metadata_list)]

# Fetch chunk text from Neo4j
NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD = (
    os.environ.get("NEO4J_URI"),
    os.environ.get("NEO4J_USER"),
    os.environ.get("NEO4J_PASSWORD"),
)

# Look chunks up by chunk_id (unique per ingested chunk) rather than by
# chunk_index, which is ambiguous if the same index exists on several nodes.
wanted_ids = [metadata_list[idx]["chunk_id"] for idx in indices]

# Context managers close the session and the driver even when a query raises.
with GraphDatabase.driver(NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD)) as driver:
    with driver.session() as session:
        result = session.run(
            "MATCH (c:DocChunk) WHERE c.chunk_id IN $ids "
            "RETURN c.chunk_id AS id, c.text_preview AS p",
            ids=wanted_ids,
        )
        preview_by_id = {rec["id"]: rec["p"] for rec in result}

# Keep FAISS ranking order; skip chunks that are missing or have no text so
# the join below never sees None.
chunks = [preview_by_id[cid] for cid in wanted_ids if preview_by_id.get(cid)]

context = "\n\n".join(chunks)

system_prompt = """
You are a friendly course assistant for a web development program.
Answer the user's questions in a simple, smooth, natural way—like a mentor explaining.
Do NOT mention chunks, retrieval, citations, or context.
Use ONLY the provided context. If something is missing, say "I’m not sure, the document didn’t mention that."
"""

user_prompt = f"""
User question: {query}

Course content context:
{context}

Now give a clear, straight-forward answer based ONLY on the above content.
"""

print("\nThinking...\n")

resp = client.chat.completions.create(
    model=LLM_MODEL,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ],
    temperature=0.3,
    max_tokens=300
)

answer = resp.choices[0].message.content
print("\n--- Answer ---\n")
print(answer)
print("\n--------------")


Ask anything based on the course:
 what are the best ai codng assisstants or tools?



Thinking...


--- Answer ---

Some popular AI coding assistants and tools include GitHub Copilot, Replit AI, and VS Code with AI extensions like GitHub Copilot, Codeium, and TabNine. These tools can suggest code, build entire projects, and provide real-time coding assistance to help you write code, solve problems, and guide you through challenges in your development projects.

--------------
