# 📘 University Chatbot – Faculty Vector DB with Gemini & Pinecone

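This notebook:
- Loads faculty profiles from a PDF, one text chunk per page
- Uses the Gemini API to extract structured metadata (name, designation, email, research area, …) from each page
- Embeds each page's text with a HuggingFace sentence-transformers model
- Stores the embeddings and their metadata in a Pinecone index
- Retrieves the best-matching faculty records for a user query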

In [1]:
# 📌 SECTION 1: Install Required Libraries
pip install -q langchain google-generativeai pinecone-client chromadb PyMuPDF transformers

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2607580720.py, line 2)

In [12]:
# 📌 SECTION 2: Set API Keys
import getpass
import os

import google.generativeai as genai
from pinecone import Pinecone, ServerlessSpec

# Never hardcode credentials in a notebook. Read them from the environment and
# fall back to an interactive prompt that does not echo the value.
# NOTE: any key that was previously committed in this cell must be treated as
# compromised and revoked/rotated in the provider's console.
for _secret_name in ("GOOGLE_API_KEY", "PINECONE_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass.getpass(f"Enter {_secret_name}: ")

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# ✅ Initialize the client
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "facultyrag"

# The index dimension must equal the output size of the embedding model used
# for upserts and queries. This notebook embeds with
# sentence-transformers/all-MiniLM-L6-v2, which produces 384-dimensional
# vectors; creating the index with any other dimension makes upserts fail.
embedding_dimension = 384

# ✅ Check if index exists and create if not
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# ✅ Connect to the index
index = pc.Index(index_name)


In [4]:
# 📌 SECTION 3: Extract Text from PDF
import fitz  # PyMuPDF

def extract_pdf_text(pdf_path):
    """Return the plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one string per page, in page order.
    """
    # Open the document as a context manager so the underlying file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return [page.get_text() for page in doc]

# Read the sample faculty PDF; every page becomes one entry in text_chunks.
faculty_pdf = "faculty_test.pdf"
text_chunks = extract_pdf_text(faculty_pdf)
print(f"Loaded {len(text_chunks)} pages.")

Loaded 3 pages.


In [20]:
import re
import json
import google.generativeai as genai

def _parse_json_object(cleaned):
    """Parse ``cleaned`` into a dict, tolerating prose around the JSON object.

    Raises:
        ValueError: if no JSON object can be parsed (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or the parsed value is not an object.
    """
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # The model sometimes adds a sentence before/after the JSON; retry on
        # the outermost {...} span before giving up.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(cleaned[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def extract_metadata_with_gemini(text, model_name="models/gemini-2.5-flash-preview-04-17"):
    """Ask Gemini to extract faculty metadata from ``text`` as a dict.

    Args:
        text: Raw text of one document chunk (here: one PDF page).
        model_name: Gemini model identifier passed to ``genai.GenerativeModel``.

    Returns:
        The parsed JSON object as a dict. An empty dict is returned when the
        text is blank, the API call fails, or the reply is not a JSON object,
        so callers can filter failures with a simple truthiness check.
    """
    # Nothing to extract from a blank page; skip the API round trip.
    if not text or not text.strip():
        return {}

    prompt = f"""
Extract the following fields in JSON format:
- name
- designation
- email
- research_area
- department (if present)
- employee_id
- orcid (if present)
- h_index
- patents
- books
- consultancy_projects

Text:
\"\"\"{text}\"\"\"

Respond ONLY with a valid JSON object, no explanation.
"""
    model = genai.GenerativeModel(model_name)

    # Keep API failures separate from parsing failures so the printed message
    # points at the real cause (quota, network, blocked response, ...).
    try:
        response = model.generate_content(prompt)
        raw = response.text.strip()
    except Exception as e:
        print("❌ Gemini API Error:", e)
        return {}

    # ✅ Remove Markdown code block wrappers like ```json and ```
    cleaned = re.sub(r"^```(?:json)?|```$", "", raw, flags=re.MULTILINE).strip()
    print("🧼 Cleaned Gemini Output:\n", cleaned)

    try:
        return _parse_json_object(cleaned)
    except ValueError as e:
        print("❌ JSON Parsing Error:", e)
        return {}


In [21]:
# Pair every page with the metadata Gemini extracted from it, then keep only
# the pages for which extraction returned a non-empty result.
page_metadata_pairs = (
    (page_text, extract_metadata_with_gemini(page_text))
    for page_text in text_chunks
)
faculty_docs = [
    (page_text, page_metadata)
    for page_text, page_metadata in page_metadata_pairs
    if page_metadata
]


🧼 Cleaned Gemini Output:
 {
  "name": "Dr. Asnath Victy Phamila Y",
  "designation": "Professor",
  "email": "asnathvicty.phamila@vit.ac.in",
  "research_area": "Digital Image Processing, Computer Vision, Sensor Networks, Deep Learning",
  "employee_id": "50590",
  "orcid": "https://orcid.org/0000-0002-5030-1165",
  "h_index": "6",
  "patents": [],
  "books": [],
  "consultancy_projects": [
    "Polygon Matching"
  ]
}
🧼 Cleaned Gemini Output:
 {}
🧼 Cleaned Gemini Output:
 {
  "name": null,
  "designation": null,
  "email": null,
  "research_area": [
    "Visual Sensor Networks",
    "Critical Infrastructure Protection",
    "IoT",
    "Object Detection"
  ],
  "department": null,
  "employee_id": null,
  "orcid": null,
  "h_index": null,
  "patents": null,
  "books": null,
  "consultancy_projects": null
}


In [None]:
# 📌 SECTION 5: Embed Documents and Upsert into Pinecone
import getpass
import json
import os

from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings import HuggingFaceEmbeddings

# sentence-transformers/all-MiniLM-L6-v2 produces 384-dimensional vectors; the
# index dimension has to match or every upsert/query is rejected.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIMENSION = 384
UPSERT_BATCH_SIZE = 100


def _to_pinecone_metadata(raw_metadata):
    """Convert extracted metadata into values Pinecone accepts.

    Pinecone metadata values must be strings, numbers, booleans or lists of
    strings. Null values are dropped, list items are coerced to strings, and
    any other structure is stored as a JSON string.
    """
    cleaned = {}
    for key, value in raw_metadata.items():
        if value is None:
            continue
        if isinstance(value, (str, int, float, bool)):
            cleaned[key] = value
        elif isinstance(value, (list, tuple)):
            cleaned[key] = [
                item if isinstance(item, str) else json.dumps(item, ensure_ascii=False)
                for item in value
                if item is not None
            ]
        else:
            cleaned[key] = json.dumps(value, ensure_ascii=False)
    return cleaned


# ✅ Create Pinecone client. The key comes from the environment (or a prompt
# that does not echo it) instead of being hardcoded in the notebook.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter PINECONE_API_KEY: ")
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "facultyrag"

# ✅ Create index only if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")  # use your actual region
    )
else:
    # Fail early with a clear message if a pre-existing index was created for
    # a different embedding size.
    existing_dimension = pc.describe_index(index_name).dimension
    if existing_dimension != EMBEDDING_DIMENSION:
        raise ValueError(
            f"Index '{index_name}' has dimension {existing_dimension}, but "
            f"{EMBEDDING_MODEL_NAME} produces {EMBEDDING_DIMENSION}-dimensional "
            "vectors. Delete/recreate the index or use a different index name."
        )

# ✅ Connect to index
index = pc.Index(index_name)

# ✅ Load embedding model
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print(f"Number of documents to upload: {len(faculty_docs)}")

# ✅ Upsert documents in batches rather than one network call per document
vectors = [
    {
        "id": f"faculty_{i}",
        "values": embedding_model.embed_query(doc),
        "metadata": _to_pinecone_metadata(metadata),
    }
    for i, (doc, metadata) in enumerate(faculty_docs)
]
for batch_start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[batch_start:batch_start + UPSERT_BATCH_SIZE])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

In [None]:
# 📌 SECTION 6: Ask Query and Retrieve from Pinecone
def search_faculty(query, top_k=3):
    """Embed ``query`` and return the ``top_k`` closest matches from the index.

    Args:
        query: Free-text question to embed with ``embedding_model``.
        top_k: Number of matches to request from the index (default 3).

    Returns:
        The ``'matches'`` entry of the index query response, with metadata
        included for every match.
    """
    response = index.query(
        vector=embedding_model.embed_query(query),
        include_metadata=True,
        top_k=top_k,
    )
    return response['matches']

# Example query: print the key metadata fields of each retrieved faculty record.
query = "Which faculty work in deep learning or image processing?"
matches = search_faculty(query)
for hit in matches:
    print("---")
    hit_metadata = hit["metadata"]
    print("Name:", hit_metadata.get("name"))
    print("Research Area:", hit_metadata.get("research_area"))
    print("Patents:", hit_metadata.get("patents", "N/A"))