In [None]:
%%capture --no-stderr
%pip install pinecone
%pip install -U langchain_community tiktoken langchain_google_genai langchain langgraph  python-docx  docx2txt

In [3]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) before reading them below.
load_dotenv()

# Every setting below is None when its environment variable is not defined.

# Google Generative AI credentials
google_api_key = os.environ.get("GOOGLE_API_KEY")

# Relational database connection settings
host = os.environ.get("DATABASE_HOST")
port = os.environ.get("DATABASE_PORT")
user = os.environ.get("DATABASE_USER")
password = os.environ.get("DATABASE_PASSWORD")

# Pinecone credentials and the host of the index used by the cells below
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
index_host = os.environ.get("PINECONE_HOST")

In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model client, authenticated with the key read from the environment.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=google_api_key,
)

In [None]:
### Creating index in Pinecone
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index_name = "table-index"

# Embedding settings for the index: the hosted model to use and the record
# field ("chunk_text") that is mapped to the model's "text" input.
embed_config = {
    "model": "llama-text-embed-v2",
    "field_map": {"text": "chunk_text"},
}

# Create the index only when it does not exist yet, so re-running this cell is safe.
index_exists = pc.has_index(index_name)
if not index_exists:
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embed_config,
    )


In [5]:
# Natural-language description of every table in the university schema, keyed by
# table name. Each description is later used as the "chunk_text" of one record
# (see the cell that builds `formatted_records`).
table_descriptions = {
    "Departments": "The Departments table holds academic department information. Each department may be associated with multiple students, professors, and courses. Use it when filtering by department or analyzing department-specific activity.",

    "Students": "The Students table contains personal and academic enrollment details for students. Each student is associated with a department and can be enrolled in multiple courses. Use this table to filter students by department or analyze student enrollment.",

    "Professors": "The Professors table stores data about academic faculty. Each professor belongs to a department and may teach one or more courses. Use this when identifying course instructors or analyzing departmental faculty.",

    "Courses": "The Courses table contains the list of academic courses offered by departments. Each course is taught by a professor and belongs to a department. Use it for curriculum planning, instructor assignments, or course scheduling.",

    "Enrollments": "The Enrollments table tracks which students are enrolled in which courses, including the date of enrollment. Use it to analyze student course load, popularity of courses, or to join student and course data.",

    "Classrooms": "The Classrooms table defines physical classrooms, including building and room number, and capacity. It supports the scheduling system for courses. Use it to analyze classroom utilization or room assignments.",

    "Schedules": "The Schedules table links courses to classrooms and timeslots, including day of the week and start/end times. Use it to retrieve timetable information or detect scheduling conflicts.",

    "Grades": "The Grades table records student performance in courses by linking enrollment to a grade and grade point. Use it for GPA calculation, performance tracking, and academic reports.",

    "Assignments": "The Assignments table contains homework or project records tied to specific courses. Each assignment has a title, due date, and total marks. Use this table for academic workload or submission tracking.",

    "Submissions": "The Submissions table tracks which students submitted which assignments and the marks they received. Use this to evaluate assignment performance and submission timing.",

    "Clubs": "The Clubs table holds information about student clubs, including their faculty advisor and founding year. Use this when analyzing extracurricular involvement or managing student organizations.",

    "ClubMembers": "The ClubMembers table links students to clubs and records their role (e.g., President, Member). Use this table to retrieve club rosters, roles, or member count."
}


# Structured metadata for every table, keyed by the same table names as
# `table_descriptions`. Per table:
#   name         - table name (same as the dict key)
#   type         - kind of object; every entry here is "table"
#   joins_with   - names of the tables this one is related to
#   primary_keys - primary-key column name(s)
#   foreign_keys - "column → ReferencedTable(column)" strings; empty when there are none
#   topics       - free-text subject tags for the table
# These fields are merged into the record built for the table later in the file.
table_metadata = {
    "Departments": {
        "name": "Departments",
        "type": "table",
        "joins_with": ["Students", "Professors", "Courses"],
        "primary_keys": ["department_id"],
        "foreign_keys": [],
        "topics": ["academic departments", "head of department"]
    },
    "Students": {
        "name": "Students",
        "type": "table",
        "joins_with": ["Departments", "Enrollments", "Submissions", "ClubMembers"],
        "primary_keys": ["student_id"],
        "foreign_keys": ["department_id → Departments(department_id)"],
        "topics": ["student info", "enrollment", "academic records"]
    },
    "Professors": {
        "name": "Professors",
        "type": "table",
        "joins_with": ["Departments", "Courses", "Clubs"],
        "primary_keys": ["professor_id"],
        "foreign_keys": ["department_id → Departments(department_id)"],
        "topics": ["faculty", "academic staff", "course instructor"]
    },
    "Courses": {
        "name": "Courses",
        "type": "table",
        "joins_with": ["Departments", "Professors", "Enrollments", "Schedules", "Assignments"],
        "primary_keys": ["course_id"],
        "foreign_keys": [
            "department_id → Departments(department_id)",
            "professor_id → Professors(professor_id)"
        ],
        "topics": ["course catalog", "teaching", "curriculum"]
    },
    "Enrollments": {
        "name": "Enrollments",
        "type": "table",
        "joins_with": ["Students", "Courses", "Grades"],
        "primary_keys": ["enrollment_id"],
        "foreign_keys": [
            "student_id → Students(student_id)",
            "course_id → Courses(course_id)"
        ],
        "topics": ["course registration", "enrollment", "student load"]
    },
    "Classrooms": {
        "name": "Classrooms",
        "type": "table",
        "joins_with": ["Schedules"],
        "primary_keys": ["classroom_id"],
        "foreign_keys": [],
        "topics": ["physical space", "room capacity", "locations"]
    },
    "Schedules": {
        "name": "Schedules",
        "type": "table",
        "joins_with": ["Courses", "Classrooms"],
        "primary_keys": ["schedule_id"],
        "foreign_keys": [
            "course_id → Courses(course_id)",
            "classroom_id → Classrooms(classroom_id)"
        ],
        "topics": ["timetables", "class schedule", "room booking"]
    },
    "Grades": {
        "name": "Grades",
        "type": "table",
        "joins_with": ["Enrollments"],
        "primary_keys": ["grade_id"],
        "foreign_keys": ["enrollment_id → Enrollments(enrollment_id)"],
        "topics": ["marks", "GPA", "student performance"]
    },
    "Assignments": {
        "name": "Assignments",
        "type": "table",
        "joins_with": ["Courses", "Submissions"],
        "primary_keys": ["assignment_id"],
        "foreign_keys": ["course_id → Courses(course_id)"],
        "topics": ["homework", "projects", "due dates"]
    },
    "Submissions": {
        "name": "Submissions",
        "type": "table",
        "joins_with": ["Assignments", "Students"],
        "primary_keys": ["submission_id"],
        "foreign_keys": [
            "assignment_id → Assignments(assignment_id)",
            "student_id → Students(student_id)"
        ],
        "topics": ["student work", "marks obtained", "submission date"]
    },
    "Clubs": {
        "name": "Clubs",
        "type": "table",
        "joins_with": ["Professors", "ClubMembers"],
        "primary_keys": ["club_id"],
        "foreign_keys": ["faculty_advisor_id → Professors(professor_id)"],
        "topics": ["student organizations", "extracurriculars"]
    },
    "ClubMembers": {
        "name": "ClubMembers",
        "type": "table",
        "joins_with": ["Clubs", "Students"],
        "primary_keys": ["member_id"],
        "foreign_keys": [
            "club_id → Clubs(club_id)",
            "student_id → Students(student_id)"
        ],
        "topics": ["student roles", "club participation"]
    }
}


In [6]:
import json

# Build one record per table: a 1-based string id, the description as the
# text to embed ("chunk_text"), followed by the table's metadata fields.
formatted_records = []

for i, table_name in enumerate(table_descriptions, start=1):
    description = table_descriptions[table_name]

    # Copy so that adding "table_name" does not modify table_metadata itself.
    metadata = dict(table_metadata.get(table_name, {}))
    metadata["table_name"] = table_name  # for easier reference

    record = {"_id": str(i), "chunk_text": description}
    record.update(metadata)
    formatted_records.append(record)

# Pretty print the result
print(json.dumps(formatted_records, indent=4))


[
    {
        "_id": "1",
        "chunk_text": "The Departments table holds academic department information. Each department may be associated with multiple students, professors, and courses. Use it when filtering by department or analyzing department-specific activity.",
        "name": "Departments",
        "type": "table",
        "joins_with": [
            "Students",
            "Professors",
            "Courses"
        ],
        "primary_keys": [
            "department_id"
        ],
        "foreign_keys": [],
        "topics": [
            "academic departments",
            "head of department"
        ],
        "table_name": "Departments"
    },
    {
        "_id": "2",
        "chunk_text": "The Students table contains personal and academic enrollment details for students. Each student is associated with a department and can be enrolled in multiple courses. Use this table to filter students by department or analyze student enrollment.",
        "name": "Students"

In [None]:
### Inserting table descriptions into Pinecone index

from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(host=index_host)

# Namespace that holds the table-description records.
table_namespace = "table-details-university"
index.upsert_records(table_namespace, formatted_records)

In [None]:
# TESTING THE INDEX WITH THE QUERY

from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(host=index_host)

# Smoke-test retrieval of the table descriptions. The namespace must be the
# one the table records are upserted into ("table-details-university");
# searching any other namespace does not exercise the data written by this
# notebook.
# NOTE(review): freshly upserted records may take a moment to become
# searchable — re-run this cell if it returns no hits right after the upsert.
results = index.search(
    namespace="table-details-university",
    query={
        "inputs": {"text":  "Show each employee's full name and their hire date."},
        "top_k": 2
    },
)

In [None]:
# Print the stored fields of each hit, followed by a separator line.
for docs in results.result.hits:
    print(docs.fields, "==========", sep="\n")

In [9]:
from docx import Document
import re

def load_docx_text(filepath):
    """Return the text of a .docx file as a single newline-joined string.

    Paragraphs that are empty or contain only whitespace are left out; the
    remaining paragraphs keep their text exactly as stored in the document.
    """
    kept_lines = []
    for paragraph in Document(filepath).paragraphs:
        content = paragraph.text
        if content.strip():
            kept_lines.append(content)
    return '\n'.join(kept_lines)

def split_column_chunks(text):
    """Split `text` into one chunk per "Column:" entry.

    A chunk starts at an occurrence of "Column:" and runs up to (but not
    including) the next "Column:" or the end of the text. Anything before the
    first "Column:" is ignored. Each chunk is returned with surrounding
    whitespace stripped; the result is empty when no "Column:" is present.
    """
    # DOTALL lets a single entry span several lines.
    column_entry = re.compile(r"(Column:.*?)(?=Column:|\Z)", flags=re.DOTALL)

    chunks = []
    for match in column_entry.finditer(text):
        chunks.append(match.group(1).strip())
    return chunks

# === USAGE ===
# Location of the column-details document. Set the COLUMN_DETAILS_DOCX
# environment variable (e.g. in .env) to point at your own copy; when it is
# not set, the original hard-coded path is used.
file_path = os.getenv(
    "COLUMN_DETAILS_DOCX",
    "F:\\Panaversity\\langchain-academy\\module-2\\Querygpt\\ColumnDetails_university.docx",
)
doc_text = load_docx_text(file_path)
column_chunks = split_column_chunks(doc_text)

# === Output or save ===
# One record per column chunk: a 1-based string id plus the chunk text.
formatted_column_chunks = [
    {"_id": str(position), "chunk_text": column_chunk}
    for position, column_chunk in enumerate(column_chunks, 1)
]

print(formatted_column_chunks)


[{'_id': '1', 'chunk_text': 'Column: student_id   #### **Type:** SERIAL   #### **Table:** students  ### **Primary Key:** Yes  ### **Foreign Key:** No  #####\n **Description:** Auto-incremented unique identifier for each student. Used to reference students in related tables like enrollments, grades, and submissions.\n####'}, {'_id': '2', 'chunk_text': 'Column: first_name  ####  **Type:** VARCHAR(50)  ####  **Table:** students   ####  **Primary Key:** No   ####  **Foreign Key:** No  \n**Description:** Stores the given name of the student. Often used in conjunction with last_name for display or search purposes.\n####'}, {'_id': '3', 'chunk_text': 'Column: last_name  ####   **Type:** VARCHAR(50)  ####  **Table:** students ####  **Primary Key:** No  ####   **Foreign Key:** No  \n **Description:** Stores the family or surname of the student. Combined with first_name for identification in interfaces or documents.\n####'}, {'_id': '4', 'chunk_text': 'Column: email  ####   **Type:** VARCHAR(100

In [None]:
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(host=index_host)

# Namespace that holds the per-column description records.
column_namespace = "column-details-university"
index.upsert_records(column_namespace, formatted_column_chunks)

In [None]:
# Extract the example questions and SQL queries from the .docx file and split them into chunks


from docx import Document

def extract_text_from_docx(docx_path):
    """Return the non-empty paragraphs of a .docx file joined by newlines.

    Every paragraph is stripped of leading/trailing whitespace; paragraphs
    that are empty after stripping are dropped.
    """
    doc = Document(docx_path)
    stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
    return "\n".join(line for line in stripped_paragraphs if line)

def chunk_sql_queries_from_docx(text):
    """Turn "Question: ... SQL query: ..." text into one record per example.

    The text is split on "Question:"; every non-empty piece is then split at
    its first "SQL query:" marker into the question and the SQL statement.
    Splitting at the first marker only means an example whose SQL text
    happens to contain "SQL query:" again is kept instead of being dropped.

    Args:
        text: Full text containing the examples.

    Returns:
        A list of dicts with keys "_id" and "chunk_text", where "chunk_text"
        is "Question: <question>\\nSQL query: <sql>". "_id" is the 1-based
        position of the piece in the "Question:" split, as a string. When the
        text starts with "Question:" the first piece is empty, so the first
        record gets id "2"; skipped pieces leave gaps in the numbering.

    Pieces without a "SQL query:" marker are skipped, and a message naming
    the piece is printed.
    """
    marker = "SQL query:"
    chunks = []

    for i, chunk in enumerate(text.split("Question:"), start=1):
        body = chunk.strip()
        if not body:
            continue

        question_part, found, query_part = body.partition(marker)
        if not found:
            # Say which piece was dropped so the source document can be fixed.
            preview = body[:60]
            print(f"Skipping chunk {i} due to format issue (no 'SQL query:' marker): {preview!r}")
            continue

        question = question_part.strip()
        sql_query = query_part.strip()
        chunks.append({
            "_id": str(i),
            "chunk_text": f"Question: {question}\nSQL query: {sql_query}"
        })

    return chunks

# Usage:
# Location of the SQL-examples document. Set the SQL_EXAMPLES_DOCX environment
# variable (e.g. in .env) to point at your own copy; when it is not set, the
# original hard-coded path is used.
docx_path = os.getenv(
    "SQL_EXAMPLES_DOCX",
    "F:\\Panaversity\\langchain-academy\\module-2\\Querygpt\\SQL_Query_Example_university.docx",
)
doc_text = extract_text_from_docx(docx_path)
chunked_queries = chunk_sql_queries_from_docx(doc_text)

# Example preview
print("Chunked Queries:")
print(chunked_queries)

Chunked Queries:
[{'_id': '2', 'chunk_text': "Question: Get all students' full names and emails.\nSQL query: SELECT first_name, last_name, email FROM students;"}, {'_id': '3', 'chunk_text': 'Question: List all course names and credit values.\nSQL query: SELECT course_name, credits FROM courses;'}, {'_id': '4', 'chunk_text': "Question: Get names of all professors in the Computer Science department.\nSQL query: SELECT first_name, last_name FROM professors WHERE department_id = (SELECT department_id FROM departments WHERE department_name = 'Computer Science');"}, {'_id': '5', 'chunk_text': 'Question: Show all classroom room numbers and capacities.\nSQL query: SELECT room_number, capacity FROM classrooms;'}, {'_id': '6', 'chunk_text': 'Question: List all departments.\nSQL query: SELECT department_name FROM departments;'}, {'_id': '7', 'chunk_text': 'Question: Get all students who enrolled after 2020.\nSQL query: SELECT first_name, last_name FROM students WHERE enrollment_year > 2020;'}, {'

In [None]:
## Storing chunked queries in Pinecone index
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(host=index_host)

# Namespace that holds the question / SQL example records.
query_example_namespace = "query-example-university"
index.upsert_records(query_example_namespace, chunked_queries)

In [None]:
# Look up the 7 stored question/SQL examples closest to the input text.
# NOTE(review): "QUESION" looks like a placeholder — replace it with the
# actual user question before relying on the results.
example_query = {
    "inputs": {"text":  "QUESION"},
    "top_k": 7,
}
query_fetch = index.search(
    namespace="query-example-university",
    query=example_query,
)