<h1>PDF LOADING

In [1]:
import os
import re
from langchain.document_loaders import PyPDFLoader
from typing import List
from langchain.schema import Document

def get_pdf_paths(root_dir: str) -> List[str]:
    """Get all PDF paths recursively from a directory"""
    pdf_paths = []
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.lower().endswith(".pdf"):
                full_path = os.path.join(dirpath, filename)
                pdf_paths.append(full_path)
    return pdf_paths

# Define your directories (replace with actual paths)
dir_a = "/home/rojan/Music/SS/A"
dir_b_best = "/home/rojan/Music/SS/B/Best"
dir_b_road = "/home/rojan/Music/SS/B/Road"

# Collect all PDF paths from all directories
all_pdf_paths = []
all_pdf_paths += get_pdf_paths(dir_a)
all_pdf_paths += get_pdf_paths(dir_b_best)
all_pdf_paths += get_pdf_paths(dir_b_road)

# Load all PDF documents with enhanced processing
documents: List[Document] = []

for pdf_path in all_pdf_paths:
    try:
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()
        
        # Initialize section tracking for this document
        current_section = "Introduction"
        source_file = os.path.basename(pdf_path)
        
        for doc in docs:
            # Clean PDF artifacts and normalize checkmarks
            cleaned_content = re.sub(r'|', '[CHECK]', doc.page_content)
            cleaned_content = re.sub(r'\x0c', '', cleaned_content)  # Remove form feeds
            
            # Detect section headers (modify pattern as needed)
            section_match = re.search(r'^\n([A-Z][A-Za-z ]+)\n', cleaned_content)
            if section_match:
                current_section = section_match.group(1).strip()
            
            # Enhanced metadata
            doc.metadata.update({
                "source_folder": os.path.dirname(pdf_path),
                "source_file": source_file,
                "page_number": doc.metadata["page"] + 1,  # Convert to 1-based numbering
                "section": current_section,
                "content_length": len(cleaned_content),
                "content_type": "checklist" if "[CHECK]" in cleaned_content else "explanatory",
                "document_type": "Best" if "Best" in pdf_path else "Road" if "Road" in pdf_path else "A"
            })
            
            # Update the cleaned content
            doc.page_content = cleaned_content
            
            documents.append(doc)
            
       
        
    except Exception as e:
        print(f"Error loading : {str(e)}")
        continue



Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing ob

<h1>EMBEDDINGS

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(documents: List[Document]) -> List[Document]:
    """Split documents while preserving metadata and context"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=[
            "\n\n• ",  # Checklist items
            "\n\n",    # Major sections
            "\n• ",     # Sub-items
            "\n- ",     # Alternative list format
            "\n",       # New lines
            " ",         # Words
            ""
        ]
    )
    
    split_docs = []
    for doc in documents:
        chunks = text_splitter.split_text(doc.page_content)
        for chunk in chunks:
            # Create new document with inherited metadata
            metadata = doc.metadata.copy()
            metadata.update({
                "chunk_id": f"{metadata['source_file']}-p{metadata['page_number']}-{len(split_docs)}",
                "content_length": len(chunk)
            })
            
            new_doc = Document(
                page_content=chunk,
                metadata=metadata
            )
            split_docs.append(new_doc)
    
    return split_docs

# Split the loaded documents
split_docs = split_documents(documents)
print(f"Total chunks created: {len(split_docs)}")

Total chunks created: 199


In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

def create_vector_store(documents: List[Document]):
    """Create Chroma vector store with metadata indexing"""
    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    
    return Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory="./vector_store",
        # Remove the explicit metadatas parameter
    )

# Create and persist vector store
vector_store = create_vector_store(split_docs)
print("Vector store created and persisted at ./vector_store")

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Vector store created and persisted at ./vector_store


<h1>Retrieving 

In [4]:
import os
import json

# Create prompts directory if not exists
PROMPT_DIR = "prompts"
os.makedirs(PROMPT_DIR, exist_ok=True)

# Define and save enhanced skill tree prompt
SKILL_TREE_PROMPT = """ROLE: Expert Learning Path Designer
TASK: Generate structured skill trees in VALID JSON format

STRICT FORMAT RULES:
1. Use double quotes ONLY
2. No markdown/code blocks
3. Valid JSON syntax required
4. No trailing commas
5. Include ALL brackets
6. Maintain proper indentation

EXAMPLE RESPONSE:
{{
    "skill_tree": {{
        "title": "Web Development",
        "prerequisites": ["Basic Computer Skills"],
        "learning_path": [
            {{
                "order": 1,
                "topic": "HTML Fundamentals",
                "resources": ["MDN Web Docs", "FreeCodeCamp HTML Course"],
                "milestones": ["Build basic page structure", "Create semantic markup"]
            }},
            {{
                "order": 2,
                "topic": "CSS Styling",
                "resources": ["CSS Tricks Guide", "Flexbox Froggy"],
                "milestones": ["Style responsive layouts", "Implement CSS animations"]
            }}
        ],
        "dependencies": ["JavaScript", "Browser APIs"]
    }}
}}

ANALYSIS STEPS:
1. Identify core concepts from query: {query}
2. Extract prerequisites from context: {context}
3. Create ordered learning path
4. Select relevant resources
5. Define clear milestones

CONTEXT FROM KNOWLEDGE BASE:
{context}

USER QUERY: {query}

RESPONSE (JSON ONLY):
"""

# Save to file
with open(os.path.join(PROMPT_DIR, "skill_tree_generator.txt"), "w") as f:
    f.write(SKILL_TREE_PROMPT)

print("Enhanced prompt template created in 'prompts' directory")

Enhanced prompt template created in 'prompts' directory


In [None]:
import pandas as pd
import json
import time
from bert_score import score
from tqdm import tqdm
import google.generativeai as genai
from typing import List, Dict

class SkillTreeGenerator:
    def __init__(self, vector_store, api_key: str):
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
        self.retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    def _load_prompt(self, name: str) -> str:
        with open(f"prompts/{name}.txt") as f:
            return f.read()
    
    def generate_path(self, query: str) -> Dict:
        start_time = time.time()
        response = None  # Initialize response variable
        try:
            docs = self.retriever.invoke(query)
            context = "\n".join([d.page_content for d in docs])
            prompt = self._load_prompt("skill_tree_generator").format(
                context=context,
                query=query
            )
            response = self.model.generate_content(prompt)
            json_str = response.text.replace('```json', '').replace('```', '')
            return {
                "response": json.loads(json_str),
                "latency": time.time() - start_time
            }
        except json.JSONDecodeError:
            return {"response": {"error": "Invalid JSON format"}, "latency": time.time() - start_time}
        except Exception as e:
            # Safely handle missing response
            raw_response = getattr(response, 'text', 'No response generated')
            print(f"\nError processing query: {query}")
            print(f"Raw response: {raw_response}")
            print(f"Error details: {str(e)}")
            return {"response": {"error": f"API Error: {str(e)}"}, "latency": time.time() - start_time}

def extract_content(json_str: str) -> str:
    """Extract semantic content from JSON responses"""
    try:
        data = json.loads(json_str)
        if "error" in data:
            return data["error"]
        elif "skill_tree" in data:
            skill_tree = data["skill_tree"]
            content = [
                skill_tree.get("title", ""),
                " ".join(skill_tree.get("prerequisites", [])),
                " ".join([topic.get("topic", "") for topic in skill_tree.get("learning_path", [])])
            ]
            return " ".join(content).strip()
        return json_str
    except:
        return json_str

def main(vector_store):
    # Read groundtruth data with proper JSON formatting
    df = pd.read_csv('gt.csv').reset_index(drop=True)
    
    # Initialize model
    processor = SkillTreeGenerator(vector_store, "AIzaSyCRZbC56Mhvw-xv0M3YDG0fbv-Wq3zLBkc")
    
    # Generate responses with latency tracking
    print("Generating model responses...")
    results = []
    for question in tqdm(df['Questions'], desc="Processing queries"):
        result = processor.generate_path(question)
        results.append({
            "Model Response": json.dumps(result["response"]),
            "Latency": result["latency"]
        })
    
    # Merge results with alignment fix
    results_df = pd.DataFrame(results).reset_index(drop=True)
    df = pd.concat([df, results_df], axis=1)
    
    # Preprocess for BERTScore
    df['Processed GT'] = df['Answers'].apply(extract_content)
    df['Processed Response'] = df['Model Response'].apply(extract_content)
    df['is_error'] = df['Model Response'].str.contains('"error":')
    
    # Sanity check input samples
    print("\n=== Input Validation ===")
    for i in range(3):
        print(f"\nSample {i+1}:")
        print(f"Question: {df['Questions'].iloc[i][:100]}...")
        print(f"GT: {df['Processed GT'].iloc[i][:200]}...")
        print(f"Response: {df['Processed Response'].iloc[i][:200]}...")
    
    # Calculate metrics
    print("\nCalculating metrics...")
    valid_mask = ~df['is_error']
    
    # BERT Scores for valid responses only
    if valid_mask.any():
        references = df.loc[valid_mask, 'Processed GT'].tolist()
        candidates = df.loc[valid_mask, 'Processed Response'].tolist()
        P, R, F1 = score(candidates, references, lang='en', verbose=True)
    else:
        P, R, F1 = [pd.Series([0.0])] * 3
    
    # Print metrics
    print("\n=== Aggregate Metrics ===")
    print(f"BERT Precision: {P.mean().item():.3f}" if valid_mask.any() else "BERT Precision: N/A (all errors)")
    print(f"BERT Recall: {R.mean().item():.3f}" if valid_mask.any() else "BERT Recall: N/A (all errors)")
    print(f"BERT F1: {F1.mean().item():.3f}" if valid_mask.any() else "BERT F1: N/A (all errors)")
    print(f"Average Latency: {df['Latency'].mean():.2f}s")
    print(f"Total Queries: {len(df)}")
    print(f"Success Rate: {(valid_mask.mean() * 100):.1f}%")
    
    # Print sample responses
    print("\n=== Sample Responses ===")
    for i in range(2):
        print(f"\nQuestion {i+1}: {df['Questions'].iloc[i]}")
        print(f"Latency: {df['Latency'].iloc[i]:.2f}s")
        if valid_mask.any() and i < len(P):
            print(f"BERT Precision: {P[i].item():.3f}")
            print(f"BERT Recall: {R[i].item():.3f}")
            print(f"BERT F1: {F1[i].item():.3f}")
        print("Ground Truth:", json.loads(df['Answers'].iloc[i]))
        print("Model Response:", json.loads(df['Model Response'].iloc[i]))
        print("-" * 80)

if __name__ == "__main__":
    main(vector_store)

Generating model responses...


Processing queries: 100%|██████████| 7/7 [00:47<00:00,  6.81s/it]



=== Input Validation ===

Sample 1:
Question: How to learn PostgreSQL from basics? ...
GT: PostgreSQL Fundamentals Basic Understanding of Databases Familiarity with SQL Fundamentals Data Definition Language (DDL) Data Manipulation Language (DML) Data Control Language (DCL) Advanced Features...
Response: PostgreSQL Learning Path Basic Computer Skills Familiarity with command line (optional) Introduction to Relational Databases and PostgreSQL Basic PostgreSQL Setup and Configuration SQL Fundamentals Ad...

Sample 2:
Question: How to learn Spring Boot from basics? ...
GT: Java and Spring Fundamentals Basic Programming Concepts Understanding of Object-Oriented Programming (OOP) Java Fundamentals Spring Framework Basics Introduction to Spring Boot Web Development Data Ac...
Response: Spring Boot Learning Path Basic Java Programming Understanding of Web Concepts Java Fundamentals Spring Core Spring Boot Basics Spring MVC Spring Data JPA Spring Security Spring Boot Actuator Spring C...

Samp

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 24.06it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 459.90it/s]

done in 0.05 seconds, 150.57 sentences/sec

=== Aggregate Metrics ===
BERT Precision: 0.865
BERT Recall: 0.890
BERT F1: 0.877
Average Latency: 6.81s
Total Queries: 7
Success Rate: 100.0%

=== Sample Responses ===

Question 1: How to learn PostgreSQL from basics? 
Latency: 6.85s
BERT Precision: 0.854
BERT Recall: 0.857
BERT F1: 0.856
Ground Truth: {'skill_tree': {'title': 'PostgreSQL Fundamentals', 'prerequisites': ['Basic Understanding of Databases', 'Familiarity with SQL'], 'learning_path': [{'order': 1, 'topic': 'Fundamentals', 'resources': ['PostgreSQL Official Documentation', 'SQL Basics Tutorial'], 'milestones': ['Understand relational database concepts', 'Learn SQL basics', 'Explore PostgreSQL features', 'Install and configure PostgreSQL']}, {'order': 2, 'topic': 'Data Definition Language (DDL)', 'resources': ['PostgreSQL DDL Documentation', 'Table Creation Guide'], 'milestones': ['Create and manage tables', 'Work with indexes', 'Create and use views', 'Understand and manage sche




In [14]:
for model in genai.list_models():
    print(model.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thinking-exp-1219
models/learnlm-1.5-pro-experimental
models/embedding-001
models/text-embedding-004
models/aqa
models/imagen-3.