## Extract From Raw Files

In [None]:
import os
import re
import json
from docx import Document  # pip install python-docx

# ==========================================
# 1. HELPER FUNCTIONS
# ==========================================

def clean_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def save_markdown(base_dir, folder, filename, lines):
    """Write *lines* to ``<base_dir>/<folder>/<filename>.md``.

    The target folder is created when it does not exist yet, the entries of
    *lines* are separated by one blank line, and the saved path is printed.
    """
    target_dir = os.path.join(base_dir, folder)
    os.makedirs(target_dir, exist_ok=True)

    target_file = os.path.join(target_dir, f"{filename}.md")
    with open(target_file, "w", encoding="utf-8") as handle:
        # A blank line between entries keeps each one a separate Markdown block.
        handle.write("\n\n".join(lines))

    print(f"Saved -> {target_file}")

def get_part_folder(chapter_title):
    """Return the Part folder name for a chapter heading.

    The chapter number is read from the digits at the very start of the
    stripped title. Titles without a leading number map to "Front_Matter".
    """
    leading_number = re.match(r'^(\d+)', chapter_title.strip())
    if leading_number is None:
        return "Front_Matter"

    number = int(leading_number.group(1))

    # (first chapter, last chapter, folder) for the bounded parts of the book.
    part_ranges = (
        (1, 3, "Part_I_Business_of_Trading"),
        (4, 6, "Part_II_Foundations"),
        (7, 8, "Part_III_Methodology"),
        (9, 13, "Part_IV_Indicators"),
    )
    for first, last, part_folder in part_ranges:
        if first <= number <= last:
            return part_folder

    # Chapter 14 and above belong to the last part; only chapter 0 is "Other".
    return "Part_V_Risk_Management" if number >= 14 else "Other"

# ==========================================
# 2. GLOSSARY EXTRACTION (THE FIX)
# ==========================================

def is_valid_glossary_entry(term, definition):
    """
    Decide whether a (term, definition) pair looks like a real glossary entry.

    Rejected: table header labels, "financial year" / "entry date" rows,
    anything containing a dollar sign, "risk" rows whose definition contains
    a digit, terms shorter than 2 characters and definitions shorter than 10.
    """
    lowered = term.lower()

    # Header and label rows.
    if lowered in ("term", "word", "phrase"):
        return False
    if any(label in lowered for label in ("financial year", "entry date")):
        return False

    # Money / calculation rows leaking in from the "Risk" tables.
    if "$" in definition or "$" in term:
        return False
    if "risk" in lowered and any(ch.isdigit() for ch in definition):
        return False

    # What is left is valid only if both sides are long enough to be real text.
    return len(term) >= 2 and len(definition) >= 10

def extract_glossary_universal(docx_path, output_json):
    """Extract glossary term/definition pairs from a .docx file into a JSON file.

    Two passes are made over the document and their results are merged into
    one dict (a paragraph hit overwrites a table hit for the same term):

    1. Tables: every two-column table is scanned row by row; its rows are
       kept only when more than 3 of them pass ``is_valid_glossary_entry``.
    2. Paragraphs: after a heading/title paragraph containing "Glossary",
       each non-empty paragraph is split into a term and a definition, until
       a heading containing "Index" is reached. This pass always runs, so it
       supplements whatever the tables produced.

    Args:
        docx_path: Path of the .docx file handed to ``Document``.
        output_json: Path of the JSON file to write. Nothing is written when
            no term was found.
    """
    doc = Document(docx_path)
    print(f"--- Scanning {docx_path} for Glossary ---")

    glossary_data = {}
    found_header = False

    # --- STRATEGY 1: CHECK TABLES (WITH FILTER) ---
    print("Checking Tables with Smart Filters...")
    for table in doc.tables:
        if len(table.columns) != 2:
            continue

        # Collect this table's candidates separately so the whole table can
        # be discarded when it does not look like a glossary.
        temp_data = {}
        for row in table.rows:
            try:
                cells = row.cells
                term = clean_text(cells[0].text)
                defn = clean_text(cells[1].text)
            except IndexError:
                # Row with fewer than two cells: nothing to read.
                continue

            if is_valid_glossary_entry(term, defn):
                temp_data[term] = defn

        # A "Risk" table might have 1 accidental match, but a glossary has
        # many: only keep tables with more than 3 valid terms.
        if len(temp_data) > 3:
            glossary_data.update(temp_data)

    if glossary_data:
        print(f"Found {len(glossary_data)} terms in Tables.")

    # --- STRATEGY 2: CHECK PARAGRAPHS (The 'Gap' Method) ---
    print("Checking Paragraphs...")

    for para in doc.paragraphs:
        raw_text = para.text
        text = raw_text.strip()
        style = para.style.name

        # 1. Find the header that opens the glossary section.
        if "Glossary" in text and ("Heading" in style or "Title" in style):
            found_header = True
            print("Found 'Glossary' Header in text.")
            continue

        # Stop if we hit the next major section (like Index).
        if found_header and "Heading" in style and "Index" in text:
            break

        if not (found_header and text):
            continue

        definition = ""

        # OPTION A: BOLD DETECTION - the term is the leading run of bold text.
        raw_bold = ""
        for run in para.runs:
            if run.bold:
                raw_bold += run.text
            else:
                break
        bold_term = clean_text(raw_bold)

        if not bold_term:
            # OPTION B: REGEX SPLIT - term and definition separated by a tab
            # or by two or more whitespace characters.
            parts = re.split(r'\t|\s{2,}', text, maxsplit=1)
            if len(parts) == 2:
                bold_term = clean_text(parts[0])
                definition = clean_text(parts[1])
        elif raw_text.startswith(raw_bold):
            # Slice by the length of the *raw* bold prefix. The cleaned term
            # can be shorter than the raw one (collapsed whitespace), and
            # slicing by its length would leave the tail of the term at the
            # start of the definition.
            definition = raw_text[len(raw_bold):].strip()
        elif len(bold_term) < len(text):
            # Paragraph text does not start with the bold runs (unexpected
            # layout): fall back to slicing by the cleaned term length.
            definition = text[len(bold_term):].strip()

        # Drop separator characters left between the term and its definition.
        definition = re.sub(r'^[:\-\s]+', '', definition)

        # FINAL VALIDATION BEFORE SAVING
        if bold_term and definition and is_valid_glossary_entry(bold_term, definition):
            glossary_data[bold_term] = definition

    # --- SAVE ---
    if not glossary_data:
        print("WARNING: Could not find terms.")
    else:
        with open(output_json, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII characters readable in the
            # file; the parsed content is the same either way.
            json.dump(glossary_data, f, indent=4, ensure_ascii=False)
        print(f"SUCCESS: Saved {len(glossary_data)} terms to {output_json}")

# ==========================================
# 3. MARKDOWN EXTRACTION (EXISTING LOGIC)
# ==========================================

def extract_to_markdown_structure(docx_path):
    """Split a .docx file into one Markdown file per "Heading 1" section.

    Paragraphs before the first "Heading 1" are collected under
    Front_Matter/Front_Matter.md. Each "Heading 1" flushes the section
    collected so far and starts a new file whose folder comes from
    ``get_part_folder`` and whose name is derived from the heading text.
    "Heading 2" / "Heading 3" paragraphs become ``##`` / ``###`` lines.
    All files are written below the "RAG_Knowledge_Base" directory.
    """
    doc = Document(docx_path)
    print(f"--- Processing {docx_path} for RAG ---")

    base_dir = "RAG_Knowledge_Base"
    os.makedirs(base_dir, exist_ok=True)

    # State of the section currently being collected.
    section_lines = []
    section_file = "Front_Matter"
    section_folder = "Front_Matter"

    for para in doc.paragraphs:
        text = clean_text(para.text)
        style = para.style.name
        if not text:
            continue

        if style.startswith("Heading 1"):
            # Flush the previous section before starting the new one.
            if section_lines:
                save_markdown(base_dir, section_folder, section_file, section_lines)

            section_folder = get_part_folder(text)

            # File name: heading without punctuation, spaces as underscores;
            # a purely numeric heading gets a "Chapter_" prefix.
            slug = re.sub(r'[^\w\s-]', '', text).strip().replace(' ', '_')
            section_file = f"Chapter_{slug}" if slug.isdigit() else slug

            section_lines = [f"# {text}"]
            print(f"Started: {section_file} ({section_folder})")
            continue

        if style.startswith("Heading 2"):
            section_lines.append(f"\n## {text}")
        elif style.startswith("Heading 3"):
            section_lines.append(f"\n### {text}")
        else:
            section_lines.append(text)

    # Flush whatever is left after the last paragraph.
    if section_lines:
        save_markdown(base_dir, section_folder, section_file, section_lines)

# ==========================================
# 4. EXECUTION
# ==========================================
if __name__ == "__main__":
    # Source Word document; both extraction passes below read this same file.
    file_path = "Trading_Book.docx" 
    
    # 1. Extract Chapters
    # Writes one Markdown file per "Heading 1" section under RAG_Knowledge_Base/.
    extract_to_markdown_structure(file_path)
    
    # 2. Extract Glossary
    # Writes the term -> definition mapping to glossary_tags.json
    # (the file is only written when at least one term was found).
    extract_glossary_universal(file_path, "glossary_tags.json")

## Data Chunking

In [None]:
import os
import json
import glob
import re

def natural_sort_key(s):
    """Build a sort key that orders Chapter_2 before Chapter_10.

    The string is split into digit and non-digit pieces; digit pieces are
    compared as integers, all other pieces as lower-cased text.
    """
    key = []
    for piece in re.split('([0-9]+)', s):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece.lower())
    return key

def normalize_text(text):
    """
    Replace typographic punctuation with plain ASCII and collapse whitespace.

    Curly quotes become straight quotes, an en dash becomes "-", an em dash
    becomes " - ", and an ellipsis becomes "...". The ellipsis is handled
    both as the real character (U+2026) and as the mis-decoded three-character
    sequence that the original table listed. Afterwards every run of
    whitespace (including newlines) is reduced to one space and the result
    is stripped.
    """
    # 1. Replace Unicode artifacts with standard chars
    replacements = {
        "\u2019": "'",    # Right single quote -> '
        "\u2018": "'",    # Left single quote -> '
        "\u2013": "-",    # En dash -> -
        "\u2014": " - ",  # Em dash -> space hyphen space (better for reading)
        "\u201c": '"',    # Left double quote -> "
        "\u201d": '"',    # Right double quote -> "
        "\u2026": "...",  # Ellipsis (the real character)
        "â€¦": "...",     # Ellipsis (mojibake form, kept for already-garbled text)
    }

    for old, new in replacements.items():
        text = text.replace(old, new)

    # 2. Collapse whitespace (removes newlines inside paragraphs)
    return re.sub(r'\s+', ' ', text).strip()

def parse_markdown_clean(md_folder_path, output_json_path):
    """Turn every Markdown file below *md_folder_path* into chunk records.

    Files are visited in natural order of their base names. Each file is
    split on blank lines; heading blocks (``#``, ``##``, ``###``) only update
    the heading trail, every other block becomes one record with the keys
    "id", "chapter", "path" and "text". All records are written as one JSON
    list to *output_json_path*.
    """
    # 1. Get and sort files
    pattern = os.path.join(md_folder_path, "**", "*.md")
    md_files = sorted(
        glob.glob(pattern, recursive=True),
        key=lambda path: natural_sort_key(os.path.basename(path)),
    )

    print(f"Processing {len(md_files)} files...")

    all_chunks = []
    for file_path in md_files:
        filename = os.path.basename(file_path)

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Heuristic: "Chapter_1.md" -> chapter "1"; any other name -> "0".
        chapter_match = re.search(r'Chapter_(\d+)', filename)
        current_chapter = chapter_match.group(1) if chapter_match else "0"

        # Current heading trail: index 0 = h1, 1 = h2, 2 = h3.
        headings = ["", "", ""]

        for raw_block in content.split('\n\n'):
            block = raw_block.strip()
            if not block:
                continue

            # --- HEADER UPDATES ---
            # The three prefixes cannot match the same block, so the first
            # matching level is the only one.
            level = next(
                (n for n in (1, 2, 3) if block.startswith('#' * n + ' ')),
                0,
            )
            if level:
                headings[level - 1] = block[level + 1:].strip()
                # A new heading invalidates every deeper level.
                for deeper in range(level, 3):
                    headings[deeper] = ""
                continue

            # --- CONTEXT PATH ---
            # Result: "Trading is a business > Develop a SMART plan"
            trail = " > ".join(h for h in headings if h) or "Intro"

            # --- COMPACT JSON OBJECT ---
            # The id is the running count of records emitted so far.
            all_chunks.append({
                "id": len(all_chunks),
                "chapter": current_chapter,
                "path": trail,
                "text": normalize_text(block),
            })

    # --- SAVE WITH UTF-8 VISIBILITY ---
    with open(output_json_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps characters readable instead of "\uXXXX"
        json.dump(all_chunks, f, indent=2, ensure_ascii=False)

    print(f"Success! Saved {len(all_chunks)} clean chunks to {output_json_path}")

# --- EXECUTION ---
if __name__ == "__main__":
    # Turn every .md file under RAG_Knowledge_Base/ into chunk records and
    # write them to knowledge_base_clean.json.
    parse_markdown_clean("RAG_Knowledge_Base", "knowledge_base_clean.json")

## Assign tags (Backed with AI) from glossary tags

In [None]:
import os
import json
import time
from dotenv import load_dotenv
from groq import Groq

# 1. Setup: read .env, then build the Groq client from GROQ_API_KEY
load_dotenv()

client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

INPUT_FILE = "knowledge_base_clean.json"
TAGS_FILE = "glossary_tags.json"
OUTPUT_FILE = "knowledge_base_tagged.json"

# 2. Load the chunks to tag and the glossary
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    chunks = json.load(f)

with open(TAGS_FILE, 'r', encoding='utf-8') as f:
    glossary = json.load(f)

# Only the terms (keys) go into the prompt; the definitions are not needed
# for the model to pick from the list.
glossary_terms = [*glossary.keys()]
glossary_string = ", ".join(glossary_terms)

print(f"Loaded {len(chunks)} chunks and {len(glossary_terms)} glossary terms.")

# 3. Resume capability: start empty, then adopt earlier progress if the
#    output file exists and parses.
tagged_chunks = []
processed_ids = set()

if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        try:
            tagged_chunks = json.load(f)
        except json.JSONDecodeError:
            # Unreadable progress file: keep the empty defaults set above.
            pass
        else:
            # IDs that are already done and must be skipped by the main loop
            processed_ids = {item['id'] for item in tagged_chunks}
            print(f"Resuming... {len(processed_ids)} chunks already tagged.")

# 4. Helper Function to Save
def save_progress():
    """Rewrite OUTPUT_FILE with every tagged chunk collected so far.

    The JSON is first written to a sibling ".tmp" file and then moved over
    OUTPUT_FILE with os.replace. An interruption in the middle of a write
    therefore cannot leave a truncated OUTPUT_FILE, which the resume step
    would treat as corrupt and discard.
    """
    temp_path = f"{OUTPUT_FILE}.tmp"
    with open(temp_path, 'w', encoding='utf-8') as f:
        json.dump(tagged_chunks, f, indent=2, ensure_ascii=False)
    os.replace(temp_path, OUTPUT_FILE)

# 5. Main Loop: ask the model for glossary tags, one chunk at a time
for chunk in chunks:
    # Skip if already processed
    if chunk['id'] in processed_ids:
        continue
    
    chunk_text = chunk.get('text', '')
    if len(chunk_text) < 50: 
        # Skip tiny chunks (likely just headers), append empty tags.
        # No API call and no save here; the next save (at the latest the one
        # after the loop) persists them.
        chunk['tags'] = []
        tagged_chunks.append(chunk)
        processed_ids.add(chunk['id'])
        continue

    # --- AI PROMPT ---
    # We ask for a Strict JSON list output
    prompt = f"""
    You are a data labeling assistant.
    
    TASK:
    Analyze the text below and identify which terms from the provided 'Glossary List' are highly relevant to the text.
    
    RULES:
    1. STRICTLY output only a valid JSON list of strings (e.g., ["Term A", "Term B"]).
    2. If no terms match, output an empty list [].
    3. Do NOT invent new tags. Only use terms exactly as written in the Glossary List.
    4. Do not output markdown, explanations, or code blocks. Just the list.
    
    TEXT TO ANALYZE:
    "{chunk_text[:2000]}" 
    (Context: {chunk.get('path', '')})
    
    GLOSSARY LIST:
    {glossary_string}
    """

    try:
        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful JSON-only assistant."},
                {"role": "user", "content": prompt}
            ],
            model="openai/gpt-oss-120b", # Using the model you requested
            temperature=0.0, # Keep it deterministic
        )

        response_content = completion.choices[0].message.content.strip()
        
        # --- PARSING THE RESPONSE ---
        # Sometimes models add ```json ... ``` despite instructions. Clean it.
        if "```" in response_content:
            response_content = response_content.replace("```json", "").replace("```", "").strip()
        
        # Parse text to List
        try:
            suggested_tags = json.loads(response_content)
        except ValueError:
            # json.JSONDecodeError is a ValueError. The model returned messy
            # text (e.g. "The tags are ['A', 'B']"); assume no tags to stay safe.
            print(f"Warning: Could not parse AI response for Chunk {chunk['id']}")
            suggested_tags = []

        # Valid JSON is not necessarily a list of strings (the model may
        # answer with an object or a bare string); never store that as tags.
        if isinstance(suggested_tags, list):
            suggested_tags = [tag for tag in suggested_tags if isinstance(tag, str)]
        else:
            print(f"Warning: AI response for Chunk {chunk['id']} was not a list")
            suggested_tags = []

        # Update chunk
        chunk['tags'] = suggested_tags
        
        # Add to our "Done" list
        tagged_chunks.append(chunk)
        processed_ids.add(chunk['id'])
        
        print(f"Chunk {chunk['id']} tagged: {suggested_tags}")

        # --- SAVE IMMEDIATELY ---
        save_progress()

        # Sleep briefly to avoid hitting rate limits too hard
        time.sleep(0.5)

    except Exception as e:
        print(f"Error processing chunk {chunk['id']}: {e}")
        # Continuing is safer so one error doesn't kill the batch; the chunk
        # stays out of processed_ids and is retried on the next run.
        continue

# Tiny chunks appended after the last API-tagged chunk have not been written
# yet, so save once more before reporting.
save_progress()

untagged_count = sum(1 for chunk in chunks if chunk['id'] not in processed_ids)
if untagged_count:
    print(f"\nFinished with {untagged_count} chunk(s) still untagged because of errors. "
          f"Progress saved to {OUTPUT_FILE}; run again to retry them.")
else:
    print(f"\nDone! All chunks processed and saved to {OUTPUT_FILE}")

## Resume from previous run (token limit exceeded)

In [1]:
import os
import json
import time
from dotenv import load_dotenv
from groq import Groq

# 1. Load Environment
load_dotenv()

# --- SWITCHING TO SECOND KEY ---
# The second key is mandatory for this cell: fail fast when it is missing.
second_key = os.environ.get("SECONDGROQ_API")
if not second_key:
    raise ValueError("Error: 'SECONDGROQ_API' not found in .env file.")

print(f"Switched to SECONDGROQ_API (Key ends in ...{second_key[-4:]})")

client = Groq(api_key=second_key)

# 2. Define Files (Must match previous cell)
INPUT_FILE = "knowledge_base_clean.json"
TAGS_FILE = "glossary_tags.json"
OUTPUT_FILE = "knowledge_base_tagged.json"

# 3. Load Data & Glossary
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    all_chunks = json.load(f)

with open(TAGS_FILE, 'r', encoding='utf-8') as f:
    glossary = json.load(f)

# Only the glossary terms (keys) are sent to the model.
glossary_string = ", ".join(glossary.keys())

# 4. RESUME LOGIC: start empty, then adopt what has been saved so far
tagged_chunks = []
processed_ids = set()

if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        try:
            tagged_chunks = json.load(f)
        except json.JSONDecodeError:
            # Keep the empty defaults set above.
            print("Output file corrupt or empty. Starting over.")
        else:
            # IDs that are ALREADY DONE and will be skipped by the main loop
            processed_ids = {item['id'] for item in tagged_chunks}
            print(f"Resuming... found {len(processed_ids)} chunks already saved.")

# 5. Helper to Save
def save_progress():
    """Rewrite OUTPUT_FILE with the FULL list of tagged chunks (old + new).

    The JSON is first written to a sibling ".tmp" file and then moved over
    OUTPUT_FILE with os.replace. An interruption in the middle of a write
    therefore cannot leave a truncated OUTPUT_FILE, which the resume logic
    would treat as corrupt and start over from.
    """
    temp_path = f"{OUTPUT_FILE}.tmp"
    with open(temp_path, 'w', encoding='utf-8') as f:
        json.dump(tagged_chunks, f, indent=2, ensure_ascii=False)
    os.replace(temp_path, OUTPUT_FILE)

# 6. Main Loop (continues with every chunk whose id is not in processed_ids)
print("Starting processing remaining chunks...")

for chunk in all_chunks:
    # SKIP if we already did this one
    if chunk['id'] in processed_ids:
        continue

    # Skip tiny chunks: no API call and no save here; the next save (at the
    # latest the one after the loop) persists them.
    chunk_text = chunk.get('text', '')
    if len(chunk_text) < 50:
        chunk['tags'] = []
        tagged_chunks.append(chunk)
        processed_ids.add(chunk['id'])
        continue

    # --- AI PROMPT ---
    prompt = f"""
    You are a data labeling assistant.
    
    TASK:
    Analyze the text below and identify which terms from the provided 'Glossary List' are highly relevant.
    
    RULES:
    1. STRICTLY output only a valid JSON list of strings (e.g., ["Term A", "Term B"]).
    2. If no terms match, output an empty list [].
    3. Do NOT invent new tags. Only use terms from the Glossary List.
    
    TEXT:
    "{chunk_text[:2000]}"
    
    GLOSSARY LIST:
    {glossary_string}
    """

    try:
        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful JSON-only assistant."},
                {"role": "user", "content": prompt}
            ],
            model="openai/gpt-oss-120b",
            temperature=0.0,
        )

        response_content = completion.choices[0].message.content.strip()
        
        # Clean markdown wrappers if present
        if "```" in response_content:
            response_content = response_content.replace("```json", "").replace("```", "").strip()

        # Parse
        try:
            suggested_tags = json.loads(response_content)
        except ValueError:
            # json.JSONDecodeError is a ValueError; assume no tags when the
            # reply is not valid JSON.
            print(f"Warning: Could not parse AI response for Chunk {chunk['id']}")
            suggested_tags = []

        # Valid JSON is not necessarily a list of strings (the model may
        # answer with an object or a bare string); never store that as tags.
        if isinstance(suggested_tags, list):
            suggested_tags = [tag for tag in suggested_tags if isinstance(tag, str)]
        else:
            print(f"Warning: AI response for Chunk {chunk['id']} was not a list")
            suggested_tags = []

        # Update & Append
        chunk['tags'] = suggested_tags
        tagged_chunks.append(chunk)
        processed_ids.add(chunk['id'])
        
        print(f"Chunk {chunk['id']} tagged: {suggested_tags}")

        # Save every time
        save_progress()
        
        # Sleep slightly to be nice to the new key
        time.sleep(0.5)

    except Exception as e:
        print(f"Error on chunk {chunk['id']}: {e}")
        # Stop loop if we hit another rate limit so you can fix it
        if "429" in str(e):
            print("Hit rate limit on Second Key! Stopping safely.")
            break
        continue

# Tiny chunks appended after the last API-tagged chunk have not been written
# yet, so save once more before reporting (also after a rate-limit stop).
save_progress()

print(f"\nStopped. Total processed: {len(tagged_chunks)} / {len(all_chunks)}")

Switched to SECONDGROQ_API (Key ends in ...pDXd)
Resuming... found 580 chunks already saved.
Starting processing remaining chunks...
Chunk 580 tagged: ['moving average', 'MACD histogram', 'line chart']
Chunk 581 tagged: ['moving average', 'indicator']
Chunk 582 tagged: ['moving average', 'short-term trading', 'technical analysis']
Chunk 583 tagged: ['moving average', 'technical analysis', 'indicator', 'breakout']
Chunk 584 tagged: ['technical analysis', 'trendline', 'downtrend line']
Chunk 585 tagged: ['moving average', 'MACD histogram', 'indicator', 'technical analysis']
Chunk 587 tagged: ['MACD histogram', 'divergence', 'trend', 'indicator']
Chunk 589 tagged: ['MACD histogram', 'indicator']
Chunk 590 tagged: ['MACD histogram', 'indicator']
Chunk 591 tagged: ['MACD histogram', 'indicator']
Chunk 592 tagged: ['MACD histogram']
Chunk 593 tagged: ['MACD histogram', 'indicator']
Chunk 594 tagged: ['MACD histogram', 'spread']
Chunk 595 tagged: ['MACD histogram', 'shares']
Chunk 598 tagged: