In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_rendered_html(url, wait_selector=None, wait_time=15, driver_path=None):
    """Load ``url`` in headless Chrome and return the rendered page source.

    Args:
        url: Address handed to ``driver.get``.
        wait_selector: Optional CSS selector. When given, the call blocks
            until a matching element is present or ``wait_time`` seconds
            have elapsed.
        wait_time: Timeout, in seconds, for the optional wait.
        driver_path: Optional path to the chromedriver executable. When
            omitted, the location this function has always used is kept.

    Returns:
        The page source as a string, or ``""`` when navigation or the wait
        raised (the error is printed, not re-raised).
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    # Machine-specific default; pass driver_path to run on another machine.
    if driver_path is None:
        driver_path = r"c:\Users\Nachappa\chromedriver.exe"
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Pre-bind the result so a failure inside the try block yields "" instead
    # of an UnboundLocalError at the return statement.
    rendered_html = ""
    try:
        driver.get(url)
        if wait_selector:
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wait_selector))
            )
        rendered_html = driver.page_source
    except Exception as ex:
        print(ex)
    finally:
        # Always release the browser process, even on failure.
        driver.quit()
    return rendered_html

In [1]:
import markdownify
import re
from bs4 import BeautifulSoup

class EmptyExtraction(Exception):
    """Signals that a fetched page yielded too little text to be worth processing."""

def remove_ui_blocks_general(text):
    """Filter UI noise out of ``text`` line by line and return what is left.

    Rules, applied to each line after stripping and lower-casing it:

    * A line containing any junk marker is dropped and switches the filter
      into "skipping" mode.
    * While skipping, lines of 15 characters or fewer are dropped; a longer
      line ends skipping mode.
    * Outside skipping mode, a line made only of ``*``, ``+``, ``-`` and
      whitespace is dropped (without re-entering skipping mode); any other
      line is kept verbatim and skipping mode is switched back on.

    The net effect is that, after the first kept line, short lines only
    survive directly after a long symbol-only line.
    """
    # Substrings that usually indicate login forms, prompts, footers or stats
    junk_markers = (
        "apply now", "forgot password", "request reset",
        "×", "loading", "javascript",
        # UI prompts
        "subscribe", "unsubscribe", "notification", "alert", "popup",
        "scroll to", "faq", "send otp", "resend otp", "password",
        "create password", "change password", "reset password",
        "confirm password", "update your profile", "complete your profile",
        "sign in", "sign up", "logout", "otp",
        "profile is currently under moderation",
        "haven't received otp", "congratulations",
        "your password has been changed",
        "thank you for subscribing", "don't have an account",
        "character", "lowercase", "uppercase", "digit",
        "email id", "email address",
        # Numbers / stats
        "users have visited", "last updated", "toll free", "working hrs",
    )
    symbols_only = re.compile(r'^[\*\+\-\s]+$')

    kept = []
    skipping = False
    for raw_line in text.splitlines():
        probe = raw_line.strip().lower()

        if any(marker in probe for marker in junk_markers):
            skipping = True
            continue

        if skipping:
            if len(probe) <= 15:
                # Still inside a junk region: too short to count as content
                continue
            skipping = False

        if symbols_only.match(probe):
            continue

        kept.append(raw_line)
        skipping = True

    return "\n".join(kept)

def Clean_Markdown(url):
    """Fetch ``url``, convert its rendered HTML to Markdown and strip UI noise.

    Steps: render the page, reject near-empty pages, convert to Markdown,
    then remove links/images, the footer, known UI blocks, header/junk
    lines, numeric/symbol-only lines and blank lines.

    Side effect: calls ``display`` with the set of Markdown lines that did
    not survive cleaning, as a visual diff.

    Args:
        url: Page address passed to ``fetch_rendered_html``.

    Returns:
        The cleaned Markdown string, or ``""`` when fetching fails, the page
        has fewer than 50 characters of text, or the conversion fails.
    """
    try:
        rendered_html = fetch_rendered_html(url)
    except Exception as e:
        print(f"Error fetching HTML from {url}: {e}")
        return ""

    # A failed fetch can hand back nothing; treat it like an empty page
    # rather than passing None/"" on to the parser.
    if not rendered_html:
        print(f"Extraction failed for {url}. HTML content is junk/empty.")
        return ""

    try:
        soup = BeautifulSoup(rendered_html, "html.parser")
        text_len = len(soup.get_text(strip=True))
        if text_len < 50:  # minimum amount of visible text considered useful
            raise EmptyExtraction(f"Extraction failed for {url}. HTML content is junk/empty.")
    except EmptyExtraction as e:
        print(e)
        return ""

    try:
        markdown_string = markdownify.markdownify(rendered_html, heading_style='ATX')
    except Exception as e:
        print(f"Failed to convert HTML to Markdown : {e}")
        # Without Markdown there is nothing to clean; fail like the other steps
        return ""

    # 1. Remove both markdown images (![...](...)) and links ([...](...))
    #
    # Removes
    # ![Seed Fund Need](/static/media/seedfund-need.png)
    #
    # [](/static/media/Launch.5a3470a4.mp4)
    # [More Details](/about)
    link_pattern = r"""
        (?:!?\[[^\]]*\]\([^)]*\))   # normal markdown image/link: [text](url) or ![alt](url)
      | (?:\]\([^)]*\))             # broken shorthand: ](url)
    """
    removed_link = re.sub(link_pattern, "", markdown_string, flags=re.VERBOSE)

    # 2. Remove Footer: everything from the first matching H6 heading onwards
    footer_keywords = [
        "About", "Help", "Join", "Subscribe", "Follow",
        "Terms of Use", "Privacy Policy", "Disclaimer", "Copyright"
    ]
    footer_pattern = r"\n###### (?:" + "|".join(map(re.escape, footer_keywords)) + r").*"
    removed_footer = re.sub(footer_pattern, "", removed_link, flags=re.DOTALL)

    # 3. Remove known UI junk blocks
    ui_block_patterns = [
        r"(?s)please (?:enter|change).*?submit",
        r"(?s)your password must.*?submit",
        r"(?s)notification alert.*?(yes|no)",
        r"(?s)do you really want to logout.*?(yes|no)",
    ]
    # Feed each substitution into the next one so every pattern takes effect,
    # not only the last one in the list.
    removed_ui_junk = removed_footer
    for pat in ui_block_patterns:
        removed_ui_junk = re.sub(pat, "", removed_ui_junk, flags=re.IGNORECASE)

    # 4. Removes Majority of Header, lines with only symbols (*, +, -) and junk words
    removed_header_junk = remove_ui_blocks_general(removed_ui_junk)

    # 5. Remove numeric/symbol junk lines
    symbol_noise = [
        r"^\s*©.*$",                     # any line starting with ©
        r"^\s*\d{1,3}(?:[,\d]+)*\s*$",   # pure numeric lines
        r"^\s*\d+\s*/\s*\d+\s*$",        # pagination like 3/12
    ]
    removed_numeric_junk = removed_header_junk
    for pat in symbol_noise:
        removed_numeric_junk = re.sub(pat, "", removed_numeric_junk, flags=re.MULTILINE)

    # 6. Drop blank lines for cleaner formatting.
    lines = [line for line in removed_numeric_junk.splitlines() if line.strip()]
    completely_cleaned = "\n".join(lines)

    # Visual diff: original Markdown lines that were removed by the cleaning
    display(set(markdown_string.splitlines()) - set(completely_cleaned.splitlines()))
    return completely_cleaned

In [None]:
def get_clean_markdown_with_retry(url, min_length=50, max_retries=5):
    """
    Call Clean_Markdown(url) repeatedly until the stripped result is at
    least min_length characters long, giving up after max_retries attempts.

    Returns the first sufficiently long result; otherwise the result of the
    final attempt (possibly short or empty), or "" if no attempt was made.
    """
    latest = ""
    tries_done = 0

    while tries_done < max_retries:
        latest = Clean_Markdown(url)
        if latest and len(latest.strip()) >= min_length:
            # Long enough: hand it back straight away
            return latest
        print(f"⚠️ Attempt {tries_done+1}: Content too small for {url}, retrying...")
        tries_done += 1

    print(f"❌ Failed to extract enough content from {url} after {max_retries} retries.")
    # Caller decides what to do with an undersized final result
    return latest

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
import uuid
from datetime import datetime


def process_and_store_urls(urls, collection_name="startup_india_data"):
    """Scrape ``urls``, chunk and embed their Markdown, and store it in Milvus.

    The collection named ``collection_name`` is dropped (if present) and
    recreated on every call. Each URL is cleaned via
    ``get_clean_markdown_with_retry``, split on Markdown headers, split again
    into sub-chunks of up to 500 characters, embedded, and buffered. All
    buffered rows are inserted in one batch after the loop, then the
    collection is indexed and loaded.

    Args:
        urls: Iterable of page URLs to ingest.
        collection_name: Name of the Milvus collection to (re)create.

    Returns:
        None. Progress and errors are reported with ``print``; a failure on
        one URL is logged and does not stop the remaining URLs.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
        ("######", "Header 6")
    ]

    # Built once: the splitter configuration does not depend on the URL
    header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    sentence_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", ". ", "!", "?", "* "]
    )

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # ---------------- Milvus Setup ----------------
    connections.connect("default", host="localhost", port="19530")

    # Start from a clean slate: any previous data in this collection is lost
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)

    fields = [
        FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, is_primary=True, max_length=36),
        FieldSchema(name="chunk_sequence", dtype=DataType.INT64),
        FieldSchema(name="chunk_text", dtype=DataType.VARCHAR, max_length=65535),
        # dim must match the vector size produced by the embedding model
        FieldSchema(name="chunk_embed", dtype=DataType.FLOAT_VECTOR, dim=384),
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=36),
        FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=255),
        FieldSchema(name="doc_timestamp", dtype=DataType.VARCHAR, max_length=20)
    ]
    schema = CollectionSchema(fields, "Startup India website scraped data")
    collection = Collection(collection_name, schema)

    # ---------------- Processing Loop ----------------
    all_combined_data = []
    docs_stored = 0  # URLs that actually contributed chunks

    for url in urls:
        try:
            # Step 1: Fetch and clean markdown for URL
            markdown_content = get_clean_markdown_with_retry(url, min_length=50, max_retries=5)
            if not markdown_content or len(markdown_content.strip()) < 50:
                continue  # final fallback if all retries fail i.e. skip it

            # Step 2: Split with headers
            header_docs = header_splitter.split_text(markdown_content)

            doc_id = str(uuid.uuid4())
            doc_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            chunks, metadatas = [], []

            for i, doc in enumerate(header_docs):
                sub_chunks = sentence_splitter.split_text(doc.page_content)
                # Prefix each sub-chunk with its header trail for context
                header = "\n".join([f"{k}: {v}" for k, v in doc.metadata.items()]) if doc.metadata else ""
                for sub_chunk in sub_chunks:
                    chunk_text = f"{header}\n{sub_chunk}".strip() if header else sub_chunk
                    chunks.append(chunk_text)
                    metadatas.append({
                        "chunk_id": str(uuid.uuid4()),
                        # Index of the header section this sub-chunk came from
                        "chunk_sequence": i,
                        "doc_id": doc_id,
                        "url": url,
                        "doc_timestamp": doc_timestamp
                    })

            if not chunks:
                print(f"⚠️ No chunks extracted for {url}")
                continue

            # Step 3: Generate embeddings
            chunk_embeddings = embeddings.embed_documents(chunks)

            # Step 4: Combine data
            combined_data = [
                {
                    "chunk_id": meta["chunk_id"],
                    "chunk_sequence": meta["chunk_sequence"],
                    "chunk_text": chunk,
                    "chunk_embed": embedding,
                    "doc_id": meta["doc_id"],
                    "url": meta["url"],
                    "doc_timestamp": meta["doc_timestamp"]
                }
                for chunk, embedding, meta in zip(chunks, chunk_embeddings, metadatas)
            ]

            all_combined_data.extend(combined_data)
            docs_stored += 1
            print(f"✅ Processed {url} ({len(chunks)} chunks)")

        except Exception as e:
            print(f"❌ Error processing {url}: {e}")

    # ---------------- Insert into Milvus ----------------
    if all_combined_data:
        collection.insert(all_combined_data)
        collection.create_index(
            field_name="chunk_embed",
            index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}}
        )
        collection.create_index(field_name="doc_id", index_params={"index_type": "TRIE"})
        collection.load()
        # Report only the documents that were stored, not every URL requested
        print(f"🎉 Inserted {len(all_combined_data)} chunks from {docs_stored} docs into {collection_name}")
    else:
        print("⚠️ No data to insert.")


# ---------------- Example Usage ----------------
# Pages to scrape and index. NOTE: the call below runs as soon as this cell is
# executed; process_and_store_urls drops any existing collection of the same
# name before re-ingesting.
urls = [
    "https://www.startupindia.gov.in/",
    "https://seedfund.startupindia.gov.in/",
    "https://www.sidbivcf.in/en/funds/ffs",
]

# Uses the default collection name ("startup_india_data").
process_and_store_urls(urls)


In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
import uuid
from datetime import datetime
import time
import requests

# Clean_Markdown(url) is defined in an earlier cell of this notebook and returns
# the cleaned Markdown string for the page.

def get_clean_markdown_with_retry(url, min_length=10, max_retries=5, retry_delay=2):
    """Retry Clean_Markdown(url) until it yields enough content.

    An attempt fails when Clean_Markdown raises or when its stripped result
    is shorter than ``min_length``. After either kind of failure the function
    waits ``retry_delay`` seconds before the next attempt (no wait after the
    final one).

    Args:
        url: Page address passed to Clean_Markdown.
        min_length: Minimum stripped length for the content to be accepted.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The first acceptable Markdown string, or "" if every attempt fails.
    """
    for attempt in range(max_retries):
        try:
            markdown_content = Clean_Markdown(url)
        except Exception as e:
            # Any failure counts as a failed attempt; the scraper can raise
            # more than request/value errors.
            print(f"⚠️ Attempt {attempt + 1}/{max_retries} failed for {url}: {e}")
        else:
            content_length = len(markdown_content.strip()) if markdown_content else 0
            if markdown_content and content_length >= min_length:
                return markdown_content
            print(f"⚠️ Content too short for {url} (length: {content_length}), retrying...")

        # Back off after every failed attempt, not only after exceptions
        if attempt < max_retries - 1:
            time.sleep(retry_delay)
    return ""  # Return empty string if all retries fail

def process_and_store_urls(urls, collection_name="startup_india_data"):
    """Process URLs, chunk Markdown content, generate embeddings, and store in Milvus.

    The collection named ``collection_name`` is dropped (if present) and
    recreated on every call. Each URL is cleaned via
    ``get_clean_markdown_with_retry``, split on Markdown headers, split again
    into sub-chunks of up to 1000 characters, embedded, and inserted into the
    collection straight away. Indexes are created and the collection is
    loaded once at least one chunk has been inserted.

    Args:
        urls: Iterable of page URLs to ingest.
        collection_name: Name of the Milvus collection to (re)create.

    Returns:
        None. Progress and errors are reported with ``print``. The function
        returns early if the Milvus connection fails; a failure on one URL
        is logged and does not stop the remaining URLs.
    """
    # Define splitters and embeddings
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
        ("######", "Header 6")
    ]
    # Built once: the splitter configuration does not depend on the URL
    header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    sentence_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Increased for longer paragraphs
        chunk_overlap=50,
        separators=["\n\n", "\n", ". ", "!", "?", "* "]
    )
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # Milvus setup
    try:
        connections.connect("default", host="localhost", port="19530")
        print("Connected to Milvus server.")
    except Exception as e:
        print(f"❌ Failed to connect to Milvus: {e}")
        return

    # Create or recreate collection (any previous data in it is lost)
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)

    fields = [
        FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, is_primary=True, max_length=36),
        FieldSchema(name="chunk_sequence", dtype=DataType.INT64),
        FieldSchema(name="chunk_text", dtype=DataType.VARCHAR, max_length=65535),
        # dim must match the vector size produced by the embedding model
        FieldSchema(name="chunk_embed", dtype=DataType.FLOAT_VECTOR, dim=384),
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=36),
        FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=255),
        FieldSchema(name="doc_timestamp", dtype=DataType.VARCHAR, max_length=20)
    ]
    schema = CollectionSchema(fields, "Startup India website scraped data")
    collection = Collection(collection_name, schema)

    # Process URLs
    total_chunks = 0
    urls_stored = 0  # URLs that actually contributed chunks
    for url in urls:
        try:
            # Fetch and clean markdown
            markdown_content = get_clean_markdown_with_retry(url, min_length=10, max_retries=5)
            if not markdown_content or len(markdown_content.strip()) < 10:
                print(f"⚠️ Skipping {url}: No valid content (length: {len(markdown_content.strip() if markdown_content else '')})")
                continue

            # Split with headers
            header_docs = header_splitter.split_text(markdown_content)
            doc_id = str(uuid.uuid4())  # Unique doc_id per URL
            doc_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            chunks, metadatas = [], []

            # Split content into sub-chunks
            for i, doc in enumerate(header_docs):
                sub_chunks = sentence_splitter.split_text(doc.page_content)
                # Prefix each sub-chunk with its header trail for context
                header = "\n".join([f"{k}: {v}" for k, v in doc.metadata.items()]) if doc.metadata else ""
                for sub_chunk in sub_chunks:
                    chunk_text = f"{header}\n{sub_chunk}".strip() if header else sub_chunk
                    chunks.append(chunk_text)
                    metadatas.append({
                        "chunk_id": str(uuid.uuid4()),
                        # Index of the header section this sub-chunk came from
                        "chunk_sequence": i,
                        "doc_id": doc_id,
                        "url": url,
                        "doc_timestamp": doc_timestamp
                    })

            if not chunks:
                print(f"⚠️ No chunks extracted for {url}")
                continue

            # Generate embeddings
            chunk_embeddings = embeddings.embed_documents(chunks)

            # Combine data
            combined_data = [
                {
                    "chunk_id": meta["chunk_id"],
                    "chunk_sequence": meta["chunk_sequence"],
                    "chunk_text": chunk,
                    "chunk_embed": embedding,
                    "doc_id": meta["doc_id"],
                    "url": meta["url"],
                    "doc_timestamp": meta["doc_timestamp"]
                }
                for chunk, embedding, meta in zip(chunks, chunk_embeddings, metadatas)
            ]

            # Insert per URL to manage memory
            collection.insert(combined_data)
            total_chunks += len(chunks)
            urls_stored += 1
            print(f"✅ Processed {url} ({len(chunks)} chunks, doc_id: {doc_id})")

        except Exception as e:
            # Embedding and Milvus errors are not request/value errors; catch
            # broadly so one bad URL cannot abort the run before indexing.
            print(f"❌ Error processing {url}: {e}")

    # Create indexes and load collection
    if total_chunks > 0:
        collection.create_index(
            field_name="chunk_embed",
            index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}}
        )
        collection.create_index(field_name="doc_id", index_params={"index_type": "TRIE"})
        collection.load()
        # Report only the URLs that were stored, not every URL requested
        print(f"🎉 Inserted {total_chunks} chunks from {urls_stored} URLs into {collection_name}")
    else:
        print("⚠️ No data inserted into collection.")

In [None]:
# Startup India pages to ingest with the process_and_store_urls defined above.
# NOTE: this rebinds the notebook-level `urls` name used by the earlier example cell.
urls = [
        "https://www.startupindia.gov.in/content/sih/en/home-page.html",
        "https://www.startupindia.gov.in/content/sih/en/startup.html",
        "https://www.startupindia.gov.in/content/sih/en/schemes.html"
    ]
# Runs the full scrape -> embed -> store pipeline with the default collection name.
process_and_store_urls(urls)