## Scrapping

In [1]:
import os
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import re
from urllib.parse import urlparse
from datetime import datetime

In [113]:
DOWNLOAD_DIR = os.path.abspath("data")
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": DOWNLOAD_DIR,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True
})

driver = webdriver.Chrome(
    service=Service(),
    options=chrome_options
) 
driver.get("https://fino.bank.in/regulatory/notices-policies")



In [114]:
ele = driver.find_element(By.CLASS_NAME, "policies-table")
body = ele.find_element(By.TAG_NAME, "tbody")

In [115]:
def extract_version(text):
    match = re.search(r'[vVVersion](\d+\.?\d*)', text)
    if match:
        return f"V{match.group(1)}" 
    
    return "V1"


In [116]:
def date_format(date_str):
    
    try:
        date_obj = datetime.strptime(date_str, "%d %b %Y")
        return date_obj.strftime("%d-%m-%Y")
    except ValueError:
        return date_str  # Return as is if format is unexpected

In [119]:
policy_file =[]
for row in body.find_elements(By.TAG_NAME, "tr"):
    cell = row.find_elements(By.TAG_NAME, "td")
    if len(cell) >= 4:
        
        name = cell[1].text
        version = extract_version(name)

        date = date_format(cell[2].text)
        
        try:
            download_url = cell[3].find_element(By.TAG_NAME, "a").get_attribute("href")
            version = extract_version(download_url)
            
            a = urlparse(download_url)
            filename = os.path.basename(a.path)
            try:
                driver.execute_script("window.open(arguments[0]);", download_url)
            except Exception as e:
                print(f"Error downloading {name}: {e}")

        except:
            download_url = None

        policy_data = {
            "sr": cell[0].text,
            "policy_name": name,
            "effective_date": date,
            "version": version,
            "file_path": filename,
            "download_url": download_url
        }

        policy_file.append(policy_data)
        
        with open("policy_details.jsonl", "a") as f:
            f.write(json.dumps(policy_data) + "\n")
        



## FAQ Extraction

In [18]:
faq_driver = webdriver.Chrome() 
faq_driver.get("https://docs.fino.bank.in/faqs")

In [19]:
all_topic = faq_driver.find_elements(By.CLASS_NAME, "card")

In [20]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(faq_driver, 10)

# 2. Target the 'a' tag specifically inside the AccountsHeading0 div
# We use the CSS Selector to find the <a> tag that is a child of the header id
expand_button = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, "#AccountsHeading0 a[role='button']")
))

In [21]:
import time

In [None]:
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://docs.fino.bank.in/faqs")
time.sleep(3)

faq_pipeline = []

# 1. Target ALL top-level cards (ACCOUNTS, CARDS, PAYMENTS, etc.)
main_cards = driver.find_elements(By.CSS_SELECTOR, "#faqs-accordion > .card")

for i in range(len(main_cards)):
    # Refresh references to avoid StaleElementReferenceException
    current_main = driver.find_elements(By.CSS_SELECTOR, "#faqs-accordion > .card")[i]
    
    # Get Main Category name and Expand (the '^' icon)
    main_btn = current_main.find_element(By.CSS_SELECTOR, ".card-header a")
    main_name = main_btn.text.strip()
    
    if main_btn.get_attribute("aria-expanded") == "false":
        driver.execute_script("arguments[0].click();", main_btn)
        time.sleep(1)

    # 2. Find ONLY the sub-cards inside THIS main card's body
    # This prevents the script from skipping categories like CARDS or PAYMENTS
    sub_cards = current_main.find_elements(By.CSS_SELECTOR, ".card-body .card")
    
    for sub_card in sub_cards:
        sub_btn = sub_card.find_element(By.CSS_SELECTOR, ".card-header a")
        sub_name = sub_btn.text.strip()
        
        # Expand the sub-question (+)
        if sub_btn.get_attribute("aria-expanded") == "false":
            driver.execute_script("arguments[0].click();", sub_btn)
            time.sleep(0.5)

        # 3. EXTRACTION: Find the visible collapse div under THIS sub-card
        try:
            # We look for the div that has class 'collapse' and is currently 'show'
            content_div = sub_card.find_element(By.CSS_SELECTOR, "div.collapse.show")
            
            # This captures EVERYTHING (li, h6, p, spans) as clean text
            content_text = content_div.text.strip()
            
            faq_pipeline.append({
                "main_category": main_name,
                "sub_category": sub_name,
                "content": content_text
            })
            print(f"✅ Extracted: {main_name} > {sub_name}")
            
        except Exception:
            print(f"❌ Failed content for: {sub_name}")

# Save final knowledge base
with open("fino_complete_faqs.json", "w", encoding="utf-8") as f:
    json.dump(faq_pipeline, f, indent=4)

driver.quit()

InvalidSelectorException: Message: invalid selector: Unable to locate an element with the xpath expression //strong | //b | h3 | h4 | [class*='question'] because of the following error:
SyntaxError: Failed to execute 'evaluate' on 'Document': The string '//strong | //b | h3 | h4 | [class*='question']' is not a valid XPath expression.
  (Session info: chrome=143.0.7499.170); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidselectorexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff667e388e5
	0x7ff667e38940
	0x7ff667c1165d
	0x7ff667c192cc
	0x7ff667c1c3b4
	0x7ff667c1c47f
	0x7ff667c692e5
	0x7ff667c69d3c
	0x7ff667cbdf97
	0x7ff667cbac97
	0x7ff667c5ac29
	0x7ff667c5ba93
	0x7ff668150640
	0x7ff66814af80
	0x7ff6681696e6
	0x7ff667e55de4
	0x7ff667e5ed8c
	0x7ff667e42004
	0x7ff667e421b5
	0x7ff667e27ee2
	0x7ffed209e8d7
	0x7ffed346c53c


In [5]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# 1. Setup Driver
driver = webdriver.Chrome()
driver.maximize_window()
driver.get("https://docs.fino.bank.in/faqs")
time.sleep(5) 

output_filename = "fino_full_faqs.txt"

# Target the main category cards (Accounts, Cards, etc.)
main_selector = "#faqs-accordion > .card"
main_cards_count = len(driver.find_elements(By.CSS_SELECTOR, main_selector))

with open(output_filename, "w", encoding="utf-8") as f:
    for i in range(main_cards_count):
        # Re-fetch main cards to avoid stale elements
        main_cards = driver.find_elements(By.CSS_SELECTOR, main_selector)
        current_main = main_cards[i]
        
        # Get Main Category Title and Expand
        main_header = current_main.find_element(By.CSS_SELECTOR, ".card-header a")
        main_title = main_header.text.strip()
        
        print(f"\n>>> Processing Category: {main_title}")
        
        if main_header.get_attribute("aria-expanded") == "false":
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", main_header)
            driver.execute_script("arguments[0].click();", main_header)
            time.sleep(2) # Wait for sub-categories to load

        # 2. Find all Sub-category cards (+) inside this main category
        sub_cards = current_main.find_elements(By.CSS_SELECTOR, ".card-body .card")
        
        for sub in sub_cards:
            # Get Sub-category Title
            sub_btn = sub.find_element(By.CSS_SELECTOR, ".card-header a")
            sub_title = sub_btn.text.strip()
            
            # Expand Sub-category
            if sub_btn.get_attribute("aria-expanded") == "false":
                driver.execute_script("arguments[0].click();", sub_btn)
                time.sleep(1) # CRITICAL: Wait for the text to actually appear

            # 3. Copy the specific Sub-category title and its expanded text
            try:
                # Target the newly expanded content area
                content_area = sub.find_element(By.CSS_SELECTOR, ".collapse")
                content_text = content_area.get_attribute("innerText").strip()
                
                # Write to .txt file
                f.write(f"MAIN CATEGORY: {main_title}\n")
                f.write(f"SUB-CATEGORY: {sub_title}\n")
                f.write(f"DETAILS:\n{content_text}\n")
                f.write("-" * 40 + "\n\n")
                
                print(f"   ✓ Copied: {sub_title}")
                
            except Exception as e:
                print(f"   ✗ Failed to copy {sub_title}: {e}")

        # Collapse the main card before moving to the next
        driver.execute_script("arguments[0].click();", main_header)
        time.sleep(1)

print(f"\nDone! All text saved to {output_filename}")
driver.quit()


>>> Processing Category: ACCOUNTS & DEPOSITS
   ✓ Copied: Minor Savings Account
   ✓ Copied: Jan Savings Account
   ✓ Copied: Aarambh Savings Account
   ✓ Copied: Sampann Current Account
   ✓ Copied: Account Opening Process
   ✓ Copied: Account Maintenance Process
   ✓ Copied: Savings account
   ✓ Copied: Current Account
   ✓ Copied: Fund Transfer
   ✓ Copied: Account Settlement – Deceased case

>>> Processing Category: CARDS
   ✓ Copied: 
   ✓ Copied: 

>>> Processing Category: PAYMENTS
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 

>>> Processing Category: FASTAG
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 
   ✓ Copied: 

>>> Processing Category: NETBANKING
  

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.maximize_window()
driver.get("https://docs.fino.bank.in/faqs")
time.sleep(5) 

output_filename = "fino_full_qa_dump.txt"
main_selector = "#faqs-accordion > .card"

with open(output_filename, "w", encoding="utf-8") as f:
    main_cards_count = len(driver.find_elements(By.CSS_SELECTOR, main_selector))
    
    for i in range(main_cards_count):
        main_cards = driver.find_elements(By.CSS_SELECTOR, main_selector)
        current_main = main_cards[i]
        
        main_header = current_main.find_element(By.CSS_SELECTOR, ".card-header a")
        main_title = main_header.text.strip()
        
        print(f"\nExtracting: {main_title}")
        
        # Open Main Card
        if main_header.get_attribute("aria-expanded") == "false":
            driver.execute_script("arguments[0].click();", main_header)
            time.sleep(2)

        sub_cards = current_main.find_elements(By.CSS_SELECTOR, ".card-body .card")
        
        for sub in sub_cards:
            sub_btn = sub.find_element(By.CSS_SELECTOR, ".card-header a")
            sub_title = sub_btn.text.strip()
            
            # Open Sub Card
            if sub_btn.get_attribute("aria-expanded") == "false":
                driver.execute_script("arguments[0].click();", sub_btn)
                time.sleep(1.5) # Wait for animation to finish

            try:
                # Target the exact div that contains the h6 (question) and p (answer)
                # Usually .card-body inside the .collapse div
                content_box = sub.find_element(By.CSS_SELECTOR, ".collapse .card-body")
                # Using textContent via JS often gets text that .text misses
                full_content = driver.execute_script("return arguments[0].textContent;", content_box).strip()
                
                # Write to file
                f.write(f"--- {main_title} | {sub_title} ---\n")
                f.write(full_content + "\n")
                f.write("="*60 + "\n\n")
                
                print(f"   ✓ Captured Q&A for: {sub_title}")
                
            except Exception:
                # Fallback: if .card-body fails, try the whole .collapse div
                try:
                    content_box = sub.find_element(By.CSS_SELECTOR, ".collapse")
                    full_content = content_box.get_attribute("innerText").strip()
                    f.write(f"--- {main_title} | {sub_title} ---\n")
                    f.write(full_content + "\n")
                    f.write("="*60 + "\n\n")
                except:
                    print(f"   ✗ Could not find content for: {sub_title}")

        # Collapse main card
        driver.execute_script("arguments[0].click();", main_header)
        time.sleep(1)

driver.quit()


Extracting: ACCOUNTS & DEPOSITS
   ✗ Could not find content for: Minor Savings Account
   ✗ Could not find content for: Jan Savings Account
   ✗ Could not find content for: Aarambh Savings Account
   ✗ Could not find content for: Sampann Current Account
   ✗ Could not find content for: Account Opening Process
   ✗ Could not find content for: Account Maintenance Process
   ✗ Could not find content for: Savings account
   ✗ Could not find content for: Current Account
   ✗ Could not find content for: Fund Transfer
   ✗ Could not find content for: Account Settlement – Deceased case

Extracting: CARDS
   ✗ Could not find content for: 
   ✗ Could not find content for: 

Extracting: PAYMENTS
   ✗ Could not find content for: 
   ✗ Could not find content for: 
   ✗ Could not find content for: 
   ✗ Could not find content for: 

Extracting: FASTAG
   ✗ Could not find content for: 
   ✗ Could not find content for: 
   ✗ Could not find content for: 
   ✗ Could not find content for: 
   ✗ Could no

KeyboardInterrupt: 