## Scraping

In [112]:
import os
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import re
from urllib.parse import urlparse
from datetime import datetime

In [113]:
# Absolute path of the local "data" folder used as Chrome's download target;
# it is created up front if it does not exist yet.
DOWNLOAD_DIR = os.path.abspath("data")
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Chrome profile preferences passed through the "prefs" experimental option.
# They point downloads at DOWNLOAD_DIR and turn off the save prompt; the
# last entry is presumably what makes PDFs download instead of opening in
# Chrome's built-in viewer -- confirm against Chrome's preference docs.
download_prefs = {
    "download.default_directory": DOWNLOAD_DIR,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
}

chrome_options = Options()
chrome_options.add_experimental_option("prefs", download_prefs)

# Start Chrome with a default Service() and open the notices/policies page.
driver = webdriver.Chrome(service=Service(), options=chrome_options)
driver.get("https://fino.bank.in/regulatory/notices-policies")



In [114]:
# Locate the element with class "policies-table" on the loaded page, then
# the <tbody> inside it whose rows are scraped further down.
ele = driver.find_element(by=By.CLASS_NAME, value="policies-table")
body = ele.find_element(by=By.TAG_NAME, value="tbody")

In [115]:
def extract_version(text):
    """Return the version tag found in *text*, formatted as ``"V<number>"``.

    A version is the word "version" (optionally followed by spaces,
    underscores or hyphens) or a bare "v", in any letter case, followed by a
    number such as ``2`` or ``1.2``. The first such occurrence wins.

    Args:
        text: Policy name or download URL to inspect. ``None`` and the empty
            string are accepted.

    Returns:
        ``"V<number>"`` for the first version found, or ``"V1"`` when *text*
        is empty/``None`` or contains no version marker.
    """
    if not text:
        return "V1"

    # Alternation, not a character class: "[vVVersion]" would accept any one
    # of the letters v/e/r/s/i/o/n in front of a digit (e.g. "section3").
    # The fractional part requires digits after the dot so that the "." of a
    # file extension ("v2.pdf") is not captured as part of the version.
    match = re.search(
        r'(?:version[\s_-]*|v)(\d+(?:\.\d+)?)',
        text,
        flags=re.IGNORECASE,
    )
    if match:
        return f"V{match.group(1)}"

    return "V1"


In [116]:
def date_format(date_str):
    """Convert a ``"DD Mon YYYY"`` date string to ``"DD-MM-YYYY"``.

    Args:
        date_str: Date text such as ``"05 Jan 2024"``.

    Returns:
        The date rendered as ``"05-01-2024"``; if *date_str* does not match
        the ``"%d %b %Y"`` format it is returned unchanged.
    """
    try:
        parsed = datetime.strptime(date_str, "%d %b %Y")
    except ValueError:
        # Unexpected format: hand the original text back untouched.
        return date_str
    else:
        return f"{parsed:%d-%m-%Y}"

In [119]:
# Walk every row of the policies table: collect its metadata, ask the browser
# to download the linked file, and append one JSON line per policy to
# policy_details.jsonl. All collected records are also kept in policy_file.
policy_file = []

# Opened once, in append mode: re-running this cell adds to an existing file
# rather than overwriting it.
with open("policy_details.jsonl", "a", encoding="utf-8") as f:
    for row in body.find_elements(By.TAG_NAME, "tr"):
        cell = row.find_elements(By.TAG_NAME, "td")
        # Skip rows that do not have at least the four expected cells
        # (serial number, name, date, download link).
        if len(cell) < 4:
            continue

        name = cell[1].text
        date = date_format(cell[2].text)

        # find_elements returns an empty list when the cell has no <a>, so a
        # missing link is detected without a catch-all "except".
        links = cell[3].find_elements(By.TAG_NAME, "a")
        download_url = (links[0].get_attribute("href") or None) if links else None

        if download_url:
            # When a link exists, the version parsed from its URL takes
            # precedence over the one in the policy name.
            version = extract_version(download_url)
            filename = os.path.basename(urlparse(download_url).path)
            try:
                # Opening the URL in a new window makes Chrome download the
                # file into the configured download directory.
                driver.execute_script("window.open(arguments[0]);", download_url)
            except Exception as e:
                # Best effort: report the failure and still record the row.
                print(f"Error downloading {name}: {e}")
        else:
            # No link: fall back to the version in the name, and reset
            # filename so it is neither undefined nor left over from the
            # previous row.
            version = extract_version(name)
            filename = None

        policy_data = {
            "sr": cell[0].text,
            "policy_name": name,
            "effective_date": date,
            "version": version,
            "file_path": filename,
            "download_url": download_url
        }

        policy_file.append(policy_data)
        f.write(json.dumps(policy_data) + "\n")
        

