In [5]:
import os
import time
import requests
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from webdriver_manager.chrome import ChromeDriverManager

# ---------------- CONFIG ----------------
PDF_DIR = "judgments_pdfs"
CSV_FILE = "judgments_data.csv"
os.makedirs(PDF_DIR, exist_ok=True)
# ---------------------------------------

# Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # remove to see browser
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")

# Launch Chrome with webdriver-manager (automatic driver)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Open Rajasthan HC Judgments page
url = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
driver.get(url)

wait = WebDriverWait(driver, 20)  # wait up to 20 sec for elements

# ---------------- Switch to iframe ----------------
iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe")))
driver.switch_to.frame(iframe)

# ---------------- Fill the form ----------------
to_date = datetime.today().strftime("%d-%m-%Y")
from_date = (datetime.today() - timedelta(days=10)).strftime("%d-%m-%Y")

from_date_elem = wait.until(EC.presence_of_element_located((By.ID, "txtFromDate")))
to_date_elem = wait.until(EC.presence_of_element_located((By.ID, "txtToDate")))
report_elem = wait.until(EC.presence_of_element_located((By.ID, "ddlReportableJudgment")))

from_date_elem.send_keys(from_date)
to_date_elem.send_keys(to_date)
report_elem.send_keys("YES")

# ---------------- Handle captcha ----------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "imgCaptcha")))
captcha_img_path = "captcha.png"
captcha_img.screenshot(captcha_img_path)
Image.open(captcha_img_path).show()
captcha_text = input("Enter Captcha: ")

driver.find_element(By.ID, "txtCaptcha").send_keys(captcha_text)
driver.find_element(By.ID, "btnSubmit").click()

# ---------------- Scrape table + download PDFs ----------------
rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tbody tr")))
data = []

for r in rows:
    cols = r.find_elements(By.TAG_NAME, "td")
    if not cols:
        continue

    case_no = cols[0].text.strip()
    case_title = cols[1].text.strip()
    judge = cols[2].text.strip()
    date_judg = cols[3].text.strip()
    pdf_link = cols[-1].find_element(By.TAG_NAME, "a").get_attribute("href")

    pdf_name = f"{case_no.replace('/', '_')}_{date_judg}.pdf"
    pdf_path = os.path.join(PDF_DIR, pdf_name)

    if not os.path.exists(pdf_path):
        pdf_content = requests.get(pdf_link).content
        with open(pdf_path, "wb") as f:
            f.write(pdf_content)

    data.append({
        "CaseNo": case_no,
        "CaseTitle": case_title,
        "Judge": judge,
        "Date": date_judg,
        "PDF_Link": pdf_link,
        "PDF_Name": pdf_name
    })

# ---------------- Save to CSV incrementally ----------------
df_new = pd.DataFrame(data)

if os.path.exists(CSV_FILE):
    df_old = pd.read_csv(CSV_FILE)
    df_final = pd.concat([df_old, df_new]).drop_duplicates(subset=["CaseNo", "Date"])
else:
    df_final = df_new

df_final.to_csv(CSV_FILE, index=False)
print("Data saved to:", CSV_FILE)
print(df_final.head())

driver.quit()


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x0x9cd2a3+66419]
	GetHandleVerifier [0x0x9cd2e4+66484]
	(No symbol) [0x0x7a4bd3]
	(No symbol) [0x0x7ee958]
	(No symbol) [0x0x7eecfb]
	(No symbol) [0x0x835152]
	(No symbol) [0x0x811064]
	(No symbol) [0x0x8328a1]
	(No symbol) [0x0x810e16]
	(No symbol) [0x0x7e25ce]
	(No symbol) [0x0x7e34a4]
	GetHandleVerifier [0x0xc15ee3+2461619]
	GetHandleVerifier [0x0xc10f66+2441270]
	GetHandleVerifier [0x0x9f6242+234258]
	GetHandleVerifier [0x0x9e6208+168664]
	GetHandleVerifier [0x0x9ed1ad+197245]
	GetHandleVerifier [0x0x9d55f8+100040]
	GetHandleVerifier [0x0x9d5792+100450]
	GetHandleVerifier [0x0x9bf74a+10266]
	BaseThreadInitThunk [0x0x768e5d49+25]
	RtlInitializeExceptionChain [0x0x772dd2fb+107]
	RtlGetAppContainerNamedObjectPath [0x0x772dd281+561]


In [12]:
import os
import time
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image

# ---------------- CONFIG ----------------
PDF_DIR = "judgments_pdfs"
CSV_FILE = "judgments_data.csv"
URL = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"

# Make directories if not exist
os.makedirs(PDF_DIR, exist_ok=True)

# ---------------- WebDriver ----------------
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")   # remove if you want to see browser
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(r"C:\Users\shubham\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"), options=chrome_options)

wait = WebDriverWait(driver, 20)

driver.get(URL)

# ---------------- Dates ----------------
to_date = datetime.today()
from_date = to_date - timedelta(days=10)

from_date_str = from_date.strftime("%d/%m/%Y")
to_date_str = to_date.strftime("%d/%m/%Y")

# ---------------- Fill Form ----------------
wait.until(EC.presence_of_element_located((By.ID, "partyFromDate"))).clear()
driver.find_element(By.ID, "partyFromDate").send_keys(from_date_str)

wait.until(EC.presence_of_element_located((By.ID, "partyToDate"))).clear()
driver.find_element(By.ID, "partyToDate").send_keys(to_date_str)

# Reportable Judgment = Yes
driver.find_element(By.ID, "rpjudgeY").click()

# ---------------- Captcha Handling ----------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "ContentPlaceHolder1_imgCaptcha")))
captcha_path = "captcha.png"
captcha_img.screenshot(captcha_path)
Image.open(captcha_path).show()
captcha_text = input("Enter Captcha from captcha.png: ")

captcha_input = driver.find_element(By.ID, "ContentPlaceHolder1_txtCaptcha")
captcha_input.send_keys(captcha_text)

driver.find_element(By.ID, "ContentPlaceHolder1_btnSubmit").click()

# ---------------- Wait for table ----------------
table = wait.until(EC.presence_of_element_located((By.ID, "ContentPlaceHolder1_GridView1")))

# ---------------- Scrape Table ----------------
rows = table.find_elements(By.TAG_NAME, "tr")
data = []

# Load previously downloaded CSV if exists
if os.path.exists(CSV_FILE):
    existing_df = pd.read_csv(CSV_FILE)
    downloaded_links = set(existing_df['PDF_Link'].tolist())
else:
    downloaded_links = set()

for row in rows[1:]:  # skip header
    cols = row.find_elements(By.TAG_NAME, "td")
    if len(cols) < 2:
        continue
    row_data = {
        "Case_No": cols[0].text.strip(),
        "Case_Title": cols[1].text.strip(),
        "PDF_Link": cols[1].find_element(By.TAG_NAME, "a").get_attribute("href")
    }
    
    # Skip if already downloaded
    if row_data["PDF_Link"] in downloaded_links:
        continue

    # Download PDF
    pdf_name = os.path.join(PDF_DIR, row_data["Case_No"].replace("/", "_") + ".pdf")
    driver.get(row_data["PDF_Link"])
    with open(pdf_name, "wb") as f:
        f.write(driver.find_element(By.TAG_NAME, "embed").screenshot_as_png)
    
    row_data["PDF_File"] = pdf_name
    data.append(row_data)

# ---------------- Save CSV ----------------
if os.path.exists(CSV_FILE):
    df = pd.concat([existing_df, pd.DataFrame(data)], ignore_index=True)
else:
    df = pd.DataFrame(data)

df.to_csv(CSV_FILE, index=False)
print(f"Scraped {len(data)} new judgments. Data saved to {CSV_FILE}")

driver.quit()


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff76496fc95+79861]
	GetHandleVerifier [0x0x7ff76496fcf0+79952]
	(No symbol) [0x0x7ff7646ecada]
	(No symbol) [0x0x7ff764744457]
	(No symbol) [0x0x7ff76474471c]
	(No symbol) [0x0x7ff764798217]
	(No symbol) [0x0x7ff76476cb1f]
	(No symbol) [0x0x7ff764794f8b]
	(No symbol) [0x0x7ff76476c8b3]
	(No symbol) [0x0x7ff764735272]
	(No symbol) [0x0x7ff764736043]
	GetHandleVerifier [0x0x7ff764c2b9cd+2946349]
	GetHandleVerifier [0x0x7ff764c25c4a+2922410]
	GetHandleVerifier [0x0x7ff764c459d7+3052855]
	GetHandleVerifier [0x0x7ff76498aa7e+189918]
	GetHandleVerifier [0x0x7ff764992a1f+222591]
	GetHandleVerifier [0x0x7ff764978ab4+116244]
	GetHandleVerifier [0x0x7ff764978c69+116681]
	GetHandleVerifier [0x0x7ff76495f048+11176]
	BaseThreadInitThunk [0x0x7ffc2fd9e8d7+23]
	RtlUserThreadStart [0x0x7ffc30e5c34c+44]


In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time

# ------------------- CONFIG -------------------

URL = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"  # <-- update this

# ------------------- CHROME OPTIONS -------------------
chrome_options = Options()
chrome_options.add_argument("--headless")  # remove if you want to see browser
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(r"C:\Users\shubham\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"), options=chrome_options)
wait = WebDriverWait(driver, 20)

driver.get(URL)

# ------------------- FILL DATES -------------------
from_date = wait.until(EC.presence_of_element_located((By.ID, "partyFromDate")))
to_date = wait.until(EC.presence_of_element_located((By.ID, "partyToDate")))
from_date.clear()
from_date.send_keys("01/09/2025")
to_date.clear()
to_date.send_keys("11/09/2025")

# ------------------- SELECT CATEGORY -------------------
# category_select = Select(driver.find_element(By.ID, "casebasetype"))
# category_select.select_by_visible_text("Civil")  # Change as needed

time.sleep(1)  # wait for Case Type dropdown to populate

# ------------------- SELECT CASE TYPE -------------------
# case_type_select = Select(driver.find_element(By.ID, "casetype"))
# case_type_select.select_by_index(1)  # select first option

# ------------------- PET/RES -------------------
# petres_select = Select(driver.find_element(By.ID, "partypetres1"))
# petres_select.select_by_visible_text("Both")  # change if needed

# ------------------- JUDGE -------------------
judge_select = Select(driver.find_element(By.ID, "judgeCode1"))
judge_select.select_by_index(1)  # select first judge

# ------------------- BENCH -------------------
# bench_radio = driver.find_element(By.ID, "bench")  # "All" option
# bench_radio.click()

# ------------------- PIL JUDGEMENT -------------------
# pil_radio = driver.find_element(By.ID, "pilJudgeA")  # "All"
# pil_radio.click()

# ------------------- JUDGEMENT / ORDER -------------------
# jud_order_radio = driver.find_element(By.ID, "judorderA")  # "All"
# jud_order_radio.click()

# ------------------- REPORTABLE JUDGEMENT -------------------
rp_judge_radio = driver.find_element(By.ID, "rpjudgeY")  # "All"
rp_judge_radio.click()

# ------------------- CAPTCHA -------------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "captcha")))
captcha_img.screenshot("captcha.png")
print("Captcha saved as captcha.png. Please open and enter the code.")

captcha_input = wait.until(EC.presence_of_element_located((By.ID, "txtCaptcha")))
captcha_code = input("Enter captcha code from captcha.png: ")
captcha_input.send_keys(captcha_code)

# ------------------- SUBMIT -------------------
submit_btn = driver.find_element(By.ID, "btncasedetail1_1")  # replace with actual ID if different
submit_btn.click()

print("Form submitted successfully!")

# ------------------- CLEAN UP -------------------
time.sleep(5)
driver.quit()


Captcha saved as captcha.png. Please open and enter the code.
Form submitted successfully!


In [10]:
# Install dependencies first
# !pip install selenium pandas requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import os
import time
import pandas as pd
from datetime import datetime, timedelta
import requests

# ------------------- CONFIG -------------------
BASE_URL = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
DOWNLOAD_DIR = "judgments_pdfs"
CSV_FILE = "downloaded_judgments.csv"

os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# ------------------- DATES -------------------
today = datetime.today()
# from_date_str = (today - timedelta(days=10)).strftime("%d/%m/%Y")
# to_date_str = today.strftime("%d/%m/%Y")
from_date_str = ("01/09/2025")
to_date_str = ("11/09/2025")

# ------------------- CHROME OPTIONS -------------------
chrome_options = Options()
chrome_options.add_argument("--headless")  # optional
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(
    service=Service(r"C:\Users\shubham\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"),
    options=chrome_options
)
wait = WebDriverWait(driver, 20)

driver.get(BASE_URL)

# ------------------- FILL FORM -------------------
wait.until(EC.presence_of_element_located((By.ID, "partyFromDate"))).send_keys(from_date_str)
wait.until(EC.presence_of_element_located((By.ID, "partyToDate"))).send_keys(to_date_str)

# ------------------- JUDGE SELECTION -------------------
# leave as default if not needed

# ------------------- REPORTABLE -------------------
driver.find_element(By.ID, "rpjudgeA").click()  # Reportable = YES

# ------------------- CAPTCHA -------------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "captcha")))
captcha_img.screenshot("captcha.png")
captcha_code = input("Enter captcha code from captcha.png: ")
driver.find_element(By.ID, "txtCaptcha").send_keys(captcha_code)

# ------------------- SUBMIT -------------------
driver.find_element(By.ID, "btncasedetail1_1").click()

# ------------------- WAIT FOR RESULTS -------------------
try:
    table = WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.ID, "sample_1"))
    )
except:
    print("No results table found. Either no judgments or captcha was incorrect.")
    driver.quit()
    exit()
rows = table.find_elements(By.TAG_NAME, "tr")

# ------------------- LOAD PREVIOUSLY DOWNLOADED -------------------
if os.path.exists(CSV_FILE):
    df_prev = pd.read_csv(CSV_FILE)
    downloaded_ids = set(df_prev['Case Details'].astype(str).tolist())
else:
    df_prev = pd.DataFrame()
    downloaded_ids = set()


data_list = []

for row in rows[1:]:  # skip header
    cols = row.find_elements(By.TAG_NAME, "td")
    judgment_no = cols[0].text.strip()  # assuming first column is unique
    if judgment_no in downloaded_ids:
        continue  # skip already downloaded

    buttons = cols[-1].find_elements(By.TAG_NAME, "button")  # last td is Action
    for btn in buttons:
        if btn.get_attribute("data-ftype") == "PDF":
            caseno = btn.get_attribute("data-caseno")
            orderno = btn.get_attribute("data-orderno")
            cyear = btn.get_attribute("data-cyear")
            break
    pdf_url = f"https://hcraj.nic.in/cishcraj-jdp/DownloadJudgement.aspx?caseno={caseno}&orderno={orderno}&cyear={cyear}&ftype=PDF"

    pdf_filename = f"{judgment_no}.pdf"
    pdf_path = os.path.join(DOWNLOAD_DIR, pdf_filename)

    # download pdf
r = requests.get(pdf_url, stream=True)
with open(pdf_path, "wb") as f:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f.write(chunk)

    row_data = [col.text.strip() for col in cols]
    row_data.append(pdf_filename)  # add PDF filename column
    data_list.append(row_data)

# ------------------- SAVE TO CSV -------------------
columns = [th.text.strip() for th in table.find_elements(By.TAG_NAME, "th")]
columns.append("PDF_File")

df_new = pd.DataFrame(data_list, columns=columns)

if not df_prev.empty:
    df_final = pd.concat([df_prev, df_new], ignore_index=True)
else:
    df_final = df_new

df_final.to_csv(CSV_FILE, index=False)
print(f"Data saved to {CSV_FILE}. PDFs in {DOWNLOAD_DIR}")

driver.quit()


Data saved to downloaded_judgments.csv. PDFs in judgments_pdfs


In [1]:
# Install dependencies first
# !pip install selenium pandas requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import os
import time
import pandas as pd
from datetime import datetime, timedelta
import requests

# ------------------- CONFIG -------------------
BASE_URL = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
DOWNLOAD_DIR = "judgments_pdfs"
CSV_FILE = "downloaded_judgments.csv"

os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# ------------------- DATES -------------------
today = datetime.today()
# from_date_str = (today - timedelta(days=10)).strftime("%d/%m/%Y")
# to_date_str = today.strftime("%d/%m/%Y")
from_date_str = ("01/09/2025")
to_date_str = ("11/09/2025")

# ------------------- CHROME OPTIONS -------------------
chrome_options = Options()
# chrome_options.add_argument("--headless")  # optional
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(
    service=Service(r"C:\Users\shubham\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"),
    options=chrome_options
)
wait = WebDriverWait(driver, 20)

driver.get(BASE_URL)

# ------------------- FILL FORM -------------------
wait.until(EC.presence_of_element_located((By.ID, "partyFromDate"))).send_keys(from_date_str)
wait.until(EC.presence_of_element_located((By.ID, "partyToDate"))).send_keys(to_date_str)

# ------------------- JUDGE SELECTION -------------------
# leave as default if not needed

# ------------------- REPORTABLE -------------------
driver.find_element(By.ID, "rpjudgeA").click()  # Reportable = YES

# ------------------- CAPTCHA -------------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "captcha")))
captcha_img.screenshot("captcha.png")
captcha_code = input("Enter captcha code from captcha.png: ")
driver.find_element(By.ID, "txtCaptcha").send_keys(captcha_code)

# ------------------- SUBMIT -------------------
driver.find_element(By.ID, "btncasedetail1_1").click()

# ------------------- WAIT FOR RESULTS -------------------
try:
    table = wait.until(EC.presence_of_element_located((By.ID, "sample_1")))
    thead = table.find_element(By.TAG_NAME, "thead")
    headers = thead.find_elements(By.TAG_NAME, "th")
    columns = [th.text.strip() for th in headers]
    columns.append("PDF_File")
except:
    print("No results table found. Either no judgments or captcha was incorrect.")
    driver.quit()
    exit()
rows = table.find_elements(By.TAG_NAME, "tr")

# ------------------- LOAD PREVIOUSLY DOWNLOADED -------------------
if os.path.exists(CSV_FILE):
    df_prev = pd.read_csv(CSV_FILE)
    downloaded_ids = set(df_prev['Case Details'].astype(str).tolist())
else:
    df_prev = pd.DataFrame()
    downloaded_ids = set()


data_list = []

for row in rows[1:]:  # skip header
    cols = row.find_elements(By.TAG_NAME, "td")
    judgment_no = cols[0].text.strip()  # assuming first column is unique
    if judgment_no in downloaded_ids:
        continue  # skip already downloaded

    buttons = cols[-1].find_elements(By.TAG_NAME, "button")  # last td is Action
    download_btn = None
    for btn in buttons:
        if btn.get_attribute("onclick") and "DownloadOrdJud(this,'D')" in btn.get_attribute("onclick"):
            download_btn = btn
            break

    if not download_btn:
        continue  # or log an error

    caseno = download_btn.get_attribute("data-caseno")
    orderno = download_btn.get_attribute("data-orderno")
    cyear = download_btn.get_attribute("data-cyear")
    pdf_url = f"https://hcraj.nic.in/cishcraj-jdp/DownloadJudgement.aspx?caseno={caseno}&orderno={orderno}&cyear={cyear}&ftype=PDF"

    pdf_filename = f"{judgment_no}.pdf"
    pdf_path = os.path.join(DOWNLOAD_DIR, pdf_filename)

    # download pdf
r = requests.get(pdf_url, stream=True)
with open(pdf_path, "wb") as f:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f.write(chunk)

row_data = [col.text.strip() for col in cols]
row_data.append(pdf_filename)  # add PDF filename column
data_list.append(row_data)

# ------------------- SAVE TO CSV -------------------
headers = driver.find_element(By.ID, "sample_1").find_element(By.TAG_NAME, "thead").find_elements(By.TAG_NAME, "th")
columns = [th.text.strip() for th in headers]
columns.append("PDF_File")

df_new = pd.DataFrame(data_list, columns=columns)

if not df_prev.empty:
    df_final = pd.concat([df_prev, df_new], ignore_index=True)
else:
    df_final = df_new

df_final.to_csv(CSV_FILE, index=False)
print(f"Data saved to {CSV_FILE}. PDFs in {DOWNLOAD_DIR}")

driver.quit()


Data saved to downloaded_judgments.csv. PDFs in judgments_pdfs


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import os
import pandas as pd
from datetime import datetime

# ------------------- CONFIGURATION -------------------
BASE_URL = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
DOWNLOAD_DIR = "judgments_pdfs"
CSV_FILE = "downloaded_judgments.csv"

os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# ------------------- CHROME OPTIONS -------------------
chrome_options = Options()
chrome_options.add_argument("--headless")  # Comment this out to debug visually
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")

driver = webdriver.Chrome(
    service=Service(r"C:\Users\shubham\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"),
    options=chrome_options
)
wait = WebDriverWait(driver, 20)

# ------------------- OPEN PAGE -------------------
driver.get(BASE_URL)

# ------------------- FILL DATE RANGE -------------------
from_date_str = "01/09/2025"
to_date_str = "11/09/2025"
wait.until(EC.presence_of_element_located((By.ID, "partyFromDate"))).send_keys(from_date_str)
wait.until(EC.presence_of_element_located((By.ID, "partyToDate"))).send_keys(to_date_str)

# ------------------- SELECT REPORTABLE -------------------
driver.find_element(By.ID, "rpjudgeA").click()

# ------------------- CAPTCHA -------------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "captcha")))
captcha_img.screenshot("captcha.png")
captcha_code = input("Enter captcha code from captcha.png: ")
driver.find_element(By.ID, "txtCaptcha").send_keys(captcha_code)

# ------------------- SUBMIT FORM -------------------
driver.find_element(By.ID, "btncasedetail1_1").click()

# ------------------- WAIT FOR TABLE -------------------
try:
    table = WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.ID, "sample_1"))
    )
    print("Table loaded!")
except:
    print("Table not found. Possibly wrong captcha or no data.")
    driver.quit()
    exit()

# ------------------- EXTRACT DATA -------------------
data_list = []
tbody = table.find_element(By.TAG_NAME, "tbody")
rows = tbody.find_elements(By.TAG_NAME, "tr")

for row in rows:
    cells = row.find_elements(By.TAG_NAME, "td")
    sr_no = cells[0].text.strip()

    # Extract Case Details with <br> handled
    case_details_element = cells[1]
    case_details_parts = case_details_element.get_attribute('innerHTML').split("<br>")
    case_details = ' '.join(part.strip() for part in case_details_parts)

    order_date = cells[2].text.strip()

    # Extract PDF info from buttons
    buttons = cells[3].find_elements(By.TAG_NAME, "button")
    pdf_url = "Not available"
    for btn in buttons:
        if btn.get_attribute("data-ftype") == "PDF" and btn.get_attribute("onclick") and "DownloadOrdJud" in btn.get_attribute("onclick"):
            caseno = btn.get_attribute("data-caseno")
            orderno = btn.get_attribute("data-orderno")
            cyear = btn.get_attribute("data-cyear")
            pdf_url = f"https://hcraj.nic.in/cishcraj-jdp/DownloadJudgement.aspx?caseno={caseno}&orderno={orderno}&cyear={cyear}&ftype=PDF"
            break

    data_list.append({
        "Sr.No.": sr_no,
        "Case Details": case_details,
        "Order/Judgement Date": order_date,
        "PDF URL": pdf_url
    })

# ------------------- SAVE TO CSV -------------------
df = pd.DataFrame(data_list)
df.to_csv(CSV_FILE, index=False)
print(f"Data saved to {CSV_FILE}")

driver.quit()


Table loaded!
Data saved to downloaded_judgments.csv


In [1]:
# Install dependencies first
# pip install selenium pandas requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import os
import time
import pandas as pd
from datetime import datetime
import requests

# ------------------- CONFIG -------------------
BASE_URL = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
DOWNLOAD_DIR = "judgments_pdfs"
CSV_FILE = "downloaded_judgments.csv"

os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# ------------------- DATES -------------------
today = datetime.today()
from_date_str = "01/09/2025"
to_date_str = "11/09/2025"

# ------------------- CHROME OPTIONS -------------------
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run without GUI
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(
    service=Service(r"C:\Users\shubham\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"),
    options=chrome_options
)
wait = WebDriverWait(driver, 40)

driver.get(BASE_URL)

# ------------------- FILL FORM -------------------
wait.until(EC.presence_of_element_located((By.ID, "partyFromDate"))).send_keys(from_date_str)
wait.until(EC.presence_of_element_located((By.ID, "partyToDate"))).send_keys(to_date_str)

# ------------------- REPORTABLE -------------------
driver.find_element(By.ID, "rpjudgeA").click()  # Reportable = YES

# ------------------- CAPTCHA -------------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "captcha")))
captcha_img.screenshot("captcha.png")
captcha_code = input("Enter captcha code from captcha.png: ")
driver.find_element(By.ID, "txtCaptcha").send_keys(captcha_code)

# ------------------- SUBMIT -------------------
driver.find_element(By.ID, "btncasedetail1_1").click()

# ------------------- SET "All" RECORDS -------------------
try:
    select_element = WebDriverWait(driver, 40).until(
        EC.presence_of_element_located((By.NAME, "sample_1_length"))
    )
    select = Select(select_element)
    select.select_by_value("10")  # Select "All" records
    time.sleep(7)  # wait for table to refresh
except:
    print("Dropdown for setting all records not found or not loaded.")
    driver.quit()
    exit()
# ----- WAIT FOR TABLE -------------------
try:
    table = wait.until(EC.presence_of_element_located((By.ID, "sample_1")))
    rows = table.find_elements(By.TAG_NAME, "tr")
    print(f"Total records found: {len(rows) - 1}")
except:
    print("No results table found. Either no judgments or captcha was incorrect.")
    driver.quit()
    exit()

# ------------------- LOAD PREVIOUSLY DOWNLOADED -------------------
if os.path.exists(CSV_FILE):
    df_prev = pd.read_csv(CSV_FILE)
    downloaded_ids = set(df_prev['Sr.No.'].astype(str).tolist())
else:
    df_prev = pd.DataFrame()
    downloaded_ids = set()

# ------------------- SCRAPE AND DOWNLOAD -------------------
# Wait until at least one PDF download button is present
wait.until(EC.presence_of_element_located(
    (By.XPATH, "//button[contains(@onclick,\"DownloadOrdJud(this,'D')\")]")
))

# Now fetch table rows
table = driver.find_element(By.ID, "sample_1")
rows = table.find_elements(By.TAG_NAME, "tr")
print(f"Total records found: {len(rows) - 1}")

data_list = []

for row in rows[1:]:
    cols = row.find_elements(By.TAG_NAME, "td")
    if len(cols) < 4:
        continue

    sr_no = cols[0].text.strip()
    case_details = cols[1].text.strip()
    order_date = cols[2].text.strip()

    download_btn = None
    # Use XPath on the row itself to find the 'D' button
    try:
        download_btn = row.find_element(
            By.XPATH, ".//button[contains(@onclick,\"DownloadOrdJud(this,'D')\")]"
        )
    except:
        print(f"PDF download button not found for {sr_no}")
        continue
    download_btn = row.find_element(By.XPATH, ".//button[contains(@onclick,\"DownloadOrdJud(this,'D')\")]")

    pdf_filename = f"{sr_no}.pdf"
    pdf_path = os.path.join(DOWNLOAD_DIR, pdf_filename)

    # Directly use requests as before
    pdf_url = f"https://hcraj.nic.in/cishcraj-jdp/DownloadJudgement.aspx?caseno={download_btn.get_attribute('data-caseno')}&orderno={download_btn.get_attribute('data-orderno')}&cyear={download_btn.get_attribute('data-cyear')}&ftype=PDF"

    r = requests.get(pdf_url, stream=True)
    with open(pdf_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)



    print(f"Downloaded: {pdf_filename}")


    row_data = [sr_no, case_details, order_date, pdf_filename]
    data_list.append(row_data)


# ------------------- SAVE TO CSV -------------------
columns = ["Sr.No.", "Case Details", "Order/Judgement Date", "PDF_File"]
df_new = pd.DataFrame(data_list, columns=columns)

if not df_prev.empty:
    df_final = pd.concat([df_prev, df_new], ignore_index=True)
else:
    df_final = df_new

df_final.to_csv(CSV_FILE, index=False)
print(f"Data saved to {CSV_FILE}. PDFs are in {DOWNLOAD_DIR}")

driver.quit()


Total records found: 10
Total records found: 10
Downloaded: 1.pdf
Downloaded: 2.pdf
Downloaded: 3.pdf
Downloaded: 4.pdf
Downloaded: 5.pdf
Downloaded: 6.pdf
Downloaded: 7.pdf
Downloaded: 8.pdf
Downloaded: 9.pdf
Downloaded: 10.pdf
Data saved to downloaded_judgments.csv. PDFs are in judgments_pdfs


In [None]:
# Install dependencies first
# pip install selenium pandas requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import os
import time
import pandas as pd
import requests
from datetime import datetime

# ------------------- CONFIG -------------------
BASE_URL = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
DOWNLOAD_DIR = "judgments_pdfs"
CSV_FILE = "downloaded_judgments.csv"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# ------------------- DATES -------------------
from_date_str = "01/09/2025"
to_date_str = "11/09/2025"

# ------------------- CHROME OPTIONS -------------------
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment to run in background
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(
    service=Service(r"C:\Users\shubham\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"),
    options=chrome_options
)
wait = WebDriverWait(driver, 40)

driver.get(BASE_URL)

# ------------------- FILL FORM -------------------
wait.until(EC.presence_of_element_located((By.ID, "partyFromDate"))).send_keys(from_date_str)
wait.until(EC.presence_of_element_located((By.ID, "partyToDate"))).send_keys(to_date_str)
driver.find_element(By.ID, "rpjudgeA").click()  # Reportable = YES

# ------------------- CAPTCHA -------------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "captcha")))
captcha_img.screenshot("captcha.png")
captcha_code = input("Enter captcha code from captcha.png: ")
driver.find_element(By.ID, "txtCaptcha").send_keys(captcha_code)

# ------------------- SUBMIT -------------------
driver.find_element(By.ID, "btncasedetail1_1").click()

# ------------------- WAIT FOR TABLE AND BUTTONS -------------------
try:
    # Wait until at least one download button is present
    wait.until(EC.presence_of_all_elements_located(
        (By.XPATH, "//button[contains(@onclick,'DownloadOrdJud(this,\\'D\\')')]")
    ))

    table = wait.until(EC.presence_of_element_located((By.ID, "sample_1")))
    rows = table.find_elements(By.TAG_NAME, "tr")
    print(f"Total records found: {len(rows) - 1}")
except:
    print("Table or download buttons not found. Check captcha or page load.")
    driver.quit()
    exit()

# ------------------- LOAD PREVIOUSLY DOWNLOADED -------------------
if os.path.exists(CSV_FILE):
    df_prev = pd.read_csv(CSV_FILE)
    downloaded_ids = set(df_prev['Sr.No.'].astype(str).tolist())
else:
    df_prev = pd.DataFrame()
    downloaded_ids = set()

# ------------------- SCRAPE AND DOWNLOAD -------------------
data_list = []

for row in rows[1:]:  # skip header
    cols = row.find_elements(By.TAG_NAME, "td")
    if len(cols) < 4:
        continue

    sr_no = cols[0].text.strip()
    if sr_no in downloaded_ids:
        continue

    case_details = cols[1].text.strip()
    order_date = cols[2].text.strip()

    download_btn = None
    for btn in cols[3].find_elements(By.TAG_NAME, "button"):
        onclick_attr = btn.get_attribute("onclick") or ""
        if "DownloadOrdJud(this,'D')" in onclick_attr and btn.get_attribute("data-ftype") == "PDF":
            download_btn = btn
            break

    if not download_btn:
        print(f"PDF download button not found for {sr_no}")
        continue

    # Build PDF URL
    caseno = download_btn.get_attribute("data-caseno")
    orderno = download_btn.get_attribute("data-orderno")
    cyear = download_btn.get_attribute("data-cyear")
    pdf_url = f"https://hcraj.nic.in/cishcraj-jdp/DownloadJudgement.aspx?caseno={caseno}&orderno={orderno}&cyear={cyear}&ftype=PDF"

    pdf_filename = f"{sr_no}.pdf"
    pdf_path = os.path.join(DOWNLOAD_DIR, pdf_filename)

    # Download the PDF
    r = requests.get(pdf_url, stream=True)
    if r.status_code == 200:
        with open(pdf_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        print(f"Downloaded: {pdf_filename}")
    else:
        print(f"Failed to download {pdf_filename}: HTTP {r.status_code}")
        continue

    row_data = [sr_no, case_details, order_date, pdf_filename]
    data_list.append(row_data)

# ------------------- SAVE TO CSV -------------------
columns = ["Sr.No.", "Case Details", "Order/Judgement Date", "PDF_File"]
df_new = pd.DataFrame(data_list, columns=columns)

if not df_prev.empty:
    df_final = pd.concat([df_prev, df_new], ignore_index=True)
else:
    df_final = df_new

df_final.to_csv(CSV_FILE, index=False)
print(f"Data saved to {CSV_FILE}. PDFs are in {DOWNLOAD_DIR}")

driver.quit()


Table or download buttons not found. Check captcha or page load.


NameError: name 'rows' is not defined

: 

In [4]:
import pandas as pd
df = pd.read_csv("downloaded_judgments.csv")
df.head(20)

Unnamed: 0,Sr.No.,Case Details,Order/Judgement Date,PDF_File
0,1,ARBAP / 4 / 2025\nNEELKANTH FERTILITY AND WOME...,YOGENDRA KUMAR PUROHIT,1.pdf
1,2,CCOMA / 66 / 2011\nRAM LAL Vs. PUNMI DEVI AND ...,YOGENDRA KUMAR PUROHIT,2.pdf
2,3,CFA / 42 / 2017\nVISHVAMITRA AND ORS. Vs. JAYDEV,FARJAND ALI,3.pdf
3,4,CFA / 115 / 2024\nAJMER VIDHYUT VITRAN NIGAM L...,FARJAND ALI,4.pdf
4,5,CFA / 389 / 2024\nLRS OF LATE SHRI BHOLA Vs. S...,FARJAND ALI,5.pdf
5,6,CFA / 717 / 2024\nJYOTI LATA Vs. INDRA,FARJAND ALI,6.pdf
6,7,CFA / 19 / 2025\nMAND KANWAR Vs. STATE OF RAJA...,FARJAND ALI,7.pdf
7,8,CFA / 69 / 2025\nPAPPU RAM Vs. NORATI DEVI,FARJAND ALI,8.pdf
8,9,CFA / 331 / 2025\nMANPREET SINGH Vs. EVERY SPE...,FARJAND ALI,9.pdf
9,10,CFA / 369 / 2025\nRAMPAL Vs. SMT. PREM DEVI,FARJAND ALI,10.pdf


In [None]:
import os
import time
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from webdriver_manager.chrome import ChromeDriverManager

# ---------------- CONFIG ----------------
PDF_DIR = "judgments_pdfs"
CSV_FILE = "judgments_data.csv"
os.makedirs(PDF_DIR, exist_ok=True)
# Get the absolute path for the download directory, which is required by Selenium
PDF_DIR_ABSOLUTE = os.path.abspath(PDF_DIR)
# ---------------------------------------

# --- Chrome options for automated downloading ---
chrome_options = Options()
# chrome_options.add_argument("--headless")  # You can re-enable this after testing
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")

# Preferences to control download behavior
prefs = {
    "download.default_directory": PDF_DIR_ABSOLUTE, # Set your download folder
    "download.prompt_for_download": False, # Disable the "Save As..." dialog
    "plugins.always_open_pdf_externally": True # Download PDF instead of opening in browser
}
chrome_options.add_experimental_option("prefs", prefs)

# Launch Chrome with webdriver-manager (automatic driver)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
print("WebDriver launched successfully.")

# Open Rajasthan HC Judgments page
url = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
driver.get(url)

wait = WebDriverWait(driver, 20)  # wait up to 20 sec for elements

# ---------------- Switch to iframe ----------------
try:
    iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe")))
    driver.switch_to.frame(iframe)
    print("Switched to iframe.")
except Exception as e:
    print(f"Error switching to iframe: {e}")
    driver.quit()
    exit()

# ---------------- Fill the form ----------------
to_date = datetime.today().strftime("%d-%m-%Y")
from_date = (datetime.today() - timedelta(days=10)).strftime("%d-%m-%Y")

from_date_elem = wait.until(EC.presence_of_element_located((By.ID, "txtFromDate")))
to_date_elem = wait.until(EC.presence_of_element_located((By.ID, "txtToDate")))
report_elem = wait.until(EC.presence_of_element_located((By.ID, "ddlReportableJudgment")))

from_date_elem.send_keys(from_date)
to_date_elem.send_keys(to_date)
report_elem.send_keys("YES")
print("Form filled.")

# ---------------- Handle captcha ----------------
captcha_img = wait.until(EC.presence_of_element_located((By.ID, "imgCaptcha")))
captcha_img_path = "captcha.png"
captcha_img.screenshot(captcha_img_path)
Image.open(captcha_img_path).show()
captcha_text = input("Enter Captcha: ")

driver.find_element(By.ID, "txtCaptcha").send_keys(captcha_text)
driver.find_element(By.ID, "btnSubmit").click()
print("Captcha submitted. Waiting for results...")

# ---------------- Scrape table + download PDFs ----------------
try:
    rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tbody tr")))
    data = []
    print(f"Found {len(rows)} judgments.")

    for r in rows:
        cols = r.find_elements(By.TAG_NAME, "td")
        if not cols or len(cols) < 5:  # Check for valid data rows
            continue

        case_no = cols[0].text.strip()
        case_title = cols[1].text.strip()
        judge = cols[2].text.strip()
        date_judg = cols[3].text.strip()
        
        pdf_link_element = cols[-1].find_element(By.TAG_NAME, "a")
        pdf_link_href = pdf_link_element.get_attribute("href")

        # Sanitize case_no to create a valid filename
        safe_case_no = case_no.replace('/', '_').replace('\\', '_')
        pdf_name = f"{safe_case_no}_{date_judg}.pdf"
        pdf_path = os.path.join(PDF_DIR, pdf_name)

        # Check if the renamed file already exists to avoid re-downloading
        if not os.path.exists(pdf_path):
            files_before = set(os.listdir(PDF_DIR))
            
            # Click the link to trigger the download
            pdf_link_element.click()
            
            # Wait for the download. Increase this if downloads are slow.
            time.sleep(5) 
            
            files_after = set(os.listdir(PDF_DIR))
            
            # Find the new file by comparing directory contents
            new_files = files_after - files_before
            if new_files:
                downloaded_filename = new_files.pop()
                downloaded_filepath = os.path.join(PDF_DIR, downloaded_filename)
                
                # Rename the downloaded file to our desired format
                os.rename(downloaded_filepath, pdf_path)
                print(f"Downloaded and renamed: {pdf_name}")
            else:
                print(f"WARNING: Download may have failed for {case_no}")
        else:
            print(f"Skipped (already exists): {pdf_name}")

        data.append({
            "CaseNo": case_no,
            "CaseTitle": case_title,
            "Judge": judge,
            "Date": date_judg,
            "PDF_Link_JS": pdf_link_href,
            "PDF_Name": pdf_name
        })

except Exception as e:
    print(f"An error occurred during scraping: {e}")
    print("This could be due to an incorrect captcha or no results found.")

# ---------------- Save to CSV incrementally ----------------
if data:
    df_new = pd.DataFrame(data)

    if os.path.exists(CSV_FILE):
        df_old = pd.read_csv(CSV_FILE)
        df_final = pd.concat([df_old, df_new]).drop_duplicates(subset=["CaseNo", "Date"], keep='last')
    else:
        df_final = df_new

    df_final.to_csv(CSV_FILE, index=False)
    print("="*30)
    print("Scraping complete!")
    print(f"Data saved to: {CSV_FILE}")
    print("Sample of newly scraped data:")
    print(df_new.head())
    print("="*30)
else:
    print("No new data was scraped.")

driver.quit()

WebDriver launched successfully.
Error switching to iframe: Message: 
Stacktrace:
	GetHandleVerifier [0x0x11bd2a3+66419]
	GetHandleVerifier [0x0x11bd2e4+66484]
	(No symbol) [0x0xf94bd3]
	(No symbol) [0x0xfde958]
	(No symbol) [0x0xfdecfb]
	(No symbol) [0x0x1025152]
	(No symbol) [0x0x1001064]
	(No symbol) [0x0x10228a1]
	(No symbol) [0x0x1000e16]
	(No symbol) [0x0xfd25ce]
	(No symbol) [0x0xfd34a4]
	GetHandleVerifier [0x0x1405ee3+2461619]
	GetHandleVerifier [0x0x1400f66+2441270]
	GetHandleVerifier [0x0x11e6242+234258]
	GetHandleVerifier [0x0x11d6208+168664]
	GetHandleVerifier [0x0x11dd1ad+197245]
	GetHandleVerifier [0x0x11c55f8+100040]
	GetHandleVerifier [0x0x11c5792+100450]
	GetHandleVerifier [0x0x11af74a+10266]
	BaseThreadInitThunk [0x0x76f25d49+25]
	RtlInitializeExceptionChain [0x0x77d4d6db+107]
	RtlGetAppContainerNamedObjectPath [0x0x77d4d661+561]



MaxRetryError: HTTPConnectionPool(host='localhost', port=52631): Max retries exceeded with url: /session/eeef576d82b73a1f5a8d35326da47ef7/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001B3FF6371F0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

: 