# Rajasthan High Court Judgment Scraper
This notebook automates scraping judgments from the Rajasthan HC website, solving captchas with OCR.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time, logging, os, re, requests, pytz
from datetime import datetime, timedelta
import pandas as pd
from PIL import Image
import pytesseract
import base64
from io import BytesIO
import cv2, numpy as np, glob
import bs4

logging.basicConfig(level=logging.INFO)

# Search endpoint and the on-disk location of the scraped results.
BASE_URL = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters"
OUTPUT_DIR = "output"
CSV_FILE = os.path.join(OUTPUT_DIR, "judgments.csv")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Search window: the ten days ending now (Asia/Kolkata), formatted as
# dd/mm/yyyy strings for the date inputs on the search form.
ist = pytz.timezone('Asia/Kolkata')
current_ist_date = datetime.now(tz=ist)
from_date_obj = current_ist_date - timedelta(days=10)
from_date_str, to_date_str = (
    moment.strftime("%d/%m/%Y") for moment in (from_date_obj, current_ist_date)
)

# Existing results
# Load the rows written by a previous run, if any. A missing CSV simply means
# there is no history; an unreadable one (corrupt file, missing 'pdf_name'
# column, ...) is also treated as "no history", but is logged so the problem
# is visible instead of being swallowed silently.
existing_rows = []
existing_pdf_names = set()
if os.path.exists(CSV_FILE):
    try:
        df_existing = pd.read_csv(CSV_FILE)
        existing_pdf_names = set(df_existing['pdf_name'])
        existing_rows = df_existing.to_dict('records')
    except Exception:
        # Deliberately best-effort: never abort the run over a bad history file.
        logging.warning(
            "Could not load existing results from %s; starting with none.",
            CSV_FILE,
            exc_info=True,
        )
        # Reset both so a partial load cannot leave them inconsistent.
        existing_pdf_names = set()
        existing_rows = []

# WebDriver setup
# Start Chrome with a driver binary resolved by webdriver_manager, create the
# shared 20-second explicit wait, and open the judgment search form.
chrome_options = Options()
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
wait = WebDriverWait(driver, 20)
driver.get(BASE_URL)
logging.info("Loaded Rajasthan High Court Judgement Filter page.")

In [None]:
# Input date range & reportable judgment
# Each date box is awaited and cleared, then looked up again to type the value,
# in the order "from" then "to".
for field_id, date_text in (
    ("partyFromDate", from_date_str),
    ("partyToDate", to_date_str),
):
    wait.until(EC.presence_of_element_located((By.ID, field_id))).clear()
    driver.find_element(By.ID, field_id).send_keys(date_text)

# Tick the "rpjudgeY" control on the form.
driver.find_element(By.ID, "rpjudgeY").click()

In [None]:
# --- OCR & Captcha Solving Utilities ---
# Upper bound on how many OCR'd captcha answers are submitted before giving up.
MAX_CAPTCHA_RETRIES = 10

def calibrate_hsv_thresholds(sample_dir="captchas/calibration", debug=False):
    """Return the ``(lower, upper)`` HSV bounds used to mask captcha digits.

    No calibration is performed: the same fixed bounds are returned whether or
    not ``sample_dir`` contains sample images. ``sample_dir`` and ``debug`` are
    accepted only so existing callers keep working.

    Args:
        sample_dir: Directory of calibration samples (currently unused).
        debug: Debug flag (currently unused).

    Returns:
        A pair of fresh ``uint8`` arrays ``([0, 0, 165], [255, 150, 255])``,
        ordered as hue, saturation, value.
    """
    # OpenCV's 8-bit hue channel spans 0-179, so an upper hue of 255 leaves hue
    # unconstrained; the bounds select on saturation <= 150 and value >= 165.
    lower_bound = np.array([0, 0, 165], dtype=np.uint8)
    upper_bound = np.array([255, 150, 255], dtype=np.uint8)
    return lower_bound, upper_bound

def solve_captcha_robust(captcha_image, lower_hsv, upper_hsv):
    """OCR a captcha image and return its six digits, or ``None``.

    The image is converted to HSV, reduced to a binary mask of the pixels that
    fall inside ``[lower_hsv, upper_hsv]``, and passed to Tesseract with a
    digits-only whitelist.

    Args:
        captcha_image: PIL image of the captcha.
        lower_hsv: Lower HSV bound passed to ``cv2.inRange``.
        upper_hsv: Upper HSV bound passed to ``cv2.inRange``.

    Returns:
        The recognised digits as a string when exactly six were found,
        otherwise ``None``.
    """
    rgb_pixels = np.array(captcha_image.convert("RGB"))
    hsv_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2HSV)
    in_range_mask = cv2.inRange(hsv_pixels, lower_hsv, upper_hsv)

    ocr_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789'
    ocr_text = pytesseract.image_to_string(
        Image.fromarray(in_range_mask), config=ocr_config
    )

    # Drop whitespace and any stray non-digit characters from the OCR output.
    digits = ''.join(ch for ch in ocr_text if ch.isdigit())
    if len(digits) != 6:
        return None
    return digits

# Resolve the HSV mask bounds once; the captcha loop passes them to the solver.
lower_hsv, upper_hsv = calibrate_hsv_thresholds()
print("Using HSV thresholds:", lower_hsv, upper_hsv)

In [None]:
# --- Captcha solving loop ---
# Refresh the captcha, OCR it, submit the answer, and repeat until the results
# container appears or one of the retry budgets runs out.
attempts_made = 0  # answers actually submitted to the site
fetches_made = 0   # captcha images pulled, including ones that could not be read
# Unreadable captchas do not count as submitted attempts, so the number of
# fetches is capped separately; otherwise a captcha that never yields a
# six-digit answer would keep this loop spinning forever.
max_captcha_fetches = MAX_CAPTCHA_RETRIES * 5
while attempts_made < MAX_CAPTCHA_RETRIES and fetches_made < max_captcha_fetches:
    fetches_made += 1
    try:
        driver.find_element(By.ID, "change-image").click()
    except Exception:
        # Best effort: carry on with whichever captcha is currently displayed.
        logging.debug("Could not click the captcha refresh control.", exc_info=True)
    time.sleep(2)  # give the refreshed image time to load
    captcha_src = driver.find_element(By.ID, "captcha").get_attribute("src")
    if not captcha_src:
        continue
    try:
        if captcha_src.startswith("data:"):
            # Inline image: the payload follows the first comma of the data URI.
            captcha_base64 = captcha_src.split(",", 1)[1]
            captcha_bytes = base64.b64decode(captcha_base64)
        else:
            # NOTE(review): presumably the captcha is bound to the browser
            # session, so the image is fetched with the browser's cookies;
            # without them the server may serve a captcha for a different
            # session. Confirm against the site's behaviour.
            session_cookies = {c["name"]: c["value"] for c in driver.get_cookies()}
            resp = requests.get(captcha_src, cookies=session_cookies, timeout=10)
            resp.raise_for_status()
            captcha_bytes = resp.content
        captcha_image = Image.open(BytesIO(captcha_bytes))
    except (requests.RequestException, OSError, ValueError, IndexError) as exc:
        # Network failure, HTTP error, malformed data URI or undecodable image:
        # try again with a fresh captcha instead of crashing.
        logging.warning("Could not load captcha image: %s", exc)
        continue
    solved = solve_captcha_robust(captcha_image, lower_hsv, upper_hsv)
    if not solved:
        continue
    attempts_made += 1
    print(f"[Attempt {attempts_made}] Solved captcha: {solved}")
    driver.find_element(By.ID, "txtCaptcha").clear()
    driver.find_element(By.ID, "txtCaptcha").send_keys(solved)
    driver.find_element(By.ID, "btncasedetail1_1").click()
    try:
        wait.until(EC.presence_of_element_located((By.ID, "div_datatable")))
    except Exception:
        # Any failure while waiting (timeout or otherwise) counts as a rejection.
        print("CAPTCHA rejected.")
    else:
        print("CAPTCHA accepted.")
        break
else:
    # Reached only when a budget is exhausted without a successful submission.
    print("Failed to solve CAPTCHA after retries.")
    driver.quit()
    raise SystemExit

In [None]:
# --- Scrape table and save ---
# Snapshot the rendered results page, then release the browser straight away:
# everything below works on the HTML string, and closing here guarantees the
# browser is shut down even if parsing or saving fails.
try:
    results_html = driver.page_source
finally:
    driver.quit()

soup = bs4.BeautifulSoup(results_html, "html.parser")
table = soup.find("div", {"id": "div_datatable"})
table_elem = table.find("table") if table is not None else None
if table_elem is None:
    # Either the results container or the table inside it is missing.
    print("No judgments found.")
    raise SystemExit

headers = [th.get_text(strip=True) for th in table_elem.find_all("th")]
# html.parser does not synthesise a <tbody>; fall back to the table itself.
table_body = table_elem.find("tbody")
if table_body is None:
    table_body = table_elem
rows = table_body.find_all("tr")

output_data = []
for row in rows:
    row_data = [col.get_text(strip=True) for col in row.find_all("td")]
    if not row_data:
        # Header-only or empty rows carry no judgment data.
        continue
    if len(row_data) != len(headers):
        # e.g. a single-cell placeholder row; it cannot be aligned with the
        # header columns and would break the DataFrame construction below.
        logging.warning(
            "Skipping row with %d cells (expected %d): %s",
            len(row_data), len(headers), row_data,
        )
        continue
    # The PDF name is derived from the first column of the row.
    pdf_name = f"judgment_{row_data[0]}.pdf"
    output_data.append(row_data + [pdf_name])

final_headers = headers + ["pdf_name"]
df_out = pd.DataFrame(output_data, columns=final_headers)
# NOTE(review): the rows loaded from the previous CSV at the start of the
# notebook (existing_rows / existing_pdf_names) are not consulted here, so the
# file is overwritten with this run's rows only - confirm whether a merge was
# intended.
df_out.to_csv(CSV_FILE, index=False)
print(f"Saved results to {CSV_FILE}")