Fixed Rajasthan High Court Judgment Scraper - Testing Notebook


This notebook contains the fixed version of the Rajasthan HC scraper with improved error handling and captcha solving.

Setup and Installation


In [None]:
# Install required packages
!pip install selenium webdriver-manager pandas opencv-python pillow pytesseract requests numpy  

In [1]:
# Install required packages
!pip install selenium pandas opencv-python pillow pytesseract requests




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Import the fixed scraper class
import sys
import os
from datetime import datetime, timedelta
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Copy the fixed scraper code here or import from the Python file
exec(open('rajasthan_hc_scraper_fixed.py').read())

In [2]:
# Install Chrome and ChromeDriver for Colab
!apt-get update
!apt-get install -y chromium-browser chromium-chromedriver
!apt-get install -y tesseract-ocr

'apt-get' is not recognized as an internal or external command,
operable program or batch file.
'apt-get' is not recognized as an internal or external command,
operable program or batch file.
'apt-get' is not recognized as an internal or external command,
operable program or batch file.


Quick Test Run


In [None]:
# Initialize the fixed scraper
print("Initializing Fixed Rajasthan HC Scraper...")
scraper = FixedRajasthanHCScraper(download_dir="test_judgments")

# Test with a smaller date range first (last 3 days)
print("\nRunning test scrape for last 3 days...")
test_judgments = scraper.run_incremental_scrape(days_back=3)

print(f"\nTest completed! Found {len(test_judgments)} judgments")

Check Results


In [None]:
# Display results
if os.path.exists("test_judgments/judgments.csv"):
    df = pd.read_csv("test_judgments/judgments.csv")
    print(f"Total judgments in database: {len(df)}")
    print("\nColumns:", list(df.columns))
    
    if len(df) > 0:
        print("\nSample data:")
        display(df.head())
        
        # Check download statistics
        if 'download_status' in df.columns:
            print("\nDownload Status:")
            print(df['download_status'].value_counts())
else:
    print("No CSV file found. Check if scraping was successful.")

In [None]:
Manual Captcha Testing


# If you need to test captcha solving manually
from IPython.display import Image, display

# Display a captcha image if one was saved
if os.path.exists("manual_captcha.png"):
    print("Captcha image:")
    display(Image("manual_captcha.png"))

File Structure Check

In [None]:
# Check downloaded files
import glob

base_dir = "test_judgments"
if os.path.exists(base_dir):
    print(f"Files in {base_dir}:")
    for file in glob.glob(f"{base_dir}/*"):
        if os.path.isfile(file):
            size = os.path.getsize(file)
            print(f"  📄 {os.path.basename(file)} ({size:,} bytes)")
        else:
            print(f"  📁 {os.path.basename(file)}/")
    
    # Check PDFs
    pdf_dir = f"{base_dir}/pdfs"
    if os.path.exists(pdf_dir):
        pdf_files = glob.glob(f"{pdf_dir}/*.pdf")
        print(f"\nPDFs downloaded ({len(pdf_files)} files):")
        for pdf in pdf_files[:10]:  # Show first 10
            size = os.path.getsize(pdf)
            print(f"  📄 {os.path.basename(pdf)} ({size:,} bytes)")
        if len(pdf_files) > 10:
            print(f"  ... and {len(pdf_files) - 10} more PDFs")

Custom Date Range Testing


In [None]:
# Test with a custom date range
from_date = "01/09/2025"  # DD/MM/YYYY
to_date = "12/09/2025"

print(f"Testing custom date range: {from_date} to {to_date}")
custom_judgments = scraper.scrape_judgments(from_date, to_date)

print(f"Found {len(custom_judgments)} judgments in custom range")

if custom_judgments:
    # Save results
    scraper.save_to_csv(custom_judgments)
    scraper.downloaded_judgments["last_run_date"] = datetime.now().isoformat()
    scraper.save_state()
    print("Custom results saved successfully")

Bonus: SCI Captcha Solver Test


In [None]:
# Initialize and test the Supreme Court captcha solver
print("Initializing SCI Captcha Solver...")
sci_solver = SCICaptchaSolver()

# If you have a test captcha image, uncomment and use:
# test_result = sci_solver.solve_sci_captcha("test_captcha.png")
# print(f"SCI Captcha result: {test_result}")

print("SCI Captcha solver ready for use!")

Troubleshooting 

In [None]:
# Check if all dependencies are working
def test_dependencies():
    try:
        import selenium
        print(f"✅ Selenium: {selenium.__version__}")
    except ImportError:
        print("❌ Selenium not found")
    
    try:
        import cv2
        print(f"✅ OpenCV: {cv2.__version__}")
    except ImportError:
        print("❌ OpenCV not found")
    
    try:
        import pytesseract
        print("✅ Tesseract: Available")
    except ImportError:
        print("⚠️ Tesseract not found (manual captcha input required)")
    
    try:
        from webdriver_manager.chrome import ChromeDriverManager
        print("✅ WebDriver Manager: Available")
    except ImportError:
        print("❌ WebDriver Manager not found")

test_dependencies()

Debug Mode


In [None]:
# Run in debug mode (non-headless) to see what's happening
print("Running in debug mode (browser visible)...")

# Create a debug version of the scraper
debug_scraper = FixedRajasthanHCScraper(download_dir="debug_judgments")

# Modify chrome options to be non-headless
debug_scraper.chrome_options = Options()
debug_scraper.chrome_options.add_argument("--no-sandbox")
debug_scraper.chrome_options.add_argument("--disable-dev-shm-usage")
debug_scraper.chrome_options.add_argument("--window-size=1280,720")

# Run a small test
debug_judgments = debug_scraper.run_incremental_scrape(days_back=1)
print(f"Debug run completed: {len(debug_judgments)} judgments")

Error Recovery


In [None]:
# If the scraper gets stuck, you can reset the state
def reset_scraper_state():
    state_file = "test_judgments/scraper_state.json"
    if os.path.exists(state_file):
        os.remove(state_file)
        print("Scraper state reset")
    else:
        print("No state file found")

# Uncomment to reset:
# reset_scraper_state()

In [3]:
# Import the scraper class (copy the main script into a file first)
import sys
import os
from datetime import datetime, timedelta
import pandas as pd

In [4]:
#!/usr/bin/env python3
"""
Rajasthan High Court Judgment Scraper
Incrementally downloads judgments from https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/
"""

import os
import csv
import json
import time
import hashlib
import requests
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
import cv2
import numpy as np
from PIL import Image
import pytesseract
import re

class RajasthanHCJudgmentScraper:
    def __init__(self, download_dir: str = "rajasthan_hc_judgments"):
        self.base_url = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
        self.download_dir = Path(download_dir)
        self.pdf_dir = self.download_dir / "pdfs"
        self.csv_file = self.download_dir / "judgments.csv"
        self.state_file = self.download_dir / "scraper_state.json"
        
        # Create directories
        self.download_dir.mkdir(exist_ok=True)
        self.pdf_dir.mkdir(exist_ok=True)
        
        # Initialize state
        self.downloaded_judgments = self.load_state()
        
        # Setup Chrome options
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")  # Remove for debugging
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.chrome_options.add_experimental_option("prefs", {
            "download.default_directory": str(self.pdf_dir.absolute()),
            "download.prompt_for_download": False,
            "plugins.always_open_pdf_externally": True
        })
        
    def load_state(self) -> Dict:
        """Load previously downloaded judgment IDs and metadata"""
        if self.state_file.exists():
            with open(self.state_file, 'r') as f:
                return json.load(f)
        return {"downloaded_ids": set(), "last_run_date": None}
    
    def save_state(self):
        """Save current state to file"""
        state_to_save = {
            "downloaded_ids": list(self.downloaded_judgments["downloaded_ids"]),
            "last_run_date": self.downloaded_judgments["last_run_date"]
        }
        with open(self.state_file, 'w') as f:
            json.dump(state_to_save, f, indent=2)
    
    def generate_judgment_id(self, judgment_data: Dict) -> str:
        """Generate unique ID for judgment based on key fields"""
        id_string = f"{judgment_data.get('case_number', '')}_{judgment_data.get('judgment_date', '')}_{judgment_data.get('judge_name', '')}"
        return hashlib.md5(id_string.encode()).hexdigest()
    
    def solve_captcha(self, captcha_image_element) -> str:
        """
        Simple captcha solver using OCR
        This is a basic implementation - may need refinement based on captcha complexity
        """
        try:
            # Take screenshot of captcha
            captcha_image_element.screenshot("temp_captcha.png")
            
            # Load and preprocess image
            img = cv2.imread("temp_captcha.png")
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            
            # Apply preprocessing to improve OCR accuracy
            # Remove noise
            denoised = cv2.medianBlur(gray, 3)
            
            # Threshold to get binary image
            _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            
            # OCR with specific configuration for captcha
            custom_config = r'--oem 3 --psm 7 -c tesseract_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            captcha_text = pytesseract.image_to_string(thresh, config=custom_config).strip()
            
            # Clean up
            os.remove("temp_captcha.png")
            
            # Basic validation - captchas are usually 4-6 characters
            if len(captcha_text) >= 3 and captcha_text.isalnum():
                return captcha_text
            else:
                return ""
                
        except Exception as e:
            print(f"Error solving captcha: {e}")
            return ""
    
    def setup_driver(self) -> webdriver.Chrome:
        """Initialize Chrome WebDriver"""
        return webdriver.Chrome(options=self.chrome_options)
    
    def fill_form_and_submit(self, driver: webdriver.Chrome, from_date: str, to_date: str, max_retries: int = 3) -> bool:
        """Fill the judgment search form and submit"""
        try:
            # Wait for page to load
            WebDriverWait(driver, 10).wait(
                EC.presence_of_element_located((By.NAME, "fromDate"))
            )
            
            # Fill from date
            from_date_field = driver.find_element(By.NAME, "fromDate")
            from_date_field.clear()
            from_date_field.send_keys(from_date)
            
            # Fill to date
            to_date_field = driver.find_element(By.NAME, "toDate")
            to_date_field.clear()
            to_date_field.send_keys(to_date)
            
            # Set reportable judgment to YES
            try:
                reportable_dropdown = Select(driver.find_element(By.NAME, "reportable"))
                reportable_dropdown.select_by_value("Y")
            except:
                print("Could not find reportable judgment dropdown")
            
            # Handle captcha with retries
            for attempt in range(max_retries):
                try:
                    captcha_img = driver.find_element(By.XPATH, "//img[contains(@src, 'captcha')]")
                    captcha_text = self.solve_captcha(captcha_img)
                    
                    if captcha_text:
                        captcha_field = driver.find_element(By.NAME, "captcha")
                        captcha_field.clear()
                        captcha_field.send_keys(captcha_text)
                        
                        # Submit form
                        submit_btn = driver.find_element(By.XPATH, "//input[@type='submit' or @value='Search']")
                        submit_btn.click()
                        
                        # Check if submission was successful
                        time.sleep(3)
                        if "No records found" not in driver.page_source and "Invalid captcha" not in driver.page_source.lower():
                            return True
                        else:
                            print(f"Captcha attempt {attempt + 1} failed, retrying...")
                            driver.refresh()
                            time.sleep(2)
                    else:
                        print(f"Could not solve captcha, attempt {attempt + 1}")
                        driver.refresh()
                        time.sleep(2)
                        
                except Exception as e:
                    print(f"Error in captcha attempt {attempt + 1}: {e}")
                    driver.refresh()
                    time.sleep(2)
            
            print("Failed to solve captcha after all attempts")
            return False
            
        except Exception as e:
            print(f"Error filling form: {e}")
            return False
    
    def extract_judgment_data(self, driver: webdriver.Chrome) -> List[Dict]:
        """Extract judgment data from results table"""
        judgments = []
        
        try:
            # Wait for results table
            WebDriverWait(driver, 10).wait(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
            
            # Find the results table
            tables = driver.find_elements(By.TAG_NAME, "table")
            results_table = None
            
            for table in tables:
                if "S.No." in table.text or "Case Number" in table.text:
                    results_table = table
                    break
            
            if not results_table:
                print("Could not find results table")
                return judgments
            
            # Extract table headers
            headers = []
            header_row = results_table.find_element(By.TAG_NAME, "tr")
            for th in header_row.find_elements(By.TAG_NAME, "th"):
                headers.append(th.text.strip())
            
            # Extract data rows
            rows = results_table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip header
            
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) < len(headers):
                    continue
                
                judgment_data = {}
                for i, cell in enumerate(cells):
                    if i < len(headers):
                        judgment_data[headers[i]] = cell.text.strip()
                
                # Look for PDF download link
                pdf_links = row.find_elements(By.XPATH, ".//a[contains(@href, '.pdf') or contains(text(), 'View') or contains(text(), 'Download')]")
                if pdf_links:
                    judgment_data['pdf_url'] = pdf_links[0].get_attribute('href')
                else:
                    judgment_data['pdf_url'] = ""
                
                judgments.append(judgment_data)
            
        except Exception as e:
            print(f"Error extracting judgment data: {e}")
        
        return judgments
    
    def download_pdf(self, url: str, filename: str) -> bool:
        """Download PDF from URL"""
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            
            pdf_path = self.pdf_dir / filename
            with open(pdf_path, 'wb') as f:
                f.write(response.content)
            
            return True
        except Exception as e:
            print(f"Error downloading PDF {filename}: {e}")
            return False
    
    def generate_pdf_filename(self, judgment_data: Dict) -> str:
        """Generate safe filename for PDF"""
        case_num = judgment_data.get('Case Number', 'Unknown').replace('/', '_').replace('\\', '_')
        date = judgment_data.get('Judgment Date', '').replace('/', '_')
        return f"{case_num}_{date}.pdf"
    
    def scrape_judgments(self, from_date: str, to_date: str) -> List[Dict]:
        """Main scraping function"""
        print(f"Scraping judgments from {from_date} to {to_date}")
        
        driver = self.setup_driver()
        all_judgments = []
        
        try:
            driver.get(self.base_url)
            
            if self.fill_form_and_submit(driver, from_date, to_date):
                judgments = self.extract_judgment_data(driver)
                
                for judgment in judgments:
                    judgment_id = self.generate_judgment_id(judgment)
                    
                    # Skip if already downloaded
                    if judgment_id in self.downloaded_judgments["downloaded_ids"]:
                        print(f"Skipping already downloaded judgment: {judgment.get('Case Number', 'Unknown')}")
                        continue
                    
                    # Download PDF if URL exists
                    pdf_filename = ""
                    if judgment.get('pdf_url'):
                        pdf_filename = self.generate_pdf_filename(judgment)
                        if self.download_pdf(judgment['pdf_url'], pdf_filename):
                            judgment['pdf_filename'] = pdf_filename
                            print(f"Downloaded: {pdf_filename}")
                        else:
                            judgment['pdf_filename'] = "Download_Failed"
                    else:
                        judgment['pdf_filename'] = "No_PDF_URL"
                    
                    # Mark as downloaded
                    self.downloaded_judgments["downloaded_ids"].add(judgment_id)
                    all_judgments.append(judgment)
                
        except Exception as e:
            print(f"Error during scraping: {e}")
        
        finally:
            driver.quit()
        
        return all_judgments
    
    def save_to_csv(self, judgments: List[Dict]):
        """Save judgments to CSV file"""
        if not judgments:
            print("No new judgments to save")
            return
        
        # Load existing data if CSV exists
        existing_df = pd.DataFrame()
        if self.csv_file.exists():
            try:
                existing_df = pd.read_csv(self.csv_file)
            except:
                pass
        
        # Create new DataFrame
        new_df = pd.DataFrame(judgments)
        
        # Combine and save
        if not existing_df.empty:
            combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        else:
            combined_df = new_df
        
        combined_df.to_csv(self.csv_file, index=False)
        print(f"Saved {len(judgments)} new judgments to {self.csv_file}")
    
    def run_incremental_scrape(self):
        """Run incremental scraping for last 10 days"""
        today = datetime.now()
        from_date_obj = today - timedelta(days=10)
        
        from_date = from_date_obj.strftime("%d/%m/%Y")
        to_date = today.strftime("%d/%m/%Y")
        
        print(f"Running incremental scrape from {from_date} to {to_date}")
        
        judgments = self.scrape_judgments(from_date, to_date)
        
        if judgments:
            self.save_to_csv(judgments)
        
        # Update state
        self.downloaded_judgments["last_run_date"] = today.isoformat()
        self.save_state()
        
        print(f"Scraping completed. Downloaded {len(judgments)} new judgments.")
        return judgments

# Bonus: Supreme Court Captcha Solver
class SCICaptchaSolver:
    def __init__(self):
        pass
    
    def preprocess_captcha_image(self, image_path: str) -> np.ndarray:
        """Preprocess captcha image for better OCR"""
        # Load image
        img = cv2.imread(image_path)
        if img is None:
            img = np.array(Image.open(image_path))
        
        # Convert to grayscale
        if len(img.shape) == 3:
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img
        
        # Resize image for better OCR
        height, width = gray.shape
        if height < 50:
            scale_factor = 50 / height
            new_width = int(width * scale_factor)
            gray = cv2.resize(gray, (new_width, 50))
        
        # Apply Gaussian blur to reduce noise
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
        
        # Apply threshold
        _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        
        # Morphological operations to clean up
        kernel = np.ones((2, 2), np.uint8)
        cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
        
        return cleaned
    
    def solve_sci_captcha(self, captcha_image_path: str) -> str:
        """Solve Supreme Court captcha"""
        try:
            processed_img = self.preprocess_captcha_image(captcha_image_path)
            
            # OCR configuration for Supreme Court captcha
            custom_config = r'--oem 3 --psm 8 -c tesseract_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            
            # Try OCR
            captcha_text = pytesseract.image_to_string(processed_img, config=custom_config).strip()
            
            # Clean the result
            captcha_text = re.sub(r'[^A-Z0-9]', '', captcha_text.upper())
            
            return captcha_text
            
        except Exception as e:
            print(f"Error solving SCI captcha: {e}")
            return ""

def main():
    """Main function to run the scraper"""
    print("Rajasthan High Court Judgment Scraper")
    print("=" * 50)
    
    # Initialize scraper
    scraper = RajasthanHCJudgmentScraper()
    
    # Run incremental scrape
    try:
        judgments = scraper.run_incremental_scrape()
        print(f"\n✅ Successfully processed {len(judgments)} judgments")
        print(f"📁 Files saved in: {scraper.download_dir}")
        print(f"📄 CSV file: {scraper.csv_file}")
        print(f"📚 PDFs saved in: {scraper.pdf_dir}")
        
    except Exception as e:
        print(f"❌ Error running scraper: {e}")
        
    # Bonus: SCI Captcha solver demo
    print("\n" + "=" * 50)
    print("Supreme Court Captcha Solver (Bonus)")
    print("=" * 50)
    
    sci_solver = SCICaptchaSolver()
    print("SCI Captcha solver initialized. Use sci_solver.solve_sci_captcha('path_to_captcha.png')")

if __name__ == "__main__":
    main()

Rajasthan High Court Judgment Scraper
Running incremental scrape from 01/09/2025 to 11/09/2025
Scraping judgments from 01/09/2025 to 11/09/2025
Error filling form: 'WebDriverWait' object has no attribute 'wait'
Scraping completed. Downloaded 0 new judgments.

✅ Successfully processed 0 judgments
📁 Files saved in: rajasthan_hc_judgments
📄 CSV file: rajasthan_hc_judgments\judgments.csv
📚 PDFs saved in: rajasthan_hc_judgments\pdfs

Supreme Court Captcha Solver (Bonus)
SCI Captcha solver initialized. Use sci_solver.solve_sci_captcha('path_to_captcha.png')


In [5]:
# 1. Initialize the scraper
print("🚀 Initializing Rajasthan HC Scraper...")
scraper = RajasthanHCJudgmentScraper(download_dir="rajasthan_hc_judgments")


🚀 Initializing Rajasthan HC Scraper...


In [6]:
# 2. Run incremental scraping (default: last 10 days)
print("\n📥 Running incremental scrape for last 10 days...")
try:
    judgments = scraper.run_incremental_scrape()
    print(f"✅ Successfully downloaded {len(judgments)} new judgments")
except Exception as e:
    print(f"❌ Error: {e}")


📥 Running incremental scrape for last 10 days...
Running incremental scrape from 01/09/2025 to 11/09/2025
Scraping judgments from 01/09/2025 to 11/09/2025
Error filling form: 'WebDriverWait' object has no attribute 'wait'
Scraping completed. Downloaded 0 new judgments.
✅ Successfully downloaded 0 new judgments


In [7]:
# 3. Display results
if os.path.exists("rajasthan_hc_judgments/judgments.csv"):
    df = pd.read_csv("rajasthan_hc_judgments/judgments.csv")
    print(f"\n📊 Total judgments in database: {len(df)}")
    print("\n📋 Sample data:")
    print(df.head())
    
    # Display statistics
    print("\n📈 Statistics:")
    print(f"• Total judgments: {len(df)}")
    print(f"• PDFs downloaded: {len(df[df['pdf_filename'].str.contains('.pdf', na=False)])}")
    print(f"• Failed downloads: {len(df[df['pdf_filename'] == 'Download_Failed'])}")
    print(f"• No PDF URL: {len(df[df['pdf_filename'] == 'No_PDF_URL'])}")
else:
    print("❌ No CSV file found. Scraping may have failed.")

❌ No CSV file found. Scraping may have failed.


In [8]:
# 4. Custom date range scraping
print("\n🗓️ Custom date range example:")
from_date = "01/09/2024"  # DD/MM/YYYY format
to_date = "11/09/2024"

print(f"Scraping from {from_date} to {to_date}...")
try:
    custom_judgments = scraper.scrape_judgments(from_date, to_date)
    print(f"✅ Found {len(custom_judgments)} judgments in custom range")
    
    if custom_judgments:
        scraper.save_to_csv(custom_judgments)
        scraper.downloaded_judgments["last_run_date"] = datetime.now().isoformat()
        scraper.save_state()
except Exception as e:
    print(f"❌ Custom scraping error: {e}")


🗓️ Custom date range example:
Scraping from 01/09/2024 to 11/09/2024...
Scraping judgments from 01/09/2024 to 11/09/2024
Error filling form: 'WebDriverWait' object has no attribute 'wait'
✅ Found 0 judgments in custom range


In [9]:
# 5. Bonus: SCI Captcha Solver Demo
print("\n🎯 Supreme Court Captcha Solver (Bonus):")
sci_solver = SCICaptchaSolver()

# Example usage (you would need to provide an actual captcha image)
# captcha_result = sci_solver.solve_sci_captcha("captcha_image.png")
# print(f"Captcha solved: {captcha_result}")

print("\n📁 File structure:")
!ls -la rajasthan_hc_judgments/
print("\n📚 PDFs downloaded:")
!ls -la rajasthan_hc_judgments/pdfs/ | head -10


🎯 Supreme Court Captcha Solver (Bonus):

📁 File structure:

📚 PDFs downloaded:


'ls' is not recognized as an internal or external command,
operable program or batch file.
'ls' is not recognized as an internal or external command,
operable program or batch file.


In [10]:
# 6. Data analysis examples
if os.path.exists("rajasthan_hc_judgments/judgments.csv"):
    df = pd.read_csv("rajasthan_hc_judgments/judgments.csv")
    
    print("\n📊 Data Analysis:")
    
    # Check for date column variations
    date_columns = [col for col in df.columns if 'date' in col.lower()]
    print(f"Date columns found: {date_columns}")
    
    # Show unique values in key columns
    for col in df.columns[:5]:  # First 5 columns
        print(f"\n🔍 Column '{col}' - Unique values: {df[col].nunique()}")
        if df[col].nunique() < 10:
            print(f"   Values: {df[col].unique()[:5]}")

print("\n✅ Notebook execution completed!")
print("\n📖 How to use:")
print("1. The scraper runs incrementally - it remembers what it has downloaded")
print("2. Run the scraper daily to get new judgments")
print("3. All data is saved in 'rajasthan_hc_judgments/' folder")
print("4. CSV contains all metadata, PDFs are in 'pdfs/' subfolder")
print("5. State is tracked in 'scraper_state.json'")

# === TROUBLESHOOTING SECTION ===
print("\n🔧 Troubleshooting Tips:")
print("1. If captcha solving fails, the script will retry 3 times")
print("2. For Colab, make sure to install all dependencies")
print("3. Check internet connection if downloads fail")
print("4. Captcha OCR may need fine-tuning based on actual captcha images")
print("5. The script handles various table formats automatically")

# === MANUAL CAPTCHA FALLBACK ===
def manual_run_with_captcha():
    """
    Manual mode where user can input captcha
    Use this if automated captcha solving doesn't work
    """
    print("\n🔧 Manual captcha mode available - modify the scraper to accept manual input")
    print("Replace the solve_captcha method with manual input for testing")
    
    # Example manual captcha input modification:
    manual_code = """
    def solve_captcha_manual(self, captcha_image_element) -> str:
        captcha_image_element.screenshot("captcha_display.png")
        from IPython.display import Image, display
        display(Image("captcha_display.png"))
        return input("Please enter the captcha: ").strip()
    """
    print("Replace solve_captcha method with manual input if needed:")
    print(manual_code)


✅ Notebook execution completed!

📖 How to use:
1. The scraper runs incrementally - it remembers what it has downloaded
2. Run the scraper daily to get new judgments
3. All data is saved in 'rajasthan_hc_judgments/' folder
4. CSV contains all metadata, PDFs are in 'pdfs/' subfolder
5. State is tracked in 'scraper_state.json'

🔧 Troubleshooting Tips:
1. If captcha solving fails, the script will retry 3 times
2. For Colab, make sure to install all dependencies
3. Check internet connection if downloads fail
4. Captcha OCR may need fine-tuning based on actual captcha images
5. The script handles various table formats automatically


In [11]:
# Windows-Compatible Rajasthan High Court Scraper
# Fixed version for Windows environment

# First, install required packages for Windows
import subprocess
import sys
import os

def install_windows_dependencies():
    """Install dependencies for Windows"""
    packages = [
        'selenium>=4.15.0',
        'pandas>=1.5.0',
        'opencv-python>=4.8.0',
        'Pillow>=10.0.0',
        'pytesseract>=0.3.10',
        'requests>=2.31.0',
        'numpy>=1.24.0',
        'webdriver-manager'  # This will auto-manage ChromeDriver
    ]
    
    for package in packages:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

# Uncomment the line below if you need to install dependencies
# install_windows_dependencies()

import os
import csv
import json
import time
import hashlib
import requests
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
import cv2
import numpy as np
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Try to import pytesseract with error handling
try:
    import pytesseract
    # For Windows, you might need to set the tesseract path
    # Uncomment and modify the path below if needed
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    TESSERACT_AVAILABLE = True
except ImportError:
    print("⚠️ Tesseract not available. Captcha solving will be disabled.")
    TESSERACT_AVAILABLE = False

class RajasthanHCJudgmentScraperWindows:
    def __init__(self, download_dir: str = "rajasthan_hc_judgments"):
        self.base_url = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
        self.download_dir = Path(download_dir)
        self.pdf_dir = self.download_dir / "pdfs"
        self.csv_file = self.download_dir / "judgments.csv"
        self.state_file = self.download_dir / "scraper_state.json"
        
        # Create directories
        self.download_dir.mkdir(exist_ok=True)
        self.pdf_dir.mkdir(exist_ok=True)
        
        # Initialize state
        self.downloaded_judgments = self.load_state()
        
        # Setup Chrome options for Windows
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")  # Comment this out to see browser
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--window-size=1920,1080")
        
        # Set download preferences
        prefs = {
            "download.default_directory": str(self.pdf_dir.absolute()),
            "download.prompt_for_download": False,
            "plugins.always_open_pdf_externally": True,
            "profile.default_content_settings.popups": 0
        }
        self.chrome_options.add_experimental_option("prefs", prefs)
        
    def load_state(self) -> Dict:
        """Load previously downloaded judgment IDs and metadata"""
        if self.state_file.exists():
            try:
                with open(self.state_file, 'r') as f:
                    state = json.load(f)
                    # Convert list back to set
                    state["downloaded_ids"] = set(state.get("downloaded_ids", []))
                    return state
            except:
                pass
        return {"downloaded_ids": set(), "last_run_date": None}
    
    def save_state(self):
        """Save current state to file"""
        state_to_save = {
            "downloaded_ids": list(self.downloaded_judgments["downloaded_ids"]),
            "last_run_date": self.downloaded_judgments["last_run_date"]
        }
        with open(self.state_file, 'w') as f:
            json.dump(state_to_save, f, indent=2)
    
    def generate_judgment_id(self, judgment_data: Dict) -> str:
        """Generate unique ID for judgment based on key fields"""
        # Use first few keys from the judgment data
        keys = list(judgment_data.keys())[:3]  # First 3 columns
        id_parts = []
        for key in keys:
            value = str(judgment_data.get(key, ''))
            id_parts.append(value)
        
        id_string = "_".join(id_parts)
        return hashlib.md5(id_string.encode()).hexdigest()
    
    def solve_captcha_manual(self, driver) -> str:
        """Manual captcha input (fallback when OCR fails)"""
        try:
            # Try to find and save captcha image
            captcha_imgs = driver.find_elements(By.XPATH, "//img[contains(@src, 'captcha') or contains(@src, 'Captcha')]")
            if captcha_imgs:
                captcha_imgs[0].screenshot("captcha_temp.png")
                print("🖼️ Captcha image saved as 'captcha_temp.png'")
                print("📖 Please open the image and enter the captcha text.")
            
            captcha_text = input("Enter captcha text (or press Enter to skip): ").strip()
            return captcha_text
        except Exception as e:
            print(f"Error in manual captcha: {e}")
            return ""
    
    def solve_captcha(self, driver) -> str:
        """
        Solve captcha - try OCR first, fallback to manual
        """
        if not TESSERACT_AVAILABLE:
            return self.solve_captcha_manual(driver)
        
        try:
            # Find captcha image
            captcha_imgs = driver.find_elements(By.XPATH, "//img[contains(@src, 'captcha') or contains(@src, 'Captcha')]")
            if not captcha_imgs:
                print("No captcha image found")
                return ""
            
            # Save captcha image
            captcha_imgs[0].screenshot("temp_captcha.png")
            
            # Load and preprocess image
            img = cv2.imread("temp_captcha.png")
            if img is None:
                return self.solve_captcha_manual(driver)
            
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            
            # Apply preprocessing to improve OCR accuracy
            denoised = cv2.medianBlur(gray, 3)
            _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            
            # OCR with specific configuration for captcha
            custom_config = r'--oem 3 --psm 7 -c tesseract_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            captcha_text = pytesseract.image_to_string(thresh, config=custom_config).strip()
            
            # Clean up temp file
            if os.path.exists("temp_captcha.png"):
                os.remove("temp_captcha.png")
            
            # Basic validation
            if len(captcha_text) >= 3 and captcha_text.replace(' ', '').isalnum():
                print(f"🤖 OCR solved captcha: {captcha_text}")
                return captcha_text.replace(' ', '')
            else:
                print("🤖 OCR failed, trying manual input...")
                return self.solve_captcha_manual(driver)
                
        except Exception as e:
            print(f"Error solving captcha: {e}")
            return self.solve_captcha_manual(driver)
    
    def setup_driver(self) -> webdriver.Chrome:
        """Initialize Chrome WebDriver for Windows"""
        try:
            # Use ChromeDriverManager to automatically handle driver
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=self.chrome_options)
            return driver
        except Exception as e:
            print(f"Error setting up Chrome driver: {e}")
            print("Please ensure Chrome browser is installed")
            raise
    
    def fill_form_and_submit(self, driver: webdriver.Chrome, from_date: str, to_date: str, max_retries: int = 3) -> bool:
        """Fill the judgment search form and submit"""
        try:
            # Wait for page to load - FIXED: Use until() instead of wait()
            wait = WebDriverWait(driver, 10)
            wait.until(EC.presence_of_element_located((By.NAME, "fromDate")))
            
            # Fill from date
            from_date_field = driver.find_element(By.NAME, "fromDate")
            from_date_field.clear()
            from_date_field.send_keys(from_date)
            
            # Fill to date  
            to_date_field = driver.find_element(By.NAME, "toDate")
            to_date_field.clear()
            to_date_field.send_keys(to_date)
            
            # Set reportable judgment to YES
            try:
                reportable_dropdown = Select(driver.find_element(By.NAME, "reportable"))
                reportable_dropdown.select_by_value("Y")
            except Exception as e:
                print(f"Could not find reportable judgment dropdown: {e}")
            
            # Handle captcha with retries
            for attempt in range(max_retries):
                try:
                    print(f"🔍 Captcha attempt {attempt + 1}/{max_retries}")
                    
                    captcha_text = self.solve_captcha(driver)
                    
                    if captcha_text:
                        # Find captcha input field
                        captcha_fields = driver.find_elements(By.XPATH, "//input[contains(@name, 'captcha') or contains(@name, 'Captcha')]")
                        if not captcha_fields:
                            captcha_fields = driver.find_elements(By.XPATH, "//input[@type='text'][last()]")
                        
                        if captcha_fields:
                            captcha_field = captcha_fields[0]
                            captcha_field.clear()
                            captcha_field.send_keys(captcha_text)
                            
                            # Submit form
                            submit_btns = driver.find_elements(By.XPATH, "//input[@type='submit' or @value='Search' or @value='Submit']")
                            if submit_btns:
                                submit_btns[0].click()
                                
                                # Wait and check if submission was successful
                                time.sleep(3)
                                page_source = driver.page_source.lower()
                                
                                if "no records found" not in page_source and "invalid captcha" not in page_source:
                                    print("✅ Form submitted successfully!")
                                    return True
                                else:
                                    print(f"❌ Captcha attempt {attempt + 1} failed, retrying...")
                            else:
                                print("Could not find submit button")
                        else:
                            print("Could not find captcha input field")
                    else:
                        print(f"❌ Could not solve captcha, attempt {attempt + 1}")
                    
                    # Refresh page for retry
                    if attempt < max_retries - 1:
                        driver.refresh()
                        time.sleep(2)
                        wait.until(EC.presence_of_element_located((By.NAME, "fromDate")))
                        
                        # Refill the form
                        from_date_field = driver.find_element(By.NAME, "fromDate")
                        from_date_field.clear()
                        from_date_field.send_keys(from_date)
                        
                        to_date_field = driver.find_element(By.NAME, "toDate")
                        to_date_field.clear()
                        to_date_field.send_keys(to_date)
                        
                        try:
                            reportable_dropdown = Select(driver.find_element(By.NAME, "reportable"))
                            reportable_dropdown.select_by_value("Y")
                        except:
                            pass
                        
                except Exception as e:
                    print(f"Error in captcha attempt {attempt + 1}: {e}")
                    if attempt < max_retries - 1:
                        driver.refresh()
                        time.sleep(2)
            
            print("❌ Failed to solve captcha after all attempts")
            return False
            
        except Exception as e:
            print(f"Error filling form: {e}")
            return False
    
    def extract_judgment_data(self, driver: webdriver.Chrome) -> List[Dict]:
        """Extract judgment data from results table"""
        judgments = []
        
        try:
            # Wait for results table
            wait = WebDriverWait(driver, 10)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
            
            # Find the results table
            tables = driver.find_elements(By.TAG_NAME, "table")
            results_table = None
            
            for table in tables:
                table_text = table.text.lower()
                if any(keyword in table_text for keyword in ["s.no", "case", "judgment", "date"]):
                    results_table = table
                    break
            
            if not results_table:
                print("Could not find results table")
                return judgments
            
            # Extract table headers
            headers = []
            header_rows = results_table.find_elements(By.TAG_NAME, "tr")
            if header_rows:
                header_cells = header_rows[0].find_elements(By.TAG_NAME, "th")
                if not header_cells:  # Try td if th not found
                    header_cells = header_rows[0].find_elements(By.TAG_NAME, "td")
                
                for cell in header_cells:
                    headers.append(cell.text.strip())
            
            if not headers:
                headers = [f"Column_{i+1}" for i in range(10)]  # Default headers
            
            print(f"📊 Found table with headers: {headers}")
            
            # Extract data rows
            rows = results_table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip header
            
            for i, row in enumerate(rows):
                try:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    if len(cells) == 0:
                        continue
                    
                    judgment_data = {}
                    for j, cell in enumerate(cells):
                        header_name = headers[j] if j < len(headers) else f"Column_{j+1}"
                        judgment_data[header_name] = cell.text.strip()
                    
                    # Look for PDF download link
                    pdf_links = row.find_elements(By.XPATH, ".//a[contains(@href, '.pdf') or contains(text(), 'View') or contains(text(), 'Download')]")
                    if pdf_links:
                        judgment_data['pdf_url'] = pdf_links[0].get_attribute('href')
                    else:
                        judgment_data['pdf_url'] = ""
                    
                    if judgment_data:  # Only add if we got some data
                        judgments.append(judgment_data)
                        
                except Exception as e:
                    print(f"Error processing row {i}: {e}")
                    continue
            
            print(f"📋 Extracted {len(judgments)} judgment records")
            
        except Exception as e:
            print(f"Error extracting judgment data: {e}")
        
        return judgments
    
    def download_pdf(self, url: str, filename: str) -> bool:
        """Download PDF from URL"""
        try:
            print(f"📥 Downloading: {filename}")
            response = requests.get(url, timeout=30, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            response.raise_for_status()
            
            pdf_path = self.pdf_dir / filename
            with open(pdf_path, 'wb') as f:
                f.write(response.content)
            
            return True
        except Exception as e:
            print(f"❌ Error downloading PDF {filename}: {e}")
            return False
    
    def generate_pdf_filename(self, judgment_data: Dict) -> str:
        """Generate safe filename for PDF"""
        # Try to find case number in any column
        case_num = "Unknown"
        date_str = "Unknown"
        
        for key, value in judgment_data.items():
            if "case" in key.lower() and value:
                case_num = str(value).replace('/', '_').replace('\\', '_')[:50]
            elif "date" in key.lower() and value:
                date_str = str(value).replace('/', '_').replace('-', '_')[:20]
        
        # Create safe filename
        safe_filename = f"{case_num}_{date_str}.pdf"
        safe_filename = "".join(c for c in safe_filename if c.isalnum() or c in "._-")
        
        return safe_filename
    
    def scrape_judgments(self, from_date: str, to_date: str) -> List[Dict]:
        """Main scraping function"""
        print(f"🔍 Scraping judgments from {from_date} to {to_date}")
        
        try:
            driver = self.setup_driver()
        except Exception as e:
            print(f"❌ Failed to setup driver: {e}")
            return []
        
        all_judgments = []
        
        try:
            print("🌐 Loading website...")
            driver.get(self.base_url)
            
            if self.fill_form_and_submit(driver, from_date, to_date):
                print("📊 Extracting judgment data...")
                judgments = self.extract_judgment_data(driver)
                
                for judgment in judgments:
                    judgment_id = self.generate_judgment_id(judgment)
                    
                    # Skip if already downloaded
                    if judgment_id in self.downloaded_judgments["downloaded_ids"]:
                        case_ref = list(judgment.values())[0] if judgment else "Unknown"
                        print(f"⏭️ Skipping already downloaded: {case_ref}")
                        continue
                    
                    # Download PDF if URL exists
                    pdf_filename = ""
                    if judgment.get('pdf_url'):
                        pdf_filename = self.generate_pdf_filename(judgment)
                        if self.download_pdf(judgment['pdf_url'], pdf_filename):
                            judgment['pdf_filename'] = pdf_filename
                            print(f"✅ Downloaded: {pdf_filename}")
                        else:
                            judgment['pdf_filename'] = "Download_Failed"
                    else:
                        judgment['pdf_filename'] = "No_PDF_URL"
                    
                    # Mark as downloaded
                    self.downloaded_judgments["downloaded_ids"].add(judgment_id)
                    all_judgments.append(judgment)
                
            else:
                print("❌ Failed to submit form")
        
        except Exception as e:
            print(f"❌ Error during scraping: {e}")
        
        finally:
            try:
                driver.quit()
            except:
                pass
        
        return all_judgments
    
    def save_to_csv(self, judgments: List[Dict]):
        """Save judgments to CSV file"""
        if not judgments:
            print("ℹ️ No new judgments to save")
            return
        
        # Load existing data if CSV exists
        existing_df = pd.DataFrame()
        if self.csv_file.exists():
            try:
                existing_df = pd.read_csv(self.csv_file)
                print(f"📂 Loaded {len(existing_df)} existing records")
            except Exception as e:
                print(f"Warning: Could not load existing CSV: {e}")
        
        # Create new DataFrame
        new_df = pd.DataFrame(judgments)
        
        # Combine and save
        if not existing_df.empty:
            combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        else:
            combined_df = new_df
        
        combined_df.to_csv(self.csv_file, index=False)
        print(f"💾 Saved {len(judgments)} new judgments to {self.csv_file}")
    
    def run_incremental_scrape(self):
        """Run incremental scraping for last 10 days"""
        today = datetime.now()
        from_date_obj = today - timedelta(days=10)
        
        from_date = from_date_obj.strftime("%d/%m/%Y")
        to_date = today.strftime("%d/%m/%Y")
        
        print(f"📅 Running incremental scrape from {from_date} to {to_date}")
        
        judgments = self.scrape_judgments(from_date, to_date)
        
        if judgments:
            self.save_to_csv(judgments)
        
        # Update state
        self.downloaded_judgments["last_run_date"] = today.isoformat()
        self.save_state()
        
        print(f"✅ Scraping completed. Downloaded {len(judgments)} new judgments.")
        return judgments

# Windows-specific helper functions
def show_file_structure():
    """Show file structure using Windows commands"""
    import glob
    
    base_dir = "rajasthan_hc_judgments"
    if os.path.exists(base_dir):
        print(f"\n📁 Files in {base_dir}:")
        for file in glob.glob(f"{base_dir}/*"):
            size = os.path.getsize(file) if os.path.isfile(file) else 0
            file_type = "📄" if os.path.isfile(file) else "📁"
            print(f"  {file_type} {os.path.basename(file)} ({size} bytes)")
        
        pdf_dir = f"{base_dir}/pdfs"
        if os.path.exists(pdf_dir):
            pdf_files = glob.glob(f"{pdf_dir}/*.pdf")
            print(f"\n📚 PDFs downloaded ({len(pdf_files)} files):")
            for pdf in pdf_files[:5]:  # Show first 5
                size = os.path.getsize(pdf)
                print(f"  📄 {os.path.basename(pdf)} ({size} bytes)")
            if len(pdf_files) > 5:
                print(f"  ... and {len(pdf_files) - 5} more PDFs")

# Main execution
def main():
    """Main function for Windows"""
    print("🏛️ Rajasthan High Court Judgment Scraper (Windows Version)")
    print("=" * 60)
    
    # Check if tesseract is available
    if not TESSERACT_AVAILABLE:
        print("⚠️ Note: Tesseract OCR not found. You'll need to enter captchas manually.")
        print("📥 To install Tesseract: https://github.com/UB-Mannheim/tesseract/wiki")
    
    try:
        # Initialize scraper
        print("🚀 Initializing scraper...")
        scraper = RajasthanHCJudgmentScraperWindows()
        
        # Run incremental scrape
        print("📥 Running incremental scrape for last 10 days...")
        judgments = scraper.run_incremental_scrape()
        
        if judgments:
            print(f"✅ Successfully processed {len(judgments)} judgments")
            
            # Show results
            if scraper.csv_file.exists():
                df = pd.read_csv(scraper.csv_file)
                print(f"\n📊 Total judgments in database: {len(df)}")
                print("\n📋 Column names:", list(df.columns))
                if len(df) > 0:
                    print("\n🔍 Sample data:")
                    print(df.head(2).to_string())
        else:
            print("ℹ️ No new judgments found")
        
        # Show file structure
        show_file_structure()
        
    except Exception as e:
        print(f"❌ Error: {e}")
        print("\n🔧 Troubleshooting tips:")
        print("1. Make sure Chrome browser is installed")
        print("2. Check your internet connection")
        print("3. The website might be temporarily unavailable")
        print("4. Try running with manual captcha input")

if __name__ == "__main__":
    main()

🏛️ Rajasthan High Court Judgment Scraper (Windows Version)
🚀 Initializing scraper...
📥 Running incremental scrape for last 10 days...
📅 Running incremental scrape from 01/09/2025 to 11/09/2025
🔍 Scraping judgments from 01/09/2025 to 11/09/2025
🌐 Loading website...
Error filling form: Message: 
Stacktrace:
	GetHandleVerifier [0x0xa2d2a3+66419]
	GetHandleVerifier [0x0xa2d2e4+66484]
	(No symbol) [0x0x804bd3]
	(No symbol) [0x0x84e958]
	(No symbol) [0x0x84ecfb]
	(No symbol) [0x0x895152]
	(No symbol) [0x0x871064]
	(No symbol) [0x0x8928a1]
	(No symbol) [0x0x870e16]
	(No symbol) [0x0x8425ce]
	(No symbol) [0x0x8434a4]
	GetHandleVerifier [0x0xc75ee3+2461619]
	GetHandleVerifier [0x0xc70f66+2441270]
	GetHandleVerifier [0x0xa56242+234258]
	GetHandleVerifier [0x0xa46208+168664]
	GetHandleVerifier [0x0xa4d1ad+197245]
	GetHandleVerifier [0x0xa355f8+100040]
	GetHandleVerifier [0x0xa35792+100450]
	GetHandleVerifier [0x0xa1f74a+10266]
	BaseThreadInitThunk [0x0x778bfcc9+25]
	RtlGetAppContainerNamedObjectP