In [None]:
# Install required packages
!pip install selenium pandas opencv-python pillow pytesseract requests

In [None]:
# Install Chrome and ChromeDriver for Colab
# NOTE(review): on some Ubuntu images `chromium-browser` is only a transitional
# snap package and may not provide a usable browser - confirm on the target runtime.
!apt-get update
!apt-get install -y chromium-browser chromium-chromedriver
# The tesseract-ocr binary is what the pytesseract calls in the scraper invoke.
!apt-get install -y tesseract-ocr

In [None]:
# Shared imports for the usage cells below (the scraper class itself is defined inline in the next cell)
import sys
import os
from datetime import datetime, timedelta
import pandas as pd

In [None]:
#!/usr/bin/env python3
"""
Rajasthan High Court Judgment Scraper
Incrementally downloads judgments from https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/
"""

import os
import csv
import json
import time
import hashlib
import requests
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
import cv2
import numpy as np
from PIL import Image
import pytesseract
import re

class RajasthanHCJudgmentScraper:
    def __init__(self, download_dir: str = "rajasthan_hc_judgments"):
        self.base_url = "https://hcraj.nic.in/cishcraj-jdp/JudgementFilters/"
        self.download_dir = Path(download_dir)
        self.pdf_dir = self.download_dir / "pdfs"
        self.csv_file = self.download_dir / "judgments.csv"
        self.state_file = self.download_dir / "scraper_state.json"
        
        # Create directories
        self.download_dir.mkdir(exist_ok=True)
        self.pdf_dir.mkdir(exist_ok=True)
        
        # Initialize state
        self.downloaded_judgments = self.load_state()
        
        # Setup Chrome options
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")  # Remove for debugging
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.chrome_options.add_experimental_option("prefs", {
            "download.default_directory": str(self.pdf_dir.absolute()),
            "download.prompt_for_download": False,
            "plugins.always_open_pdf_externally": True
        })
        
    def load_state(self) -> Dict:
        """Load previously downloaded judgment IDs and metadata"""
        if self.state_file.exists():
            with open(self.state_file, 'r') as f:
                return json.load(f)
        return {"downloaded_ids": set(), "last_run_date": None}
    
    def save_state(self):
        """Save current state to file"""
        state_to_save = {
            "downloaded_ids": list(self.downloaded_judgments["downloaded_ids"]),
            "last_run_date": self.downloaded_judgments["last_run_date"]
        }
        with open(self.state_file, 'w') as f:
            json.dump(state_to_save, f, indent=2)
    
    def generate_judgment_id(self, judgment_data: Dict) -> str:
        """Generate unique ID for judgment based on key fields"""
        id_string = f"{judgment_data.get('case_number', '')}_{judgment_data.get('judgment_date', '')}_{judgment_data.get('judge_name', '')}"
        return hashlib.md5(id_string.encode()).hexdigest()
    
    def solve_captcha(self, captcha_image_element) -> str:
        """
        Simple captcha solver using OCR
        This is a basic implementation - may need refinement based on captcha complexity
        """
        try:
            # Take screenshot of captcha
            captcha_image_element.screenshot("temp_captcha.png")
            
            # Load and preprocess image
            img = cv2.imread("temp_captcha.png")
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            
            # Apply preprocessing to improve OCR accuracy
            # Remove noise
            denoised = cv2.medianBlur(gray, 3)
            
            # Threshold to get binary image
            _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            
            # OCR with specific configuration for captcha
            custom_config = r'--oem 3 --psm 7 -c tesseract_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            captcha_text = pytesseract.image_to_string(thresh, config=custom_config).strip()
            
            # Clean up
            os.remove("temp_captcha.png")
            
            # Basic validation - captchas are usually 4-6 characters
            if len(captcha_text) >= 3 and captcha_text.isalnum():
                return captcha_text
            else:
                return ""
                
        except Exception as e:
            print(f"Error solving captcha: {e}")
            return ""
    
    def setup_driver(self) -> webdriver.Chrome:
        """Initialize Chrome WebDriver"""
        return webdriver.Chrome(options=self.chrome_options)
    
    def fill_form_and_submit(self, driver: webdriver.Chrome, from_date: str, to_date: str, max_retries: int = 3) -> bool:
        """Fill the judgment search form and submit"""
        try:
            # Wait for page to load
            WebDriverWait(driver, 10).wait(
                EC.presence_of_element_located((By.NAME, "fromDate"))
            )
            
            # Fill from date
            from_date_field = driver.find_element(By.NAME, "fromDate")
            from_date_field.clear()
            from_date_field.send_keys(from_date)
            
            # Fill to date
            to_date_field = driver.find_element(By.NAME, "toDate")
            to_date_field.clear()
            to_date_field.send_keys(to_date)
            
            # Set reportable judgment to YES
            try:
                reportable_dropdown = Select(driver.find_element(By.NAME, "reportable"))
                reportable_dropdown.select_by_value("Y")
            except:
                print("Could not find reportable judgment dropdown")
            
            # Handle captcha with retries
            for attempt in range(max_retries):
                try:
                    captcha_img = driver.find_element(By.XPATH, "//img[contains(@src, 'captcha')]")
                    captcha_text = self.solve_captcha(captcha_img)
                    
                    if captcha_text:
                        captcha_field = driver.find_element(By.NAME, "captcha")
                        captcha_field.clear()
                        captcha_field.send_keys(captcha_text)
                        
                        # Submit form
                        submit_btn = driver.find_element(By.XPATH, "//input[@type='submit' or @value='Search']")
                        submit_btn.click()
                        
                        # Check if submission was successful
                        time.sleep(3)
                        if "No records found" not in driver.page_source and "Invalid captcha" not in driver.page_source.lower():
                            return True
                        else:
                            print(f"Captcha attempt {attempt + 1} failed, retrying...")
                            driver.refresh()
                            time.sleep(2)
                    else:
                        print(f"Could not solve captcha, attempt {attempt + 1}")
                        driver.refresh()
                        time.sleep(2)
                        
                except Exception as e:
                    print(f"Error in captcha attempt {attempt + 1}: {e}")
                    driver.refresh()
                    time.sleep(2)
            
            print("Failed to solve captcha after all attempts")
            return False
            
        except Exception as e:
            print(f"Error filling form: {e}")
            return False
    
    def extract_judgment_data(self, driver: webdriver.Chrome) -> List[Dict]:
        """Extract judgment data from results table"""
        judgments = []
        
        try:
            # Wait for results table
            WebDriverWait(driver, 10).wait(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
            
            # Find the results table
            tables = driver.find_elements(By.TAG_NAME, "table")
            results_table = None
            
            for table in tables:
                if "S.No." in table.text or "Case Number" in table.text:
                    results_table = table
                    break
            
            if not results_table:
                print("Could not find results table")
                return judgments
            
            # Extract table headers
            headers = []
            header_row = results_table.find_element(By.TAG_NAME, "tr")
            for th in header_row.find_elements(By.TAG_NAME, "th"):
                headers.append(th.text.strip())
            
            # Extract data rows
            rows = results_table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip header
            
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) < len(headers):
                    continue
                
                judgment_data = {}
                for i, cell in enumerate(cells):
                    if i < len(headers):
                        judgment_data[headers[i]] = cell.text.strip()
                
                # Look for PDF download link
                pdf_links = row.find_elements(By.XPATH, ".//a[contains(@href, '.pdf') or contains(text(), 'View') or contains(text(), 'Download')]")
                if pdf_links:
                    judgment_data['pdf_url'] = pdf_links[0].get_attribute('href')
                else:
                    judgment_data['pdf_url'] = ""
                
                judgments.append(judgment_data)
            
        except Exception as e:
            print(f"Error extracting judgment data: {e}")
        
        return judgments
    
    def download_pdf(self, url: str, filename: str) -> bool:
        """Download PDF from URL"""
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            
            pdf_path = self.pdf_dir / filename
            with open(pdf_path, 'wb') as f:
                f.write(response.content)
            
            return True
        except Exception as e:
            print(f"Error downloading PDF {filename}: {e}")
            return False
    
    def generate_pdf_filename(self, judgment_data: Dict) -> str:
        """Generate safe filename for PDF"""
        case_num = judgment_data.get('Case Number', 'Unknown').replace('/', '_').replace('\\', '_')
        date = judgment_data.get('Judgment Date', '').replace('/', '_')
        return f"{case_num}_{date}.pdf"
    
    def scrape_judgments(self, from_date: str, to_date: str) -> List[Dict]:
        """Main scraping function"""
        print(f"Scraping judgments from {from_date} to {to_date}")
        
        driver = self.setup_driver()
        all_judgments = []
        
        try:
            driver.get(self.base_url)
            
            if self.fill_form_and_submit(driver, from_date, to_date):
                judgments = self.extract_judgment_data(driver)
                
                for judgment in judgments:
                    judgment_id = self.generate_judgment_id(judgment)
                    
                    # Skip if already downloaded
                    if judgment_id in self.downloaded_judgments["downloaded_ids"]:
                        print(f"Skipping already downloaded judgment: {judgment.get('Case Number', 'Unknown')}")
                        continue
                    
                    # Download PDF if URL exists
                    pdf_filename = ""
                    if judgment.get('pdf_url'):
                        pdf_filename = self.generate_pdf_filename(judgment)
                        if self.download_pdf(judgment['pdf_url'], pdf_filename):
                            judgment['pdf_filename'] = pdf_filename
                            print(f"Downloaded: {pdf_filename}")
                        else:
                            judgment['pdf_filename'] = "Download_Failed"
                    else:
                        judgment['pdf_filename'] = "No_PDF_URL"
                    
                    # Mark as downloaded
                    self.downloaded_judgments["downloaded_ids"].add(judgment_id)
                    all_judgments.append(judgment)
                
        except Exception as e:
            print(f"Error during scraping: {e}")
        
        finally:
            driver.quit()
        
        return all_judgments
    
    def save_to_csv(self, judgments: List[Dict]):
        """Save judgments to CSV file"""
        if not judgments:
            print("No new judgments to save")
            return
        
        # Load existing data if CSV exists
        existing_df = pd.DataFrame()
        if self.csv_file.exists():
            try:
                existing_df = pd.read_csv(self.csv_file)
            except:
                pass
        
        # Create new DataFrame
        new_df = pd.DataFrame(judgments)
        
        # Combine and save
        if not existing_df.empty:
            combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        else:
            combined_df = new_df
        
        combined_df.to_csv(self.csv_file, index=False)
        print(f"Saved {len(judgments)} new judgments to {self.csv_file}")
    
    def run_incremental_scrape(self):
        """Run incremental scraping for last 10 days"""
        today = datetime.now()
        from_date_obj = today - timedelta(days=10)
        
        from_date = from_date_obj.strftime("%d/%m/%Y")
        to_date = today.strftime("%d/%m/%Y")
        
        print(f"Running incremental scrape from {from_date} to {to_date}")
        
        judgments = self.scrape_judgments(from_date, to_date)
        
        if judgments:
            self.save_to_csv(judgments)
        
        # Update state
        self.downloaded_judgments["last_run_date"] = today.isoformat()
        self.save_state()
        
        print(f"Scraping completed. Downloaded {len(judgments)} new judgments.")
        return judgments

# Bonus: Supreme Court Captcha Solver
class SCICaptchaSolver:
    """OCR-based solver for Supreme Court captcha images stored on disk."""

    def __init__(self):
        """The solver keeps no state."""
        pass

    def preprocess_captcha_image(self, image_path: str) -> np.ndarray:
        """Turn a captcha image file into a cleaned binary image for OCR.

        Steps: load, convert to grayscale, upscale to a height of 50 px when
        smaller, Gaussian blur, Otsu threshold, morphological closing.

        Args:
            image_path: Path of the captcha image.

        Returns:
            The processed single-channel image as a NumPy array.
        """
        img = cv2.imread(image_path)
        if img is None:
            # OpenCV could not decode the file; let Pillow try. Convert to
            # 8-bit grayscale there: Pillow yields RGB/RGBA/palette data, which
            # must not go through the BGR conversion used for OpenCV images.
            with Image.open(image_path) as pil_img:
                gray = np.array(pil_img.convert("L"))
        elif len(img.shape) == 3:
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img

        # Resize image for better OCR
        height, width = gray.shape
        if height < 50:
            scale_factor = 50 / height
            new_width = int(width * scale_factor)
            # cv2.resize takes the target size as (width, height)
            gray = cv2.resize(gray, (new_width, 50))

        # Apply Gaussian blur to reduce noise
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)

        # Otsu picks the binarisation threshold automatically
        _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Morphological closing with a small kernel to clean up
        kernel = np.ones((2, 2), np.uint8)
        cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

        return cleaned

    def solve_sci_captcha(self, captcha_image_path: str) -> str:
        """Solve a Supreme Court captcha image.

        Args:
            captcha_image_path: Path of the captcha image.

        Returns:
            The recognised text reduced to ``A-Z0-9`` (upper-cased), or an
            empty string when processing fails (the error is printed).
        """
        try:
            processed_img = self.preprocess_captcha_image(captcha_image_path)

            # The Tesseract variable is "tessedit_char_whitelist"; the previous
            # "tesseract_char_whitelist" spelling is not a recognised parameter.
            custom_config = r'--oem 3 --psm 8 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

            captcha_text = pytesseract.image_to_string(processed_img, config=custom_config).strip()

            # Keep only the characters a captcha can contain
            captcha_text = re.sub(r'[^A-Z0-9]', '', captcha_text.upper())

            return captcha_text

        except Exception as e:
            print(f"Error solving SCI captcha: {e}")
            return ""

def main():
    """Script entry point: run one incremental scrape, then announce the bonus solver."""
    divider = "=" * 50

    print("Rajasthan High Court Judgment Scraper")
    print(divider)

    # Build the scraper with its default output directory
    scraper = RajasthanHCJudgmentScraper()

    # Scrape and report; any failure is printed rather than propagated
    try:
        judgments = scraper.run_incremental_scrape()
        summary_lines = (
            f"\n✅ Successfully processed {len(judgments)} judgments",
            f"📁 Files saved in: {scraper.download_dir}",
            f"📄 CSV file: {scraper.csv_file}",
            f"📚 PDFs saved in: {scraper.pdf_dir}",
        )
        for line in summary_lines:
            print(line)

    except Exception as e:
        print(f"❌ Error running scraper: {e}")

    # Bonus: SCI Captcha solver demo
    for banner_line in ("\n" + divider, "Supreme Court Captcha Solver (Bonus)", divider):
        print(banner_line)

    sci_solver = SCICaptchaSolver()
    print("SCI Captcha solver initialized. Use sci_solver.solve_sci_captcha('path_to_captcha.png')")

if __name__ == "__main__":
    main()

In [None]:
# 1. Initialize the scraper
# Constructing the scraper creates the output directories (if missing) and
# loads any state saved by an earlier run from scraper_state.json.
print("🚀 Initializing Rajasthan HC Scraper...")
scraper = RajasthanHCJudgmentScraper(download_dir="rajasthan_hc_judgments")


In [None]:
# 2. Run incremental scraping (default: last 10 days)
# Requires the `scraper` object created in the previous cell. Any exception is
# printed so that the remaining cells can still be run.
print("\n📥 Running incremental scrape for last 10 days...")
try:
    judgments = scraper.run_incremental_scrape()
    print(f"✅ Successfully downloaded {len(judgments)} new judgments")
except Exception as e:
    print(f"❌ Error: {e}")

In [None]:
# 3. Display results
csv_path = "rajasthan_hc_judgments/judgments.csv"
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print(f"\n📊 Total judgments in database: {len(df)}")
    print("\n📋 Sample data:")
    print(df.head())

    # Display statistics
    print("\n📈 Statistics:")
    print(f"• Total judgments: {len(df)}")
    if 'pdf_filename' in df.columns:
        # Normalise to plain strings so missing values (or an all-NaN column)
        # cannot break the .str accessor.
        pdf_status = df['pdf_filename'].fillna('').astype(str)
        # endswith() is a literal suffix test; str.contains('.pdf') treated the
        # dot as a regex wildcard.
        print(f"• PDFs downloaded: {int(pdf_status.str.endswith('.pdf').sum())}")
        print(f"• Failed downloads: {int((pdf_status == 'Download_Failed').sum())}")
        print(f"• No PDF URL: {int((pdf_status == 'No_PDF_URL').sum())}")
    else:
        print("• No 'pdf_filename' column present; download statistics unavailable")
else:
    print("❌ No CSV file found. Scraping may have failed.")

In [None]:
# 4. Custom date range scraping
# Both bounds use the DD/MM/YYYY format expected by the search form.
print("\n🗓️ Custom date range example:")
from_date, to_date = "01/09/2024", "11/09/2024"

print(f"Scraping from {from_date} to {to_date}...")
try:
    custom_judgments = scraper.scrape_judgments(from_date, to_date)
    print(f"✅ Found {len(custom_judgments)} judgments in custom range")

    # Persist rows and state only when something new was found
    if len(custom_judgments) > 0:
        scraper.save_to_csv(custom_judgments)
        scraper.downloaded_judgments["last_run_date"] = datetime.now().isoformat()
        scraper.save_state()
except Exception as e:
    print(f"❌ Custom scraping error: {e}")

In [None]:
# 5. Bonus: SCI Captcha Solver Demo
print("\n🎯 Supreme Court Captcha Solver (Bonus):")
sci_solver = SCICaptchaSolver()

# Example usage (you would need to provide an actual captcha image)
# captcha_result = sci_solver.solve_sci_captcha("captcha_image.png")
# print(f"Captcha solved: {captcha_result}")

# List the output directory and the first entries of the PDF folder using shell commands
print("\n📁 File structure:")
!ls -la rajasthan_hc_judgments/
print("\n📚 PDFs downloaded:")
!ls -la rajasthan_hc_judgments/pdfs/ | head -10

In [None]:
# 6. Data analysis examples
if os.path.exists("rajasthan_hc_judgments/judgments.csv"):
    df = pd.read_csv("rajasthan_hc_judgments/judgments.csv")

    print("\n📊 Data Analysis:")

    # Header spelling may vary, so pick out every column that mentions a date
    date_columns = []
    for col in df.columns:
        if 'date' in col.lower():
            date_columns.append(col)
    print(f"Date columns found: {date_columns}")

    # Cardinality of the first five columns; list the values of small ones
    for col in df.columns[:5]:
        distinct_count = df[col].nunique()
        print(f"\n🔍 Column '{col}' - Unique values: {distinct_count}")
        if distinct_count < 10:
            print(f"   Values: {df[col].unique()[:5]}")

# Closing notes, usage guide and troubleshooting tips, printed line by line
for message in (
    "\n✅ Notebook execution completed!",
    "\n📖 How to use:",
    "1. The scraper runs incrementally - it remembers what it has downloaded",
    "2. Run the scraper daily to get new judgments",
    "3. All data is saved in 'rajasthan_hc_judgments/' folder",
    "4. CSV contains all metadata, PDFs are in 'pdfs/' subfolder",
    "5. State is tracked in 'scraper_state.json'",
    # === TROUBLESHOOTING SECTION ===
    "\n🔧 Troubleshooting Tips:",
    "1. If captcha solving fails, the script will retry 3 times",
    "2. For Colab, make sure to install all dependencies",
    "3. Check internet connection if downloads fail",
    "4. Captcha OCR may need fine-tuning based on actual captcha images",
    "5. The script handles various table formats automatically",
):
    print(message)

# === MANUAL CAPTCHA FALLBACK ===
def manual_run_with_captcha():
    """Print instructions for switching the scraper to manual captcha entry.

    Nothing is executed against the scraper; the function only prints a short
    explanation followed by a sample replacement for ``solve_captcha``.
    """
    for notice in (
        "\n🔧 Manual captcha mode available - modify the scraper to accept manual input",
        "Replace the solve_captcha method with manual input for testing",
    ):
        print(notice)

    # Sample replacement method, kept as text so the user can paste it in
    snippet_lines = [
        "",
        "    def solve_captcha_manual(self, captcha_image_element) -> str:",
        "        captcha_image_element.screenshot(\"captcha_display.png\")",
        "        from IPython.display import Image, display",
        "        display(Image(\"captcha_display.png\"))",
        "        return input(\"Please enter the captcha: \").strip()",
        "    ",
    ]
    manual_code = "\n".join(snippet_lines)

    print("Replace solve_captcha method with manual input if needed:")
    print(manual_code)