In [1]:
!pip install selenium webdriver-manager beautifulsoup4 pandas



In [10]:
!pip install selenium webdriver-manager pandas



In [11]:
pip install selenium pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
import re

def create_robust_driver(driver_path=r"D:/chromedriver-win64/chromedriver.exe"):
    """Create a Chrome WebDriver configured to look less like an automated browser.

    Args:
        driver_path: Path to the chromedriver executable. Defaults to the
            location this notebook was originally written against. Pass
            ``None`` to build ``Service()`` without an explicit path and let
            Selenium locate the driver itself.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.

    Raises:
        Exception: Whatever Selenium raises if the browser cannot be started
            or the initial script cannot be executed.
    """
    chrome_options = Options()
    # Container/CI friendly flags.
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # Flags and switches that reduce the usual automation fingerprints.
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    service = Service(executable_path=driver_path) if driver_path else Service()
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Hide navigator.webdriver on the currently loaded document.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    except Exception:
        # The browser is already running; without this the process would leak
        # because the caller never receives a handle to quit it.
        driver.quit()
        raise
    return driver

# Substrings (matched case-insensitively) marking a line as a card label, not a project name.
_NAME_STOP_WORDS = ('address', 'project type', 'started from', 'possession')
# Substrings (matched against the upper-cased line) suggesting a company name.
_COMPANY_INDICATORS = ('PVT', 'LTD', 'PRIVATE', 'LIMITED', 'COMPANY', 'CORP', 'DEVELOPERS', 'BUILDERS')
# Substrings (matched case-insensitively) marking a short line as a label/value, not a location.
_ADDRESS_STOP_WORDS = ('project', 'type', 'residential', 'commercial', 'started', 'possession', 'units', 'available')
# Company markers that disqualify a short line from being treated as an address.
_ADDRESS_COMPANY_MARKERS = ('PVT', 'LTD', 'PRIVATE', 'LIMITED')


def _is_rera_line(line):
    """Return True when the line begins with a registration prefix (RP/ or PS/)."""
    return re.match(r'^(RP|PS)/', line) is not None


def _is_by_line(line):
    """Return True when the line is a 'by <promoter>' attribution line."""
    return line.lower().startswith('by ')


def _contains_any(text, needles):
    """Return True when any of the needles occurs as a substring of text."""
    return any(needle in text for needle in needles)


def extract_project_info_from_card_direct(card_element, index):
    """Extract project information directly from a project card.

    Args:
        card_element: Object exposing ``.text`` (the card's visible text) and
            ``.find_elements(by, selector)`` (used to look up heading tags).
        index: Card number, used only in the progress/log output.

    Returns:
        dict with the keys 'RERA Regd. No', 'Project Name', 'Promoter Name',
        'Promoter Address' and 'GST No'. Fields that could not be determined
        are empty strings; 'GST No' is always left empty. The function never
        raises: on failure it prints the error and returns what it has so far.
    """

    project_info = {
        'RERA Regd. No': '',
        'Project Name': '',
        'Promoter Name': '',
        'Promoter Address': '',
        'GST No': ''  # Intentionally never populated
    }

    try:
        # Get all text from the card
        card_text = card_element.text
        print(f"\n--- Card {index} Content ---")
        print(card_text)
        print("--- End Card Content ---\n")

        # Non-empty, stripped lines of the card text
        lines = [line.strip() for line in card_text.split('\n') if line.strip()]

        # RERA number: first line containing RP/x/y/z, RP/x/y or PS/x/y
        for line in lines:
            rera_match = re.search(r'(RP/\d+/\d+/\d+|RP/\d+/\d+|PS/\d+/\d+)', line)
            if rera_match:
                project_info['RERA Regd. No'] = rera_match.group(1)
                print(f"   🔢 Found RERA: {project_info['RERA Regd. No']}")
                break

        # Project name, step 1: first non-empty heading that is not a RERA number.
        # A failing DOM lookup must not prevent the text fallback below from running.
        try:
            headings = card_element.find_elements(By.CSS_SELECTOR, "h1, h2, h3, h4, h5, h6")
            for heading in headings:
                heading_text = heading.text.strip()
                if heading_text and not _is_rera_line(heading_text):
                    project_info['Project Name'] = heading_text
                    print(f"   📋 Found Project Name (heading): {project_info['Project Name']}")
                    break
        except Exception as name_error:
            print(f"   ⚠️ Error extracting project name: {str(name_error)}")

        # Project name, step 2: first meaningful text line.
        if not project_info['Project Name']:
            for line in lines:
                # Only a leading "by " marks a promoter line; a substring test
                # would wrongly reject names such as "Ruby Towers".
                if (len(line) > 3 and
                        not _is_rera_line(line) and
                        not _is_by_line(line) and
                        not _contains_any(line.lower(), _NAME_STOP_WORDS)):
                    project_info['Project Name'] = line
                    print(f"   📋 Found Project Name (text): {project_info['Project Name']}")
                    break

        # Promoter name: an explicit "by <name>" line always wins. The
        # company-indicator heuristic is consulted only when no such line
        # exists, so a title like "Royal Builders Residency" cannot pre-empt it.
        try:
            by_line = next((line for line in lines if _is_by_line(line)), None)
            if by_line is not None:
                project_info['Promoter Name'] = by_line[3:].strip()  # Remove "by " prefix
                print(f"   🏢 Found Promoter Name: {project_info['Promoter Name']}")
            else:
                for line in lines:
                    if (_contains_any(line.upper(), _COMPANY_INDICATORS) and
                            len(line) > 5 and
                            not _is_rera_line(line)):
                        project_info['Promoter Name'] = line
                        print(f"   🏢 Found Promoter Name (alt): {project_info['Promoter Name']}")
                        break
        except Exception as promoter_error:
            print(f"   ⚠️ Error extracting promoter name: {str(promoter_error)}")

        # Address: the line following an "Address" label, else a short location-like line.
        try:
            for position, line in enumerate(lines):
                if line.lower() == 'address' and position + 1 < len(lines):
                    project_info['Promoter Address'] = lines[position + 1]
                    print(f"   📍 Found Address (labeled): {project_info['Promoter Address']}")
                    break
            else:
                # Values already assigned to other fields are not addresses.
                already_used = {project_info['Project Name'], project_info['Promoter Name']}
                for line in lines:
                    if (len(line.split()) <= 3 and  # Short lines (likely city names)
                            len(line) > 2 and
                            line not in already_used and
                            not _is_rera_line(line) and
                            not _is_by_line(line) and
                            not _contains_any(line.lower(), _ADDRESS_STOP_WORDS) and
                            not _contains_any(line.upper(), _ADDRESS_COMPANY_MARKERS)):
                        project_info['Promoter Address'] = line
                        print(f"   📍 Found Address (pattern): {project_info['Promoter Address']}")
                        break
        except Exception as address_error:
            print(f"   ⚠️ Error extracting address: {str(address_error)}")

        # GST No remains empty as instructed
        project_info['GST No'] = ''
        print(f"   💼 GST No: Left empty as instructed")

        return project_info

    except Exception as e:
        print(f"   ❌ Error processing card {index}: {str(e)}")
        return project_info

def scrape_rera_from_cards_only(url="https://rera.odisha.gov.in/projects/project-list",
                                max_cards=6,
                                page_load_wait=10):
    """Scrape RERA data directly from project cards without detail page navigation.

    Args:
        url: Address of the project-list page to load.
        max_cards: Maximum number of cards to process, taken from the start of
            the list. ``None`` processes every card found.
        page_load_wait: Seconds to sleep after loading the page before looking
            for cards.

    Returns:
        list of dicts as produced by ``extract_project_info_from_card_direct``,
        one per successfully processed card. Empty when no cards were found or
        the browser/page could not be set up. The function prints errors
        instead of raising.
    """

    driver = None
    projects_data = []

    try:
        print("🚀 Starting RERA scraping from cards only...")
        driver = create_robust_driver()

        # Load main page
        driver.get(url)
        driver.maximize_window()
        time.sleep(page_load_wait)

        print("📄 Page loaded, finding project cards...")

        # Find project cards
        project_cards = driver.find_elements(By.CSS_SELECTOR, ".project-card")

        if not project_cards:
            print("❌ No project cards found")
            return []

        print(f"✅ Found {len(project_cards)} project cards")

        # Process at most max_cards cards from the top of the list
        selected_cards = project_cards[:max_cards]
        total = len(selected_cards)
        for i, card in enumerate(selected_cards, 1):
            print(f"\n🔍 Processing Project Card {i}/{total}")

            try:
                project_info = extract_project_info_from_card_direct(card, i)

                print(f"\n📊 Project {i} Summary:")
                print(f"   📋 Project Name: {project_info['Project Name']}")
                print(f"   🔢 RERA No: {project_info['RERA Regd. No']}")
                print(f"   🏢 Promoter Name: {project_info['Promoter Name']}")
                print(f"   📍 Promoter Address: {project_info['Promoter Address']}")
                print(f"   💼 GST No: {project_info['GST No'] or 'N/A (as instructed)'}")

                projects_data.append(project_info)
                print(f"   ✅ Project {i} processed successfully")

            except Exception as card_error:
                # One bad card must not abort the remaining ones.
                print(f"   ❌ Error processing card {i}: {str(card_error)}")

        return projects_data

    except Exception as e:
        print(f"❌ Main error: {str(e)}")
        # Keep whatever was collected before the failure (empty if it happened early).
        return projects_data

    finally:
        if driver:
            try:
                driver.quit()
                print("🔚 Browser closed")
            except Exception as quit_error:
                # Best effort: report, but never mask the scrape result.
                # (A bare except would also swallow KeyboardInterrupt/SystemExit.)
                print(f"⚠️ Error closing browser: {str(quit_error)}")

# Run the card-only scraper, print the extracted table, save it as CSV and
# report how many records have each field populated.
if __name__ == "__main__":
    projects = scrape_rera_from_cards_only()

    if projects:
        df = pd.DataFrame(projects)
        print("\n" + "="*100)
        print("📊 FINAL EXTRACTED PROJECT DATA:")
        print("="*100)
        print(df.to_string(index=False))

        # Save to CSV
        df.to_csv('rera_projects_final.csv', index=False)
        print(f"\n💾 Data saved to rera_projects_final.csv")

        # Show summary; denominators use the number of projects actually
        # scraped rather than an assumed count of 6.
        total = len(projects)
        filled_names = sum(1 for p in projects if p['Project Name'])
        filled_rera = sum(1 for p in projects if p['RERA Regd. No'])
        filled_promoter = sum(1 for p in projects if p['Promoter Name'])
        filled_address = sum(1 for p in projects if p['Promoter Address'])
        filled_gst = sum(1 for p in projects if p['GST No'])

        print(f"\n📈 Final Extraction Summary:")
        print(f"   📋 Project Names: {filled_names}/{total}")
        print(f"   🔢 RERA Numbers: {filled_rera}/{total}")
        print(f"   🏢 Promoter Names: {filled_promoter}/{total}")
        print(f"   📍 Addresses: {filled_address}/{total}")
        print(f"   💼 GST Numbers: {filled_gst}/{total} (left empty as instructed)")

        print(f"\n✅ Successfully scraped {len(projects)} projects with all available data!")
    else:
        print("❌ No data extracted")


🚀 Starting RERA scraping from cards only...
📄 Page loaded, finding project cards...
✅ Found 10 project cards

🔍 Processing Project Card 1/6

--- Card 1 Content ---
Basanti Enclave
by M/S. NEELACHAL INFRA DEVELOPERS PVT. LTD
Address
Angul
Project Type
Residential
Started From
May, 2025
Possession by
Dec, 2027
86 Units
Available
RP/01/2025/01362
Contact
View Details
--- End Card Content ---

   🔢 Found RERA: RP/01/2025/01362
   📋 Found Project Name (heading): Basanti Enclave
   🏢 Found Promoter Name: M/S. NEELACHAL INFRA DEVELOPERS PVT. LTD
   📍 Found Address (labeled): Angul
   💼 GST No: Left empty as instructed

📊 Project 1 Summary:
   📋 Project Name: Basanti Enclave
   🔢 RERA No: RP/01/2025/01362
   🏢 Promoter Name: M/S. NEELACHAL INFRA DEVELOPERS PVT. LTD
   📍 Promoter Address: Angul
   💼 GST No: N/A (as instructed)
   ✅ Project 1 processed successfully

🔍 Processing Project Card 2/6

--- Card 2 Content ---
UDYAYEEN
by SHYAMCHAND BUILDERS PRIVATE LIMITED
Address
Khordha
Project Type
