In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from urllib.parse import urljoin, urlparse
import time
from datetime import datetime
import os
from tqdm import tqdm

class AZCourtOrdersScraper:
    """Scrape the yearly administrative-order index pages of orders.azcourts.gov.

    Every year has its own index page (see ``construct_year_url``).  Table
    rows on those pages are parsed into dicts with the keys ``Order_Number``,
    ``Administrative_Order_Description``, ``Date_Signed``, ``Link_Order`` and
    ``Year``.  Results accumulate in ``self.all_orders`` and can be exported
    with ``save_to_excel``.
    """

    # Default first year of the scrape range.
    FIRST_YEAR = 1956
    # Years up to and including this one use the old ``.aspx`` URL layout.
    LAST_ASPX_YEAR = 2015
    # Numeric dates such as 1/2/2020 or 01-02-20.
    _DATE_PATTERN = re.compile(r'\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}')
    # Any run of whitespace (cell text often contains newlines/indentation).
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        """Create an HTTP session with a browser-like User-Agent and an empty result list."""
        self.base_url = "https://orders.azcourts.gov"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.all_orders = []

    def get_years_to_scrape(self, start_year=None, end_year=None):
        """Return the inclusive list of years to scrape.

        Args:
            start_year: First year; defaults to ``FIRST_YEAR`` (1956).
            end_year: Last year; defaults to the current calendar year.

        Returns:
            ``[start_year, ..., end_year]``; empty when start_year > end_year.
        """
        if start_year is None:
            start_year = self.FIRST_YEAR
        if end_year is None:
            end_year = datetime.now().year
        return list(range(start_year, end_year + 1))

    def construct_year_url(self, year):
        """Return the index-page URL for ``year``.

        Two URL layouts are used:
          * 2015 and before: AdministrativeOrdersIndex/2015AdministrativeOrders.aspx
          * 2016 and after:  Administrative-Orders-Index/2016-Administrative-Orders
        """
        if year <= self.LAST_ASPX_YEAR:
            return f"{self.base_url}/AdministrativeOrdersIndex/{year}AdministrativeOrders.aspx"
        return f"{self.base_url}/Administrative-Orders-Index/{year}-Administrative-Orders"

    def try_url_patterns(self, year):
        """Request the index page for ``year``.

        Returns:
            ``(response, url)`` when the server answers HTTP 200, otherwise
            ``(None, None)`` (non-200 status or a request failure).
        """
        url = self.construct_year_url(year)

        # Keep the try body to the one call that can raise a network error.
        try:
            response = self.session.get(url, timeout=30)
        except requests.RequestException as e:
            print(f"DEBUG: Year {year} - ERROR: {str(e)[:50]}... URL: {url}")
            return None, None

        if response.status_code != 200:
            print(f"DEBUG: Year {year} - HTTP {response.status_code}: {url}")
            return None, None

        format_type = ".aspx format" if year <= self.LAST_ASPX_YEAR else "dash format"
        print(f"DEBUG: Year {year} - SUCCESS ({format_type}): {url}")
        return response, url

    def _parse_order_row(self, cells, year, page_url):
        """Convert the cells of one table row into an order dict.

        Args:
            cells: The row's ``td``/``th`` elements (at least three).
            year: Year of the index page, stored under ``Year``.
            page_url: URL of the page the row came from; relative links are
                resolved against it.

        Returns:
            The order dict, or ``None`` when the row fails validation.
        """
        order_cell = cells[0]
        order_link = order_cell.find('a')

        if order_link is not None:
            order_number = order_link.text.strip()
            href = order_link.get('href', '')
            # Resolve against the page itself so that document-relative hrefs
            # end up where a browser would send them.
            pdf_link = urljoin(page_url, href) if href else ""
        else:
            order_number = order_cell.get_text().strip()
            pdf_link = ""

        description = cells[1].get_text().strip()

        # The date is the first later cell holding a numeric date; when none
        # matches, fall back to the text of the last cell.
        trailing_texts = [cell.get_text().strip() for cell in cells[2:]]
        date_signed = next(
            (text for text in trailing_texts if self._DATE_PATTERN.search(text)),
            "",
        )
        if not date_signed and trailing_texts:
            date_signed = trailing_texts[-1]

        print(f"DEBUG: Raw data - Order: '{order_number}', Desc: '{description[:30]}...', Date: '{date_signed}'")

        # Collapse whitespace BEFORE validating so that the length checks see
        # the real text rather than HTML indentation/newlines.
        order_number = self._WHITESPACE.sub(' ', order_number)
        description = self._WHITESPACE.sub(' ', description)
        date_signed = self._WHITESPACE.sub(' ', date_signed)

        if not order_number or len(order_number) > 50:
            print("DEBUG: Skipping - invalid order number length")
            return None

        if not description or len(description) < 5:
            print("DEBUG: Skipping - description too short")
            return None

        return {
            'Order_Number': order_number,
            'Administrative_Order_Description': description,
            'Date_Signed': date_signed,
            'Link_Order': pdf_link,
            'Year': year
        }

    def scrape_year_page(self, year):
        """Scrape the administrative orders listed on the index page for ``year``.

        Every table with at least two rows and a first row of three or more
        cells is treated as an orders table; its first row is skipped as the
        header.  Diagnostic output is printed along the way.

        Returns:
            A list of order dicts (see ``_parse_order_row``); empty when the
            page cannot be fetched or parsing fails unexpectedly.
        """
        response, working_url = self.try_url_patterns(year)

        if response is None or not working_url:
            print(f"ERROR: No working URL found for year {year}")
            return []

        # Prefer the final URL reported by the response (it differs from the
        # requested one after a redirect) as the base for relative links.
        page_url = getattr(response, 'url', None) or working_url

        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            orders = []

            print(f"\nDEBUG: Successfully accessing {year} - {working_url}")

            # Page-level diagnostics: tables, table-like containers, PDF links.
            tables = soup.find_all('table')
            print(f"DEBUG: Found {len(tables)} tables")

            divs_with_tables = soup.find_all(
                'div', class_=lambda value: bool(value) and 'table' in value.lower()
            )
            print(f"DEBUG: Found {len(divs_with_tables)} divs with table-related classes")

            all_links = soup.find_all('a', href=True)
            pdf_links = [link for link in all_links if '.pdf' in link.get('href', '').lower()]
            print(f"DEBUG: Found {len(pdf_links)} PDF links on page")

            for table_idx, table in enumerate(tables, start=1):
                print(f"DEBUG: Processing table {table_idx}")
                rows = table.find_all('tr')
                print(f"DEBUG: Table has {len(rows)} rows")

                # Need a header row plus at least one data row.
                if len(rows) < 2:
                    continue

                # Preview the first rows to make the structure visible.
                for row_idx, row in enumerate(rows[:3]):
                    preview = [cell.get_text().strip()[:50] for cell in row.find_all(['td', 'th'])]
                    print(f"DEBUG: Row {row_idx}: {preview}")

                # Expected layout: Order Number | Description | Date
                header_cells = rows[0].find_all(['th', 'td'])
                header_texts = [cell.get_text().strip().lower() for cell in header_cells]
                print(f"DEBUG: Header row: {header_texts}")

                # These flags are informational only; acceptance is decided by
                # the column count below.
                has_order_header = any('order' in h or 'no.' in h for h in header_texts)
                has_date_header = any('date' in h or 'signed' in h for h in header_texts)
                has_description_header = any(
                    'description' in h or 'title' in h or 'subject' in h for h in header_texts
                )
                print(f"DEBUG: Table analysis - Order: {has_order_header}, Date: {has_date_header}, Description: {has_description_header}")

                if len(header_cells) < 3:
                    print(f"DEBUG: Skipping table {table_idx} - less than 3 columns")
                    continue

                data_rows = rows[1:]

                with tqdm(total=len(data_rows),
                          desc=f"Year {year}",
                          leave=False,
                          unit="rows",
                          ncols=80,
                          ascii=True) as pbar:

                    for row in data_rows:
                        cells = row.find_all(['td', 'th'])

                        if len(cells) >= 3:
                            # Best effort: one malformed row must not abort the page.
                            try:
                                order = self._parse_order_row(cells, year, page_url)
                            except Exception as e:
                                print(f"DEBUG: Error processing row in {year}: {e}")
                                order = None

                            if order is not None:
                                orders.append(order)
                                print(f"DEBUG: ✓ Added order: {order['Order_Number']}")

                        # Exactly one tick per row, whatever happened to it.
                        pbar.update(1)
                        pbar.set_postfix({'Found': len(orders)})

            print(f"DEBUG: Year {year} complete. Found {len(orders)} valid orders")
            return orders

        except Exception as e:
            print(f"ERROR: Unexpected error for {year}: {e}")
            return []

    def scrape_all_years(self, start_year=1956, end_year=None):
        """Scrape every year from ``start_year`` to ``end_year`` inclusive.

        Args:
            start_year: First year to scrape.
            end_year: Last year to scrape; defaults to the current year.

        Returns:
            ``self.all_orders``, extended with everything found in this run.
        """
        # sys is only needed here for flushing stdout.
        import sys

        if end_year is None:
            end_year = datetime.now().year

        years = self.get_years_to_scrape(start_year, end_year)
        total_years = len(years)

        print(f"Starting scrape of {total_years} years from {start_year} to {end_year}")
        print("Years with no orders will be skipped automatically.")

        # Flush so the banner appears before the progress bar starts drawing.
        sys.stdout.flush()

        with tqdm(total=total_years,
                  desc="Overall Progress",
                  unit="year",
                  ncols=100,
                  ascii=True,
                  position=0,
                  leave=True) as main_pbar:

            for done, year in enumerate(years, start=1):
                main_pbar.set_description(f"Scraping {year}")
                orders = self.scrape_year_page(year)

                if orders:
                    self.all_orders.extend(orders)
                    print(f"Year {year}: Found {len(orders)} orders")
                else:
                    print(f"Year {year}: No orders found - skipping")

                main_pbar.update(1)
                main_pbar.set_postfix({
                    'Year': year,
                    'This_Year': len(orders),
                    'Total_Orders': len(self.all_orders),
                    'Completed': f"{done}/{total_years}"
                })

                # Periodic plain-text update (first year, then every fifth).
                if done % 5 == 0 or done == 1:
                    print(f"Completed {done}/{total_years} years. Total orders: {len(self.all_orders)}")
                    sys.stdout.flush()

                # Pause between requests; no need to wait after the last one.
                if done < total_years:
                    time.sleep(1)

        print(f"\nScraping complete! Total orders scraped: {len(self.all_orders)}")
        return self.all_orders

    def save_to_excel(self, filename="az_court_administrative_orders.xlsx"):
        """Write ``self.all_orders`` to an Excel workbook.

        Args:
            filename: Path of the ``.xlsx`` file to create.

        Returns:
            The sorted DataFrame that was written, or ``None`` when there is
            nothing to save.
        """
        if not self.all_orders:
            print("No data to save. Run scrape_all_years() first.")
            return None

        with tqdm(total=4, desc="Processing data") as pbar:
            pbar.set_description("Creating DataFrame")
            df = pd.DataFrame(self.all_orders)
            pbar.update(1)

            pbar.set_description("Reordering columns")
            # Requested columns first, any remaining columns after them.
            column_order = ['Order_Number', 'Administrative_Order_Description', 'Date_Signed', 'Link_Order']
            df = df[column_order + [col for col in df.columns if col not in column_order]]
            pbar.update(1)

            pbar.set_description("Sorting data")
            # Sort by year and order number (order number alone without a Year column).
            sort_keys = ['Year', 'Order_Number'] if 'Year' in df.columns else 'Order_Number'
            df = df.sort_values(sort_keys)
            pbar.update(1)

            pbar.set_description("Saving to Excel")
            df.to_excel(filename, index=False, engine='openpyxl')
            pbar.update(1)

        print(f"Data saved to {filename}")
        print(f"Total records: {len(df)}")

        print("\nSample data:")
        print(df.head())

        return df

def main():
    """Run the full historical scrape and print summary statistics.

    Scrapes every year from 1956 through the current year, writes the result
    to ``az_court_orders_complete_1956_<current year>.xlsx`` and prints
    per-decade and per-year counts.
    """
    print("Arizona Court Administrative Orders Scraper")
    print("=" * 50)

    scraper = AZCourtOrdersScraper()

    # Full historical scrape from 1956 to present
    first_year = 1956
    current_year = datetime.now().year
    total_years = current_year - first_year + 1
    # Name the file after the range actually scraped instead of a fixed year.
    output_file = f"az_court_orders_complete_{first_year}_{current_year}.xlsx"

    print(f"Starting FULL historical scrape from {first_year} to {current_year}...")
    print(f"Checking {total_years} years. Years with no orders will be skipped.")
    print("This may take 30-60 minutes depending on your connection speed.")

    print(f"\n🚀 Starting scrape of {total_years} years...")
    print("Progress bars and updates will appear below.")

    # Pass the end year explicitly so the scraped range and the file name agree.
    orders = scraper.scrape_all_years(first_year, current_year)

    if not orders:
        print("❌ No orders were scraped. Please check the URLs and HTML structure.")
        return

    print(f"\n✅ Scraping completed! Found {len(orders)} total orders.")
    df = scraper.save_to_excel(output_file)

    if df is None or df.empty:
        return

    # Show comprehensive statistics
    print("\n📊 FINAL STATISTICS:")
    print(f"   Years covered: {df['Year'].min()} - {df['Year'].max()}")
    print(f"   Total orders: {len(df):,}")
    print(f"   Years with data: {df['Year'].nunique()}")

    print("\n📈 Orders by decade:")
    # Computed on the side so the returned DataFrame is not modified.
    decade_counts = ((df['Year'] // 10) * 10).value_counts().sort_index()
    for decade, count in decade_counts.items():
        print(f"   {decade}s: {count:,} orders")

    print("\n📅 Most recent years:")
    recent_years = df['Year'].value_counts().sort_index().tail(10)
    for year, count in recent_years.items():
        print(f"   {year}: {count} orders")

    print(f"\n💾 Data saved to: {output_file}")

# Script entry point: run the full scrape only when this module is executed
# directly (``__name__`` is "__main__"), not when it is imported.
# NOTE: this call executes before the utility functions defined further down
# are created; main() does not use them.
if __name__ == "__main__":
    main()

# Additional utility functions for more targeted scraping:

def scrape_single_year(year):
    """Scrape one year and export it to ``az_court_orders_<year>.xlsx``.

    Returns the DataFrame produced by ``save_to_excel``, or ``None`` when no
    orders were found for that year.
    """
    scraper = AZCourtOrdersScraper()
    found = scraper.scrape_year_page(year)

    # Nothing scraped: no file is written.
    if not found:
        return None

    scraper.all_orders = found
    return scraper.save_to_excel(f"az_court_orders_{year}.xlsx")

def scrape_full_range():
    """Interactively scrape everything from 1956 to the present.

    Asks for confirmation on stdin first.  Returns the DataFrame written to
    ``az_court_orders_complete.xlsx``, or ``None`` when the user declines or
    no orders are found.
    """
    print("WARNING: This will scrape 69+ years of data and may take considerable time.")
    answer = input("Continue? (y/n): ")

    # Anything other than "y"/"Y" cancels the run.
    if answer.lower() != 'y':
        return None

    scraper = AZCourtOrdersScraper()
    print("Starting full historical scrape...")

    if not scraper.scrape_all_years(1956):
        return None

    return scraper.save_to_excel("az_court_orders_complete.xlsx")

# Test tqdm functionality
def test_tqdm():
    """Exercise tqdm in the current environment and report what it runs in.

    Draws one iterator-style bar and one manually updated bar, and prints the
    Python version and whether an IPython kernel is loaded.
    """
    import sys

    print("Testing tqdm functionality...")

    # Check 1: tqdm wrapping an iterable.
    print("Test 1: Basic tqdm functionality")
    for _ in tqdm(range(10), desc="Basic test", ascii=True, ncols=80):
        time.sleep(0.1)
    print("✓ Basic tqdm test completed")

    # Environment report: a loaded ipykernel module is taken to mean Jupyter.
    environment = 'Jupyter' if 'ipykernel' in sys.modules else 'Terminal'
    print(f"Python version: {sys.version}")
    print(f"Running in: {environment}")

    # Check 2: manually driven bar with a postfix.
    print("Test 2: tqdm with different settings")
    with tqdm(total=5, desc="Settings test", unit="item", ascii=True) as bar:
        for item_number in range(1, 6):
            time.sleep(0.2)
            bar.update(1)
            bar.set_postfix({'Item': item_number})
    print("✓ Settings test completed")

    print("All tqdm tests passed! Progress bars should work.")

# Quick test function for the scraper
def quick_test(year=2025):
    """Smoke-test the scraper against a single year.

    Args:
        year: Year whose index page is scraped; defaults to 2025.

    Returns:
        ``True`` when at least one order was found, otherwise ``False``.
    """
    print(f"Quick test: Scraping just {year}...")
    scraper = AZCourtOrdersScraper()
    orders = scraper.scrape_year_page(year)

    if not orders:
        print(f"✗ No orders found for {year}")
        return False

    print(f"✓ Successfully found {len(orders)} orders for {year}")
    print("Sample order:", orders[0])
    return True