In [None]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import re

# URLs of the ranking pages to scrape, one per province.
# The trailing slug of each URL (e.g. "british-columbia") is what
# extract_province() matches on to label the scraped firms.
urls = [
    "https://alphaai.company/top-personal-injury-law-firms-in-ontario/",
    "https://alphaai.company/top-personal-injury-law-firms-in-alberta/",
    "https://alphaai.company/top-personal-injury-law-firms-in-quebec/",
    "https://alphaai.company/top-personal-injury-law-firms-in-british-columbia/",
    "https://alphaai.company/top-personal-injury-law-firms-in-newfoundland-and-labrador/"
]

# Function to extract province from URL
def extract_province(url):
    """Return the display name of the province whose slug occurs in *url*.

    The lookup is a case-insensitive substring test performed in a fixed
    order; the first slug found wins. ``"Unknown"`` is returned when the
    URL contains none of the known slugs.
    """
    slug_to_name = (
        ("ontario", "Ontario"),
        ("alberta", "Alberta"),
        ("quebec", "Quebec"),
        ("british-columbia", "British Columbia"),
        ("newfoundland-and-labrador", "Newfoundland and Labrador"),
    )

    lowered = url.lower()
    matches = (name for slug, name in slug_to_name if slug in lowered)
    return next(matches, "Unknown")

# Helper: turn one ranked-firm <div> into a firm record
def _parse_firm_section(section, rank, province):
    """Build the firm dict for one ``ranked-firm-section`` element.

    Args:
        section: BeautifulSoup tag for a single firm section.
        rank: 1-based position of the section on the page.
        province: Province label stored on the record.

    Returns:
        A dict with ``rank`` and ``province`` plus whichever of ``name``,
        ``website``, ``monthly_visits`` and ``linkedin`` could be found, or
        ``None`` when the section has no info container or yields neither a
        name nor a website.
    """
    firm_data = {"rank": rank, "province": province}

    # Sections without the info container are skipped entirely
    if not section.find('div', class_='law-firm-rankings-info'):
        return None

    # Extract firm name and website
    name_div = section.find('div', class_='rankings-info-name')
    name_link = name_div.find('a') if name_div else None
    if name_link:
        firm_data['name'] = name_link.text.strip()
        firm_data['website'] = name_link.get('href')
    else:
        # Alternative structure: fall back to absolute, non-LinkedIn links
        for link in section.find_all('a'):
            # An <a> without href yields None; treat it as an empty string
            # so a single anchor cannot discard the whole firm.
            href = link.get('href') or ''
            if not href.startswith('http'):
                continue
            # Case-insensitive, matching the LinkedIn lookup further down
            if 'linkedin' in href.lower():
                continue
            firm_data['website'] = href
            link_text = link.text.strip()
            if not firm_data.get('name') and link_text:
                firm_data['name'] = link_text

    # Try to extract monthly visits
    visit_span = section.find('span', class_='rankings-info-monthly-visits')
    if visit_span:
        visit_text = visit_span.text.strip()
        # Keep just the range (like "25K-50K") when the text has the
        # expected "<range> monthly visits" shape; otherwise keep it raw.
        visit_match = re.search(r'(\d+[KM]?-\d+[KM]?|\d+[KM]?\+?) monthly visits', visit_text)
        firm_data['monthly_visits'] = visit_match.group(1) if visit_match else visit_text

    # Try to extract LinkedIn URL
    linkedin_link = section.find('a', href=lambda href: href and 'linkedin.com' in href.lower())
    if linkedin_link:
        firm_data['linkedin'] = linkedin_link.get('href')

    # Only keep records that have at least a name or a website
    if firm_data.get('name') or firm_data.get('website'):
        return firm_data
    return None


# Function to scrape a single page
def scrape_page(url):
    """Fetch one ranking page and extract its title, description and firms.

    Args:
        url: Address of the ranking page.

    Returns:
        A dict with keys ``province``, ``title``, ``description``, ``firms``
        (list of firm dicts) and ``url``, or ``None`` when the request fails
        or an unexpected error occurs while parsing the page.
    """
    try:
        print(f"Attempting to scrape: {url}")

        # Add a delay to be respectful to the server
        time.sleep(2)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()  # Raise an exception for bad status codes
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return None

        print(f"Successfully fetched page content, length: {len(response.text)}")

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get province from URL
        province = extract_province(url)

        # Extract page title (h1)
        title_element = soup.find('div', class_='law-firm-rankings-heading')
        h1 = title_element.find('h1') if title_element else None
        title = h1.text.strip() if h1 else "Unknown Title"

        # Extract description (non-empty paragraphs joined by single spaces)
        # NOTE(review): class name is spelled 'low-firm-…' as in the original
        # code; confirm against the page markup before "correcting" it.
        description_element = soup.find('div', class_='low-firm-rankings-description')
        paragraphs = description_element.find_all('p') if description_element else []
        stripped = (p.text.strip() for p in paragraphs)
        description = " ".join(text for text in stripped if text)

        # Extract firm data
        firms = []
        firm_sections = soup.find_all('div', class_='ranked-firm-section')

        print(f"Found {len(firm_sections)} firm sections")

        for idx, section in enumerate(firm_sections, 1):
            # Best effort: one malformed section must not abort the page
            try:
                firm_data = _parse_firm_section(section, idx, province)
            except Exception as e:
                print(f"Error extracting firm data: {e}")
                continue

            if firm_data:
                firms.append(firm_data)
                print(f"Extracted firm: {firm_data.get('name', 'Unknown')}")

        return {
            'province': province,
            'title': title,
            'description': description,
            'firms': firms,
            'url': url
        }

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to save data to files
def save_data(all_data):
    """Write scraped data to ``law_firms_data.json`` and ``law_firms_data.csv``.

    Existing output files in the current directory are first renamed with a
    timestamp suffix so earlier results are kept. The JSON file receives
    ``all_data`` as-is; the CSV receives one row per firm, flattened from the
    ``firms`` list of every truthy entry.

    Args:
        all_data: List of per-province dicts (entries may be ``None``), each
            optionally carrying a ``firms`` list of firm dicts.

    Returns:
        None. When ``all_data`` is empty nothing is written or renamed.
    """
    import os
    from datetime import datetime

    # An empty result must not displace previously saved good data: without
    # this guard the existing files would be renamed and replaced by "[]".
    if not all_data:
        print("No data to save; existing output files left untouched")
        return

    # Create a backup of existing files if they exist
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    for filename in ['law_firms_data.json', 'law_firms_data.csv']:
        if os.path.exists(filename):
            stem, ext = os.path.splitext(filename)
            backup_name = f"{stem}_{timestamp}{ext}"
            os.rename(filename, backup_name)
            print(f"Created backup of existing file: {backup_name}")

    # Save the data to a JSON file
    with open('law_firms_data.json', 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=4)

    # Create a flattened list of firms for the CSV
    firms_list = []
    for province_data in all_data:
        if province_data and 'firms' in province_data:
            firms_list.extend(province_data['firms'])

    if firms_list:
        df = pd.DataFrame(firms_list)
        df.to_csv('law_firms_data.csv', index=False)
        print(f"Data saved to CSV with {len(df)} records")
    else:
        print("No firm data to save to CSV")

# Main function to scrape all pages
def scrape_all_pages():
    """Scrape every address in ``urls``, persist the results and return them.

    A page counts as scraped only when ``scrape_page`` returns a result with
    a non-empty ``firms`` list; other pages are reported as failures and left
    out. The collected list is always handed to ``save_data`` before being
    returned.
    """
    collected = []

    for url in urls:
        print(f"\n===== Scraping: {url} =====")
        page_data = scrape_page(url)

        # Guard clause: nothing usable came back for this page
        if not (page_data and page_data.get('firms')):
            print(f"Failed to scrape data from {url}")
            continue

        collected.append(page_data)
        print(f"Successfully scraped {len(page_data['firms'])} firms for {page_data['province']}")

    save_data(collected)
    return collected

# Alternative: If the URLs aren't accessible, use local HTML files
def process_local_html(file_path, province_name):
    """Extract title, description and firms from a saved ranking page.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.
        province_name: Province label stored on every firm record and used
            in the fallback title.

    Returns:
        A dict with keys ``province``, ``title``, ``description`` and
        ``firms``, or ``None`` when the file cannot be read or parsed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract title
        title_element = soup.find('div', class_='law-firm-rankings-heading')
        h1 = title_element.find('h1') if title_element else None
        title = h1.text.strip() if h1 else f"Top Personal Injury Law Firms in {province_name}"

        # Extract description
        description_element = soup.find('div', class_='low-firm-rankings-description')
        description = description_element.text.strip() if description_element else ""

        # Extract firms
        firms = []
        firm_sections = soup.find_all('div', class_='ranked-firm-section')

        for idx, section in enumerate(firm_sections, 1):
            # Best effort: one malformed section must not abort the file
            try:
                firm_data = {"rank": idx, "province": province_name}

                # Find firm name and website
                name_element = section.find('div', class_='rankings-info-name')
                name_link = name_element.find('a') if name_element else None
                if name_link:
                    firm_data['name'] = name_link.text.strip()
                    firm_data['website'] = name_link.get('href')

                # Find monthly visits, normalised the same way as in
                # scrape_page so both sources give the same CSV values
                # (e.g. "25K-50K" rather than "25K-50K monthly visits").
                visits_span = section.find('span', class_='rankings-info-monthly-visits')
                if visits_span:
                    visit_text = visits_span.text.strip()
                    visit_match = re.search(r'(\d+[KM]?-\d+[KM]?|\d+[KM]?\+?) monthly visits', visit_text)
                    firm_data['monthly_visits'] = visit_match.group(1) if visit_match else visit_text

                # Find LinkedIn (case-insensitive, as in scrape_page)
                linkedin = section.find('a', href=lambda href: href and 'linkedin.com' in href.lower())
                if linkedin:
                    firm_data['linkedin'] = linkedin.get('href')

                # Only keep records that have at least a name or a website
                if firm_data.get('name') or firm_data.get('website'):
                    firms.append(firm_data)
            except Exception as e:
                print(f"Error processing firm: {e}")
                continue

        return {
            'province': province_name,
            'title': title,
            'description': description,
            'firms': firms
        }

    except Exception as e:
        print(f"Error processing local HTML: {e}")
        return None

if __name__ == "__main__":
    # os is only imported locally inside save_data, so it must be imported
    # here as well; otherwise the existence checks below raise NameError.
    import os

    # Try scraping from URLs first
    data = scrape_all_pages()

    if data:
        print(f"Successfully scraped data from {len(data)} pages")
    else:
        print("\nURL scraping failed. Processing local HTML files if available...")
        # Define mappings for local HTML files
        local_files = {
            "ontario.html": "Ontario",
            "alberta.html": "Alberta",
            "quebec.html": "Quebec",
            "british_columbia.html": "British Columbia",
            "newfoundland.html": "Newfoundland and Labrador"
        }

        local_data = []
        for file_name, province in local_files.items():
            # Files that are not present in the working directory are skipped
            if not os.path.exists(file_name):
                continue
            print(f"Processing local file: {file_name}")
            province_data = process_local_html(file_name, province)
            if province_data:
                local_data.append(province_data)

        if local_data:
            save_data(local_data)
            print(f"Processed {len(local_data)} local HTML files")
        else:
            print("No data could be scraped from URLs or local files")


===== Scraping: https://alphaai.company/top-personal-injury-law-firms-in-ontario/ =====
Attempting to scrape: https://alphaai.company/top-personal-injury-law-firms-in-ontario/
Successfully fetched page content, length: 231845
Found 15 firm sections
Extracted firm: Siskinds Law Firm
Extracted firm: McLeish Orlando LLP
Extracted firm: Gluckstein LLP
Extracted firm: SG Injury Law
Extracted firm: Diamond and Diamond Lawyers
Extracted firm: Harrison Pensa
Extracted firm: Howie, Sacks & Henry LLP
Extracted firm: Bergeron Clifford Injury Lawyers
Extracted firm: Boland Romaine LLP
Extracted firm: Neinstein Personal Injury Lawyers
Extracted firm: Samfiru Tumarkin LLP
Extracted firm: Iacobelli Law Firm
Extracted firm: David Hollingsworth
Extracted firm: Lalande Personal Injury Lawyers
Extracted firm: Roger R. Foisy & Associates
Successfully scraped 15 firms for Ontario

===== Scraping: https://alphaai.company/top-personal-injury-law-firms-in-alberta/ =====
Attempting to scrape: https://alphaai.