In [4]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# 1. SETUP: Configure the URL and Browser
url = "https://ajguk-my.sharepoint.com/:u:/g/personal/francesco_zovi_gallagherre_com/IQCIJ1mzZEu4TKuxkowIoFyKAV4-ambuXM3UFTEiEtqNwbQ?e=1kyerG"  # Replace with the actual URL
driver = webdriver.Chrome()      # Or webdriver.Firefox(), etc.

try:
    # 2. LOAD PAGE
    driver.get(url)
    
    # 3. WAIT: Allow time for JavaScript to render the chart
    # We wait specifically for the 'scattergeo' class shown in your screenshot
    print("Waiting for chart to load...")
    WebDriverWait(driver, 40).until(
        EC.presence_of_element_located((By.CLASS_NAME, "scattergeo"))
    )
    # Optional: small sleep to ensure data binding is complete
    time.sleep(3)

    # 4. EXTRACT DATA via JavaScript
    # Plotly graphs usually live in a div with class 'js-plotly-plot' 
    # and have a .data property attached to the DOM element.
    script = """
    var graph = document.getElementsByClassName('js-plotly-plot')[0];
    if (graph && graph.data) {
        return graph.data;
    } else {
        return null;
    }
    """
    chart_data = driver.execute_script(script)

    if not chart_data:
        print("Could not find Plotly data object. The chart might not be standard Plotly.")
    else:
        # 5. PROCESS DATA
        # chart_data is a list of 'traces' (groups shown in the legend)
        # Each trace contains lists of lats, lons, and text.
        
        all_points = []
        
        for trace in chart_data:
            # Get the group name (e.g., "Poliambulatori", "RSA")
            group_name = trace.get('name', 'Unknown')
            
            # Extract coordinates and text labels
            lats = trace.get('lat', [])
            lons = trace.get('lon', [])
            texts = trace.get('text', []) # Sometimes calls 'hovertext'
            
            # If text is empty, check hovertext
            if not texts:
                texts = trace.get('hovertext', [])
            
            # If still empty, fill with blanks
            if not texts:
                texts = [""] * len(lats)

            # Zip them together into rows
            for lat, lon, txt in zip(lats, lons, texts):
                all_points.append({
                    "Group": group_name,
                    "Latitude": lat,
                    "Longitude": lon,
                    "Description": txt
                })

        # 6. SAVE TO CSV
        if all_points:
            df = pd.DataFrame(all_points)
            df.to_csv("map_data.csv", index=False)
            print(f"Successfully scraped {len(df)} locations to 'map_data.csv'.")
            print(df.head())
        else:
            print("Found the chart object, but it contained no coordinate data.")

finally:
    driver.quit()

Waiting for chart to load...


TimeoutException: Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff7770788e5
	0x7ff777078940
	0x7ff776e5165d
	0x7ff776ea9a33
	0x7ff776ea9d3c
	0x7ff776efdf67
	0x7ff776efac97
	0x7ff776e9ac29
	0x7ff776e9ba93
	0x7ff777390640
	0x7ff77738af80
	0x7ff7773a96e6
	0x7ff777095de4
	0x7ff77709ed8c
	0x7ff777082004
	0x7ff7770821b5
	0x7ff777067ee2
	0x7ffbd2c0259d
	0x7ffbd34caf78


In [5]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# 1. SETUP
url = "https://ajguk-my.sharepoint.com/:u:/g/personal/francesco_zovi_gallagherre_com/IQCIJ1mzZEu4TKuxkowIoFyKAV4-ambuXM3UFTEiEtqNwbQ?e=1kyerG"  # Replace with the actual URL
options = webdriver.ChromeOptions()
# options.add_argument("--headless") # Keep this commented out so you can see it working
driver = webdriver.Chrome(options=options)
driver.maximize_window() # Crucial: Ensure map renders

def search_frames_for_plotly(driver, depth=0):
    """
    Recursively searches current frame and all child iframes for Plotly data.
    """
    # Attempt to extract data from the current frame using JS
    # We look for the standard Plotly class OR the global Plotly object registry
    script = """
    // Strategy A: Look for the DOM element
    var graph = document.getElementsByClassName('js-plotly-plot')[0];
    if (graph && graph.data) return graph.data;

    // Strategy B: Look for the internal Plotly registry (often works if class names change)
    if (window.Plotly && window.Plotly.d3) {
        var plots = window.Plotly.d3.selectAll('.js-plotly-plot').data();
        if (plots.length > 0 && plots[0].length > 0) return plots[0];
    }
    
    return null;
    """
    
    try:
        data = driver.execute_script(script)
        if data:
            print(f"Found data at depth {depth}!")
            return data
    except:
        pass # Ignore JS errors in specific frames

    # If not found, look for child iframes and recurse
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    
    for i, frame in enumerate(iframes):
        # Switch to frame
        try:
            driver.switch_to.frame(frame)
            # Recurse
            result = search_frames_for_plotly(driver, depth + 1)
            if result:
                return result
            # Switch back to parent to continue loop
            driver.switch_to.parent_frame()
        except:
            # If a frame is locked (cross-origin) or closes, skip it
            driver.switch_to.parent_frame()
            continue
            
    return None

try:
    print(f"Loading {url}...")
    driver.get(url)
    
    # Give it plenty of time to render everything
    print("Waiting 10 seconds for map to render...")
    time.sleep(10)
    
    # Take a screenshot for debugging
    driver.save_screenshot("debug_view.png")
    print("Screenshot saved to 'debug_view.png'. Please check if the map is visible in this image.")

    # Start the hunt
    print("Starting recursive search for Plotly data...")
    chart_data = search_frames_for_plotly(driver)

    if not chart_data:
        print("FAILED: Could not find Plotly data in any frame.")
        print("Check 'debug_view.png'. If the map is there, it might be using Shadow DOM or Canvas.")
    else:
        print("Data object found! Processing...")
        
        all_points = []
        for trace in chart_data:
            # Check if this trace has coordinate data
            if 'lat' in trace and 'lon' in trace:
                name = trace.get('name', 'Unknown')
                lats = trace.get('lat', [])
                lons = trace.get('lon', [])
                
                # Normalize text fields
                texts = trace.get('text', [])
                if not texts: texts = trace.get('hovertext', [])
                
                # Ensure text is a list of the correct length
                if isinstance(texts, str):
                    texts = [texts] * len(lats)
                elif not texts:
                    texts = [""] * len(lats)

                for lat, lon, txt in zip(lats, lons, texts):
                    all_points.append({
                        "Group": name,
                        "Latitude": lat,
                        "Longitude": lon,
                        "Description": txt
                    })

        if len(all_points) > 0:
            df = pd.DataFrame(all_points)
            df.to_csv("earthquake_map_data.csv", index=False)
            print("-" * 30)
            print(f"SUCCESS: Saved {len(df)} rows to 'earthquake_map_data.csv'")
            print("-" * 30)
            print(df.head())
        else:
            print("Found the chart object, but it contained no Lat/Lon data.")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    driver.quit()

Loading https://ajguk-my.sharepoint.com/:u:/g/personal/francesco_zovi_gallagherre_com/IQCIJ1mzZEu4TKuxkowIoFyKAV4-ambuXM3UFTEiEtqNwbQ?e=1kyerG...
Waiting 10 seconds for map to render...
Screenshot saved to 'debug_view.png'. Please check if the map is visible in this image.
Starting recursive search for Plotly data...
FAILED: Could not find Plotly data in any frame.
Check 'debug_view.png'. If the map is there, it might be using Shadow DOM or Canvas.


In [6]:
import json
import re
import pandas as pd
from bs4 import BeautifulSoup

# ==========================================
# CONFIGURATION
# ==========================================
# Replace this with the path to the file you downloaded
file_path = r"C:\Users\rajoshi\Downloads\1_grafico_interattivo_regioni_reali.html" 
# ==========================================

def extract_plotly_data_from_html(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except UnicodeDecodeError:
        # Fallback if utf-8 fails
        with open(filepath, 'r', encoding='latin-1') as f:
            html_content = f.read()

    print(f"File loaded: {len(html_content)} characters.")

    # METHOD 1: Look for data inside <script type="application/json">
    # This is common in R/HTMLWidgets/Folium exports
    soup = BeautifulSoup(html_content, 'html.parser')
    json_scripts = soup.find_all('script', type='application/json')
    
    for script in json_scripts:
        try:
            data = json.loads(script.string)
            # Sometimes the data is wrapped in an 'x' object
            if 'x' in data and 'data' in data['x']:
                print("Found data in JSON script tag!")
                return data['x']['data']
        except:
            continue

    # METHOD 2: Look for raw JavaScript "Plotly.newPlot" call
    # This matches your snippet style
    print("Searching for JavaScript data pattern...")
    
    # Regex explanation:
    # 1. Look for 'Plotly.newPlot('
    # 2. Skip the div ID (first argument)
    # 3. Capture everything inside the second argument [...] (the data)
    # 4. Stop when we hit the comma before the layout object {,
    pattern = r"Plotly\.newPlot\(\s*['\"].*?['\"]\s*,\s*(\[[\s\S]*?\])\s*,\s*\{"
    
    match = re.search(pattern, html_content)
    
    if match:
        json_str = match.group(1)
        try:
            # Clean up potential JavaScript weirdness to make it valid JSON
            # Sometimes JS uses ' instead of " which JSON hates
            # This is a basic cleanup, might not catch everything
            return json.loads(json_str)
        except json.JSONDecodeError:
            print("Found the data block, but it contains raw JavaScript that isn't valid JSON.")
            print("Attempting to fix quotes...")
            try:
                # specific fix for JS objects using keys without quotes
                # This is risky but often works for simple Plotly dumps
                return json.loads(json_str.replace("'", '"'))
            except:
                return None
    
    return None

# ==========================================
# EXECUTION
# ==========================================
print(f"Reading {file_path}...")
chart_data = extract_plotly_data_from_html(file_path)

if not chart_data:
    print("Could not extract data. The file format might be unique.")
else:
    all_points = []
    print(f"Found {len(chart_data)} trace(s). Processing...")

    for trace in chart_data:
        # Check for lat/lon keys
        if 'lat' in trace and 'lon' in trace:
            # Name of the group (e.g., Poliambulatori)
            group_name = trace.get('name', 'Unknown')
            
            lats = trace.get('lat', [])
            lons = trace.get('lon', [])
            
            # Text extraction logic
            texts = trace.get('text', [])
            if not texts: 
                texts = trace.get('hovertext', [])
            
            # Normalize list lengths
            if isinstance(texts, str):
                texts = [texts] * len(lats)
            elif not texts:
                texts = [""] * len(lats)

            for lat, lon, txt in zip(lats, lons, texts):
                all_points.append({
                    "Group": group_name,
                    "Latitude": lat,
                    "Longitude": lon,
                    "Description": txt
                })

    if all_points:
        df = pd.DataFrame(all_points)
        output_filename = "extracted_map_data.csv"
        df.to_csv(output_filename, index=False)
        print("="*40)
        print(f"SUCCESS! Scraped {len(df)} locations.")
        print(f"Saved to: {output_filename}")
        print("="*40)
        print(df.head())
    else:
        print("Found Plotly structure, but no geographic data (lat/lon) inside.")

Reading C:\Users\rajoshi\Downloads\1_grafico_interattivo_regioni_reali.html...
File loaded: 7709490 characters.
Searching for JavaScript data pattern...
Found 428 trace(s). Processing...
SUCCESS! Scraped 74940 locations.
Saved to: extracted_map_data.csv
                            Group Latitude Longitude  \
0                 Poliambulatori     dtype     dtype   
1                 Poliambulatori     bdata     bdata   
2             Studi Odontoiatrici    dtype     dtype   
3             Studi Odontoiatrici    bdata     bdata   
4  Poliambulatorio con chirurgia     dtype     dtype   

                      Description  
0                 Poliambulatori   
1                 Poliambulatori   
2             Studi Odontoiatrici  
3             Studi Odontoiatrici  
4  Poliambulatorio con chirurgia   


In [7]:
import json
import re
import base64
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# ==========================================
# CONFIGURATION
# ==========================================
file_path = r"C:\Users\rajoshi\Downloads\1_grafico_interattivo_regioni_reali.html" 
# ==========================================

def decode_bdata(data_obj):
    """
    Decodes Plotly binary data (base64) into a standard Python list.
    """
    # If it's just a normal list, return it as is
    if isinstance(data_obj, list):
        return data_obj
    
    # If it's a dictionary containing 'bdata', decode it
    if isinstance(data_obj, dict) and 'bdata' in data_obj and 'dtype' in data_obj:
        try:
            decoded_bytes = base64.b64decode(data_obj['bdata'])
            dtype = data_obj['dtype']
            
            # Convert binary to numpy array based on type
            # Common types: 'float64', 'float32', 'int32', 'uint8'
            return np.frombuffer(decoded_bytes, dtype=dtype).tolist()
        except Exception as e:
            print(f"Error decoding binary block: {e}")
            return []
            
    return []

def extract_and_decode(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            html = f.read()
    except:
        with open(filepath, 'r', encoding='latin-1') as f:
            html = f.read()

    # 1. Regex to find the JSON data block inside Plotly.newPlot(...)
    # We look for the 2nd argument which is the data array
    pattern = r"Plotly\.newPlot\(\s*['\"].*?['\"]\s*,\s*(\[[\s\S]*?\])\s*,\s*\{"
    match = re.search(pattern, html)

    if not match:
        print("Could not find Plotly.newPlot call via Regex. Trying BS4/Script tags...")
        soup = BeautifulSoup(html, 'html.parser')
        scripts = soup.find_all('script', type='application/json')
        for s in scripts:
            try:
                js = json.loads(s.string)
                if 'x' in js and 'data' in js['x']:
                    return js['x']['data']
            except: pass
        return None

    # 2. Parse the JSON string
    json_str = match.group(1)
    try:
        data_raw = json.loads(json_str)
    except json.JSONDecodeError:
        # Simple attempt to fix JS quotes
        try:
            data_raw = json.loads(json_str.replace("'", '"'))
        except:
            print("Failed to parse the extracted JSON string.")
            return None

    # 3. Process the traces
    cleaned_rows = []
    
    print(f"Found {len(data_raw)} layers. decoding...")
    
    for trace in data_raw:
        # Check if this trace has coordinates (skip background maps/shapes)
        if 'lat' not in trace or 'lon' not in trace:
            continue
            
        # --- DECODING HAPPENS HERE ---
        lats = decode_bdata(trace['lat'])
        lons = decode_bdata(trace['lon'])
        
        # If decoding failed or empty, skip
        if not lats or not lons:
            continue

        # Get Text/Labels
        texts = trace.get('text', [])
        if not texts: texts = trace.get('hovertext', [])
        # Ensure text is a list
        if isinstance(texts, str): texts = [texts] * len(lats)
        if not texts: texts = [""] * len(lats)

        # --- EXTRACT MAGNITUDE ---
        # Magnitude is usually in trace['marker']['size'] or trace['marker']['color']
        mags = []
        marker = trace.get('marker', {})
        
        # Try Size first
        if 'size' in marker:
            mags = decode_bdata(marker['size'])
        # Try Color if size is missing/constant
        elif 'color' in marker and isinstance(marker['color'], (list, dict)):
            mags = decode_bdata(marker['color'])
        
        # If single value or missing, fill with N/A
        if not isinstance(mags, list) or len(mags) != len(lats):
            mags = [None] * len(lats)

        group_name = trace.get('name', 'Unknown')

        # Zip it all together
        for lat, lon, txt, mag in zip(lats, lons, texts, mags):
            cleaned_rows.append({
                "Group": group_name,
                "Latitude": lat,
                "Longitude": lon,
                "Magnitude_or_Size": mag,
                "Description": txt
            })
            
    return pd.DataFrame(cleaned_rows)

# ==========================================
# RUN
# ==========================================
print(f"Processing {file_path}...")
df = extract_and_decode(file_path)

if df is not None and not df.empty:
    print("="*40)
    print(f"SUCCESS! Extracted {len(df)} locations.")
    print("="*40)
    print(df.head())
    df.to_csv("final_earthquake_data.csv", index=False)
else:
    print("No data found. The file might not contain point data.")

Processing C:\Users\rajoshi\Downloads\1_grafico_interattivo_regioni_reali.html...
Found 428 layers. decoding...
SUCCESS! Extracted 76092 locations.
             Group   Latitude  Longitude  Magnitude_or_Size      Description
0  Poliambulatori   41.905781  12.887392          2165610.0  Poliambulatori 
1  Poliambulatori   41.999815  12.726347           159000.0  Poliambulatori 
2  Poliambulatori   42.051643  12.616739           127000.0  Poliambulatori 
3  Poliambulatori   42.067010  12.765451           690000.0  Poliambulatori 
4  Poliambulatori   41.860174  13.032261           100000.0  Poliambulatori 
