In [None]:
from firecrawl import FirecrawlApp
from dotenv import load_dotenv
import os

load_dotenv()

app = FirecrawlApp(api_key=os.getenv('firecrawl_api_key'))

# Upper bounds on what gets echoed into the notebook output; the full page
# text and link list are long enough to bury the rest of the notebook.
PREVIEW_CHARS = 500
PREVIEW_LINKS = 10

# Scrape once, asking for the page text, a screenshot and the outgoing links
result = app.scrape_url(
    'https://en.wikipedia.org/wiki/Attention_Is_All_You_Need',
    formats=['markdown', 'screenshot', 'links']
)

# Read fields via attribute access: indexing the result like a dict
# (result['markdown']) fails when the SDK returns a response object rather
# than a plain dict. `or` guards against a format coming back as None.
markdown_preview = (result.markdown or '')[:PREVIEW_CHARS]
links_preview = (result.links or [])[:PREVIEW_LINKS]

print("Text content:", markdown_preview)
# The screenshot value is a string that may be an https URL to a hosted image
# or a data:image URI; do not assume it is raw base64.
print("Screenshot:", result.screenshot)
print("Links:", links_preview)

In [3]:
from firecrawl import FirecrawlApp
from dotenv import load_dotenv
import os
import base64
import json
from datetime import datetime
import requests

load_dotenv()

# Single source of truth for the page being scraped, so the URL recorded in
# metadata.json cannot drift from the URL that was actually fetched.
TARGET_URL = 'https://en.wikipedia.org/wiki/Attention_Is_All_You_Need'

# Seconds to wait for the screenshot download; without a timeout
# requests.get() can block the kernel indefinitely on a stalled connection.
DOWNLOAD_TIMEOUT_S = 30


def fix_base64_padding(data):
    """Return `data` right-padded with '=' so its length is a multiple of 4."""
    return data + '=' * (-len(data) % 4)


def classify_screenshot(screenshot):
    """Classify a screenshot string by its prefix.

    Returns:
        'url'     if it starts with 'http' (image must be downloaded),
        'base64'  if it starts with 'data:image' (inline data URI),
        'unknown' for anything else, including None or an empty string.
    """
    if not screenshot:
        return 'unknown'
    if screenshot.startswith('http'):
        return 'url'
    if screenshot.startswith('data:image'):
        return 'base64'
    return 'unknown'


app = FirecrawlApp(api_key=os.getenv('firecrawl_api_key'))

# Scrape the content
result = app.scrape_url(TARGET_URL, formats=['markdown', 'screenshot', 'links'])

# Pull each field off the response object exactly once (attribute access, not
# dict access). A missing or None field degrades to an empty value instead of
# raising later in the cell.
markdown = getattr(result, 'markdown', None) or ''
screenshot = getattr(result, 'screenshot', None)
links = getattr(result, 'links', None) or []
page_metadata = getattr(result, 'metadata', None) or {}

# Classified once and reused for both saving and metadata, so the two can
# never disagree about what kind of screenshot was returned.
screenshot_type = classify_screenshot(screenshot)

# Create a timestamped directory for the scraped data
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"scraped_data_{timestamp}"
os.makedirs(output_dir, exist_ok=True)

# Save markdown content
with open(os.path.join(output_dir, 'content.md'), 'w', encoding='utf-8') as f:
    f.write(markdown)

# Save screenshot - handle both URL and base64 data
if screenshot:
    screenshot_path = os.path.join(output_dir, 'screenshot.png')
    try:
        if screenshot_type == 'url':
            print(f"Downloading screenshot from: {screenshot}")
            response = requests.get(screenshot, timeout=DOWNLOAD_TIMEOUT_S)
            response.raise_for_status()

            with open(screenshot_path, 'wb') as f:
                f.write(response.content)
            print("Screenshot downloaded and saved successfully")

        elif screenshot_type == 'base64':
            # Strip the "data:image/...;base64," header, then decode BEFORE
            # opening the file so a decode error does not leave an empty PNG.
            encoded = fix_base64_padding(screenshot.split(',', 1)[1])
            image_bytes = base64.b64decode(encoded)

            with open(screenshot_path, 'wb') as f:
                f.write(image_bytes)
            print("Screenshot decoded and saved successfully")

        else:
            print(f"Unknown screenshot format: {screenshot[:50]}...")

    except Exception as e:
        # Deliberately best-effort: a screenshot failure must not prevent the
        # links and metadata from being saved. Dump the raw value for debugging.
        print(f"Error saving screenshot: {e}")
        with open(os.path.join(output_dir, 'screenshot_debug.txt'), 'w', encoding='utf-8') as f:
            f.write(f"Screenshot data length: {len(screenshot)}\n")
            f.write(f"First 100 chars: {screenshot[:100]}\n")
            f.write(f"Raw data: {screenshot}")
else:
    print("No screenshot data available")

# Save links as JSON
if links:
    with open(os.path.join(output_dir, 'links.json'), 'w', encoding='utf-8') as f:
        json.dump(links, f, indent=2)
    print(f"Saved {len(links)} links")
else:
    print("No links data available")

# Save all metadata
metadata = {
    'url': TARGET_URL,
    'scraped_at': datetime.now().isoformat(),
    'metadata': page_metadata,
    'total_links': len(links),
    'has_screenshot': bool(screenshot),
    'screenshot_type': screenshot_type,
}

with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"Data saved to directory: {output_dir}")
print(f"Content size: {len(markdown)} characters")
print(f"Links found: {len(links)}")
print(f"Screenshot: {'Available' if screenshot else 'Not available'}")

Downloading screenshot from: https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-09f7059d-4742-424f-89c4-406bba897b7f.png
Screenshot downloaded and saved successfully
Saved 336 links
Data saved to directory: scraped_data_20250614_154141
Content size: 62026 characters
Links found: 336
Screenshot: Available
Screenshot downloaded and saved successfully
Saved 336 links
Data saved to directory: scraped_data_20250614_154141
Content size: 62026 characters
Links found: 336
Screenshot: Available
