In [22]:
import json
import pandas as pd
import asyncio
import logging
import os
import re
from crawl4ai import *
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from hashlib import sha256


In [30]:
# Location of the JSON export produced by an earlier crawl.
FILE_PATH = 'crawled_porsche.json'

# Read the whole file as UTF-8 text and decode it; the handle is closed
# as soon as the ``with`` block exits.
with open(FILE_PATH, mode='r', encoding='utf-8') as file:
    data = json.loads(file.read())

# Tabular view of the decoded records, used by the following cells.
df = pd.DataFrame(data=data)

In [31]:
# Pull the 'url' column out of the frame as a plain Python list.
url_list = df['url'].to_list()

In [32]:
# Browser settings handed to AsyncWebCrawler: headless, 800x600 viewport.
browser_config = BrowserConfig(
    viewport_width=800,
    viewport_height=600,
    headless=True,
)

In [33]:
def write_markdown_response(url, markdown):
    """Write ``markdown`` to ``<sha256(url)>.md`` in the current directory.

    The file name is the full SHA-256 hex digest of the URL (64 characters),
    so the same URL always maps to the same file and an existing file for
    that URL is overwritten.

    Args:
        url: Page URL; only used to derive the file name.
        markdown: Text to store in the file.

    Returns:
        None.
    """
    # A SHA-256 hex digest is always 64 characters, so no truncation is needed
    # to stay within common file-name length limits.
    url_digest = sha256(url.encode()).hexdigest()
    filename = f'{url_digest}.md'
    # Write explicitly as UTF-8: crawled pages routinely contain non-ASCII
    # text, and the platform default encoding may not be able to represent it.
    with open(filename, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown)

In [34]:
async def product_scraper(url_list):
    """Crawl every URL in ``url_list`` and save each successful page as Markdown.

    All URLs are fetched with a single ``AsyncWebCrawler.arun_many`` call that
    uses the module-level ``browser_config``. For every successful result a
    short summary (title, word count, link count, image count) is printed and
    the page Markdown is stored via ``write_markdown_response``. Failed
    results are reported with their error message.

    Args:
        url_list: URLs to crawl; passed unchanged as ``urls`` to ``arun_many``.

    Returns:
        None. The effects are the printed report and the files written.
    """
    # Filter chain restricting URLs to */product/* and content to text/html.
    # NOTE(review): this chain is built but never attached to the strategy or
    # the run config below, so it currently has no effect. Confirm whether it
    # was meant to be passed as BFSDeepCrawlStrategy(filter_chain=...).
    filter_chain = FilterChain([
        # Only match URLs that contain a /product/ path segment
        URLPatternFilter("*/product/*"),
        # Only process HTML pages
        ContentTypeFilter(["text/html"])
    ])

    md_generator = DefaultMarkdownGenerator(
        options={
            "ignore_links": True,
            "escape_html": False,
            "body_width": 80,
            "skip_internal_links": True,
        }
    )

    # Breadth-first deep-crawl strategy.
    strategy = BFSDeepCrawlStrategy(
        max_depth=10,              # Maximum crawl depth
        include_external=False,    # Stay within the same domain
        max_pages=500,             # Maximum number of pages to crawl (optional)
        score_threshold=0.3,       # Minimum score for URLs to be crawled (optional)
    )

    config = CrawlerRunConfig(
        markdown_generator=md_generator
    )

    # Create the crawler and scraper
    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:

        word_count_threshold = 1000
        # NOTE(review): a deep-crawl strategy is passed as `extraction_strategy`
        # here. crawl4ai documents CrawlerRunConfig(deep_crawl_strategy=...) as
        # the deep-crawl hook, and the keyword arguments next to `config` look
        # like legacy options. Verify which of them the installed version
        # honours. They are left unchanged because wiring the strategy in would
        # expand the crawl well beyond `url_list`.
        results = await crawler.arun_many(
            urls=url_list,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,
            verbose=True,
            extraction_strategy=strategy,
            config=config
        )

        for result in results:
            if result.success:
                # Any of these fields may be missing on a result; fall back to
                # empty values so one sparse page cannot abort the whole report.
                metadata = result.metadata or {}
                links = result.links or {}
                media = result.media or {}
                markdown = result.markdown if result.markdown is not None else ""

                link_count = len(links.get('internal', [])) + len(links.get('external', []))

                print(f"Successfully crawled: {result.url}")
                print(f"Title: {metadata.get('title', 'N/A')}")
                print(f"Word count: {len(markdown.split())}")
                print(f"Number of links: {link_count}")
                print(f"Number of images: {len(media.get('images', []))}")
                write_markdown_response(result.url, markdown)
                print("---")
            else:
                print(f"Failed to crawl: {result.url}")
                print(f"Error: {result.error_message}")
                print("---")

In [35]:
# Top-level ``await`` is valid in an IPython/Jupyter cell; in a plain script
# this call would need to be wrapped, e.g. asyncio.run(product_scraper(url_list)).
await product_scraper(url_list)

Successfully crawled: https://www.arenaev.com/porsche_macan_4_winter_test-news-4288.php
Title: Porsche Macan 4 real life winter test - ArenaEV
Word count: 851
Number of links: 79
Number of images: 19
---
Successfully crawled: https://downtown-mag.com/en/porsche-taycan-vs-macan-comparison-test/
Title: Porsche Macan 4S vs. Taycan 4 Cross Turismo Compared
Word count: 7363
Number of links: 72
Number of images: 657
---
Successfully crawled: https://www.carmagazine.co.uk/car-reviews/porsche/macan-electric/
Title: Porsche Macan electric (2024) review: basic is best
Word count: 3478
Number of links: 185
Number of images: 66
---
Successfully crawled: https://www.macanevowners.com/forum/threads/our-porsche-macan-ev-turbo-test-drive-review-resultant-eliminated-options.17071/
Title: Our Porsche Macan EV Turbo Test Drive Review: Resultant Eliminated Options | Macan EV Forum, News, Info - MacanEVowners
Word count: 2971
Number of links: 127
Number of images: 42
---
Successfully crawled: https://www.c