In [5]:
import requests
from bs4 import BeautifulSoup
import json
import time

# Target: A high-quality brand strategy blog
# (Note: Always check a site's robots.txt before scraping)
BASE_URL = "https://www.ebaqdesign.com/blog" 

def scrape_brand_strategy():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    response = requests.get(BASE_URL, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all article links (specific to Ebaqdesign's structure)
    articles = soup.find_all('a', class_='blog-post-title-link')
    
    training_data = []

    for art in articles[:10]: # Let's start with the first 10 articles
        link = art['href']
        if not link.startswith('http'):
            link = "https://www.ebaqdesign.com" + link
            
        print(f"Scraping: {link}")
        
        # Get the actual article content
        art_res = requests.get(link, headers=headers)
        art_soup = BeautifulSoup(art_res.text, 'html.parser')
        
        title = art_soup.find('h1').get_text(strip=True)
        # Grab paragraphs from the main content area
        paragraphs = art_soup.find_all('p')
        content = " ".join([p.get_text(strip=True) for p in paragraphs if len(p.get_text()) > 50])

        # Create 2-3 Q&A pairs per article to maximize the data
        if content:
            # Pair 1: General explanation
            training_data.append({
                "instruction": f"Explain the core concepts of {title}.",
                "output": content[:1000] # Keep it concise for a small model
            })
            # Pair 2: Summary request
            training_data.append({
                "instruction": f"Give me a summary of brand strategy regarding {title}.",
                "output": f"In the context of brand design, {title} focuses on: {content[:500]}..."
            })
            
        time.sleep(1) # Be a nice human, don't spam the server

    # Save as JSONL for the training code provided earlier
    with open('brand_strategy_data.jsonl', 'w', encoding='utf-8') as f:
        for entry in training_data:
            f.write(json.dumps(entry) + '\n')
            
    print(f"Success! Saved {len(training_data)} training pairs to brand_strategy_data.jsonl")

if __name__ == "__main__":
    scrape_brand_strategy()

Success! Saved 0 training pairs to brand_strategy_data.jsonl
