# **TASK-1 B**

In [1]:
# Seed URLs grouped by topic.
# Each key is a category label (the scraping loop later in this notebook passes it to
# scrape_and_save, which uses it as the output file-name prefix); each value is the
# list of site URLs fetched for that category. Every category currently lists
# three URLs.
categories = {
    "Technology": [
        "https://techcrunch.com",
        "https://thenextweb.com",
        "https://www.wired.com"
    ],
    "Health": [
        "https://www.medicalnewstoday.com",
        "https://www.healthline.com",
        "https://www.webmd.com"
    ],
    "Sports": [
        "https://www.espn.com",
        "https://www.bbc.com/sport",
        "https://www.sportskeeda.com"
    ],
    "Finance": [
        "https://www.investopedia.com",
        "https://www.forbes.com",
        "https://www.bloomberg.com"
    ],
    "Science": [
        "https://www.sciencenews.org",
        "https://www.scientificamerican.com",
        "https://www.nature.com"
    ],
    "Entertainment": [
        "https://www.hollywoodreporter.com",
        "https://www.variety.com",
        "https://www.rottentomatoes.com"
    ],
    "Politics": [
        "https://www.politico.com",
        "https://www.nbcnews.com/politics",
        "https://www.reuters.com/politics"
    ],
    "Education": [
        "https://www.edutopia.org",
        "https://www.timeshighereducation.com",
        "https://www.theguardian.com/education"
    ],
    "Travel": [
        "https://www.lonelyplanet.com",
        "https://www.travelandleisure.com",
        "https://www.nationalgeographic.com/travel"
    ],
    "Gaming": [
        "https://www.pcgamer.com",
        "https://www.ign.com",
        "https://www.gamespot.com"
    ],
    "Automobile": [
        "https://www.caranddriver.com",
        "https://www.motortrend.com",
        "https://www.autoblog.com"
    ],
    "Fashion": [
        "https://www.vogue.com",
        "https://www.elle.com",
        "https://www.gq.com"
    ],
    "Food": [
        "https://www.bonappetit.com",
        "https://www.foodnetwork.com",
        "https://www.epicurious.com"
    ],
    "Cryptocurrency": [
        "https://www.coindesk.com",
        "https://cointelegraph.com",
        "https://www.cryptoslate.com"
    ],
    "History": [
        "https://www.history.com",
        "https://www.bbc.co.uk/history",
        "https://www.smithsonianmag.com/history"
    ],
    "Environment": [
        "https://www.nationalgeographic.com/environment",
        "https://www.sciencedaily.com/news/earth_climate",
        "https://www.worldwildlife.org"
    ],
    "Movies": [
        "https://www.imdb.com",
        "https://www.boxofficemojo.com",
        "https://www.fandango.com"
    ],
    "Artificial Intelligence": [
        "https://www.analyticsvidhya.com",
        "https://www.aitrends.com",
        "https://www.deepmind.com"
    ],
    "Cybersecurity": [
        "https://www.darkreading.com",
        "https://www.csoonline.com",
        "https://www.bleepingcomputer.com"
    ],
    "Business": [
        "https://www.wsj.com",
        "https://www.businessinsider.com",
        "https://hbr.org"
    ]
}


In [None]:
import os
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime

# Create the output directory up front: scrape_and_save writes its per-category
# text files into "scraped_data". exist_ok=True means re-running this cell does
# not fail when the directory already exists.
os.makedirs("scraped_data", exist_ok=True)

def extract_date(soup):
    """Return the publication date advertised in the page's <meta> tags.

    The keys "article:published_time", "date", "publish-date" and "pubdate"
    are tried in that order. For each key a <meta property=...> tag is looked
    up first and, only if none is found, a <meta name=...> tag. The first tag
    with a non-empty ``content`` attribute wins and that attribute is returned
    unchanged (no parsing or normalisation).

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The raw ``content`` string of the matching tag, or today's date
        formatted as YYYY-MM-DD when no candidate tag carries a value.
    """
    for key in ("article:published_time", "date", "publish-date", "pubdate"):
        meta = soup.find("meta", {"property": key})
        if not meta:
            meta = soup.find("meta", {"name": key})
        if not meta:
            continue
        if not meta.get("content"):
            # Tag exists but has no usable value; move on to the next key.
            continue
        return meta["content"]

    # Nothing usable in the page metadata: fall back to the current date.
    return datetime.today().strftime("%Y-%m-%d")

def clean_text(text):
    """Collapse each run of whitespace into a single space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The text with newlines, tabs and repeated spaces replaced by single
        spaces, and with no leading or trailing whitespace.
    """
    return re.sub(r"\s+", " ", text).strip()

def scrape_and_save(category, urls):
    """Scrape every URL of one category and write the results to a text file.

    For each URL the page is downloaded, its title, date (via ``extract_date``)
    and paragraph text are extracted, and a formatted record is collected.
    All records are then written to ``scraped_data/<category>111.txt``,
    overwriting any existing file. The file is written even when no URL could
    be scraped (it is then empty).

    Args:
        category: Category label; used as the output file-name prefix.
        urls: Iterable of page URLs to fetch.

    Returns:
        None. Progress and per-URL request failures are reported with print().
    """
    structured_data = []

    for url in urls:
        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # soup.title.string is None when <title> is empty or contains nested
            # markup, so calling .strip() on it would raise AttributeError (which
            # the except clause below does not catch). get_text() always returns
            # a str; clean_text also keeps the title on a single line.
            title = clean_text(soup.title.get_text()) if soup.title else ""
            if not title:
                title = "Untitled Article"

            date = extract_date(soup)

            # Pass an explicit separator: get_text(strip=True) alone joins the
            # text nodes of a paragraph with "", fusing words that sit on either
            # side of inline tags such as <a> or <b>.
            paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
            content = "\n".join(text for text in paragraphs if text)

            structured_data.append(f"=== {title} ===\nDate: {date}\nSource: {url}\n--------------------\n{clean_text(content)}\n\n")

        except requests.RequestException as e:
            # Best effort: report the failing URL and continue with the rest.
            print(f"Error scraping {url}: {e}")

    file_path = os.path.join("scraped_data", f"{category}111.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(structured_data)

    print(f"Structured data saved for category: {category}")



# Scrape data: one scrape_and_save call (and therefore one output file) per category.
# NOTE: this sends a live HTTP request for every URL in `categories`, each with a
# 10-second timeout, so the cell needs network access to produce any content.
for category, urls in categories.items():
    scrape_and_save(category, urls)
