## 1. Collect Articles

### Next, we collect recent articles by scraping the search-results pages of the following electronics trade publications.


EDN Network, https://www.edn.com/?s=Practical+design+articles+and+component+selection+guides

EE Times, https://www.eetimes.com/?s=New+component+announcements%2C+technology+trends%2C+and+design+techniques

Electronic Design, https://www.electronicdesign.com/search?page=1&filters=%7B%22text%22%3A%22Design%20methodologies%20and%20component%20comparisons%22%2C%22page%22%3A1%2C%22status%22%3A%5B%221%22%5D%2C%22impliedSchedules%22%3Atrue%7D&sort=score

Electronics Weekly, https://www.electronicsweekly.com/?s=Product+launches+and+industry+news&type=&category=0&year=0&orderby=name


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import re
from urllib.parse import urljoin, urlparse
import csv
import os

In [10]:
class ArticleDataCollector:
    """Collect article metadata (title, link, raw date) from listing pages.

    Pages are fetched through a shared ``requests.Session`` that sends a
    browser-like User-Agent, parsed with BeautifulSoup, and reduced to a
    list of plain dictionaries (see ``scrape_website`` for the keys).
    """

    def __init__(self):
        # One session for all requests so the User-Agent header is sent
        # consistently and connections can be reused.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })

    def make_request(self, url, retries=3, delay=5):
        """Fetch ``url``, retrying on any ``requests`` error.

        Args:
            url: Address to fetch.
            retries: Maximum number of attempts.
            delay: Seconds to wait between failed attempts.

        Returns:
            The successful response, or ``None`` if every attempt failed
            (including HTTP error statuses, via ``raise_for_status``).
        """
        for attempt in range(retries):
            try:
                print(f"Fetching: {url} (Attempt {attempt + 1})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                print(f"  Error: {e}")
                # No point waiting after the final attempt; nothing follows it.
                if attempt < retries - 1:
                    time.sleep(delay)
        return None

    @staticmethod
    def _extract_title(element):
        """Return the best available title text for ``element``.

        Candidates, in priority order: a heading tag (h1-h5), a descendant
        whose class matches "title"/"headline", the element's ``title``
        attribute, its ``aria-label`` attribute, and finally its full text.
        A candidate whose text is empty is skipped.
        """
        title_tag = (element.find(['h1', 'h2', 'h3', 'h4', 'h5']) or
                     element.find(class_=re.compile("title|headline", re.I)))
        if title_tag is not None:
            # A found tag is always truthy, so it must be converted to text
            # explicitly; slicing the tag itself is an attribute lookup.
            text = title_tag.get_text(strip=True)
            if text:
                return text

        for attribute in ('title', 'aria-label'):
            value = element.get(attribute)
            if value:
                return str(value)

        return element.get_text(strip=True)

    def extract_article_data(self, element, base_url):
        """Extract ``(title, link, date_str)`` from one candidate element.

        Args:
            element: Parsed element (BeautifulSoup tag) that may describe an
                article.
            base_url: URL used to resolve relative ``href`` values.

        Returns:
            A tuple of three stripped strings. The title is truncated to 100
            characters. Any field that cannot be found is ``""``; if
            extraction raises, all three are ``""``.
        """
        try:
            title = self._extract_title(element)[:100]

            # The element may itself be the anchor (fallback path of
            # scrape_website) or merely contain one.
            if element.name == 'a' and element.get('href'):
                link_tag = element
            else:
                link_tag = element.find('a', href=True)
            link = urljoin(base_url, link_tag['href']) if link_tag else ''

            date_elem = (
                element.find(['time', 'span', 'div'],
                             class_=re.compile(r'date|time|publish|created|updated', re.I))
                or element.find(attrs={'datetime': True})
                or element.find(attrs={'data-date': True})
            )
            if date_elem:
                # Prefer machine-readable attributes over the visible text.
                date_str = (date_elem.get('datetime')
                            or date_elem.get('data-date')
                            or date_elem.get_text(strip=True))
            else:
                date_str = ''

            return title.strip(), link.strip(), date_str.strip()
        except Exception as e:
            # Deliberately best-effort: one malformed element must not abort
            # the whole page, so report it and return empty fields.
            print(f"  Extraction error: {e}")
            return "", "", ""

    def scrape_website(self, url, source):
        """Scrape one listing page and return the articles found on it.

        Args:
            url: Page to fetch; also the base for resolving relative links.
            source: Label stored in each article's ``'source'`` field.

        Returns:
            A list of dicts with keys ``title``, ``link``, ``date_raw``,
            ``source`` and ``scraped_at``. Empty if the page could not be
            fetched. At most the first 50 candidate elements are examined,
            and each link is reported only once.
        """
        articles, seen_links = [], set()
        response = self.make_request(url)
        if not response:
            return articles

        soup = BeautifulSoup(response.content, 'html.parser')
        print(f"  Title: {soup.title.string if soup.title else 'No title'}")

        # Ordered from most to least specific; the first selector that
        # matches anything wins.
        selectors = [
            'article', '.post', '.story', '.entry', '.search-result',
            '.article-item', '.news-item', '.content-item',
            '[class*="post"]', '[class*="article"]', '[class*="story"]',
            '[class*="result"]', '[class*="item"]', '[class*="news"]'
        ]

        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                print(f"  Found {len(elements)} articles with selector: {selector}")
                break
        else:
            # No selector matched: fall back to the links inside the first
            # content-like container, or every link on the page.
            elements = soup.find_all(['main', 'section', 'div'], class_=re.compile('content|articles|posts|results', re.I))
            elements = elements[0].find_all('a', href=True) if elements else soup.find_all('a', href=True)
            print(f"  Fallback: Found {len(elements)} links")

        skip_markers = ('javascript:', 'mailto:', '#', 'tel:')
        for element in elements[:50]:
            title, link, date_str = self.extract_article_data(element, url)
            # Very short titles are treated as navigation noise.
            if not (title and link and len(title) > 10) or link in seen_links:
                continue
            lowered = link.lower()
            if any(marker in lowered for marker in skip_markers):
                continue
            seen_links.add(link)
            articles.append({
                'title': title,
                'link': link,
                'date_raw': date_str or "Date not found",
                'source': source,
                'scraped_at': datetime.now().isoformat()
            })
            print(f"    Collected: {title[:50]}")

        return articles

    def collect_from_sources(self, csv_path):
        """Scrape every source listed in a CSV file.

        Args:
            csv_path: Path to a CSV read with pandas; the ``Article`` column
                is used as the source name and ``Link`` as the URL.

        Returns:
            The combined list of article dicts from all sources, or ``[]``
            if the CSV file does not exist. Rows without a URL are skipped.
        """
        if not os.path.exists(csv_path):
            print(f"CSV not found: {csv_path}")
            return []

        df = pd.read_csv(csv_path)
        print(f"Loaded {len(df)} sources")

        all_articles = []
        for _, row in df.iterrows():
            source = str(row.get('Article', '')).strip()
            url = str(row.get('Link', '')).strip()
            # A missing cell is read as NaN, which str() turns into 'nan'.
            if not url or url.lower() == 'nan':
                continue
            print(f"\nScraping {source}...")
            all_articles.extend(self.scrape_website(url, source))
            time.sleep(3)  # Respectful delay
        return all_articles

    def get_summary(self, articles):
        """Summarise a list of collected articles.

        Args:
            articles: Article dicts as produced by ``scrape_website``.

        Returns:
            The string ``"No articles collected."`` for an empty list;
            otherwise a dict with ``total_articles``, ``by_source`` (count
            per source), ``with_dates`` and ``without_dates``.
        """
        if not articles:
            return "No articles collected."

        without_dates = sum(1 for a in articles if a['date_raw'] == "Date not found")
        by_source = {}
        for a in articles:
            by_source[a['source']] = by_source.get(a['source'], 0) + 1

        return {
            'total_articles': len(articles),
            'by_source': by_source,
            'with_dates': len(articles) - without_dates,
            'without_dates': without_dates
        }
    
def main(csv_path='./data/article-links.csv', output_path='./data/collected-articles.csv'):
    """Collect articles from every source in ``csv_path`` and report results.

    Args:
        csv_path: CSV file listing the sources to scrape.
        output_path: Where the collected articles are written as CSV.

    Prints the total and per-source counts. When nothing was collected,
    prints a list of likely causes instead and writes no file.
    """
    # The class defined in this file is ArticleDataCollector; its entry point
    # is collect_from_sources and it has no save method of its own.
    collector = ArticleDataCollector()

    # Scrape articles from all sources
    articles = collector.collect_from_sources(csv_path)

    # Display results
    print("\n=== SCRAPING RESULTS ===")
    print(f"Total articles found: {len(articles)}")

    if not articles:
        print("No articles found. This might be due to:")
        print("- Website structure changes")
        print("- Anti-scraping measures")
        print("- Network issues")
        print(f"- A missing or empty source list: {csv_path}")
        return

    # Save to CSV (create the target directory if it does not exist yet)
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    pd.DataFrame(articles).to_csv(output_path, index=False)
    print(f"Saved {len(articles)} articles to {output_path}")

    # Group by source for summary
    by_source = collector.get_summary(articles)['by_source']
    print("Articles by source:")
    for source, count in by_source.items():
        print(f"  {source}: {count} articles")

# Run the collection pipeline only when this module is the entry point
# (script execution or a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Scraping EDN Network...
  Attempting to fetch: https://www.edn.com/?s=Practical+design+articles+and+component+selection+guides (attempt 1)
  Request error on attempt 1: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
  Attempting to fetch: https://www.edn.com/?s=Practical+design+articles+and+component+selection+guides (attempt 2)
  Timeout on attempt 2
  Attempting to fetch: https://www.edn.com/?s=Practical+design+articles+and+component+selection+guides (attempt 3)
  Timeout on attempt 3
Found 0 articles from May 2025 from EDN Network
Scraping EE Times...
  Attempting to fetch: https://www.eetimes.com/?s=New+component+announcements%2C+technology+trends%2C+and+design+techniques (attempt 1)
  Request error on attempt 1: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
  Attempting to fetch: https://www.eetimes.com/?s=New+component+announcements%2C+technology+trends%2C+and+design+techniques (attempt 2

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


  Page title: No title found
  Fallback: Found 0 links
Found 0 articles from May 2025 from Electronic Design
Scraping Electronics Weekly...
  Attempting to fetch: https://www.electronicsweekly.com/?s=Product+launches+and+industry+news&type=&category=0&year=0&orderby=name (attempt 1)
  Page title: Search Results for :: Electronics Weekly
  Found 10 elements with selector: article
    Added: Elektra Awards 2025 open for entries...
    Added: NMI hosts industry conference in Glasgow with them...
    Added: DigiKey introduces own-brand DigiKey Standard prod...
    Added: CHIIPS podcast interview with industry veteran Ash...
    Added: Get Mannerisms, Gadget Master, the Daily and the W...
    Added: Elektra Awards 2025 looking for tech stars – compa...
    Added: IPC praises President Trump for defence industry s...
    Added: IPC sets the industry on the path to sustainabilit...
    Added: Imec launches Stuttgart advanced chip design accel...
    Added: Auto hi-voltage detector claims fast