## 2. Collect Articles

### Next, we collect recent articles by scraping different sites such as EDN Network, EE Times, Electronic Design, Electronics Weekly


EDN Network, https://www.edn.com/?s=Practical+design+articles+and+component+selection+guides

EE Times, https://www.eetimes.com/?s=New+component+announcements%2C+technology+trends%2C+and+design+techniques

Electronic Design, https://www.electronicdesign.com/search?page=1&filters=%7B%22text%22%3A%22Design%20methodologies%20and%20component%20comparisons%22%2C%22page%22%3A1%2C%22status%22%3A%5B%221%22%5D%2C%22impliedSchedules%22%3Atrue%7D&sort=score

Electronics Weekly, https://www.electronicsweekly.com/?s=Product+launches+and+industry+news&type=&category=0&year=0&orderby=name


In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import re
from urllib.parse import urljoin, urlparse
import csv
import os

In [17]:
class ArticleScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        })
        self.target_month = 5 
        self.target_year = 2025
        
    def parse_date(self, date_str):
        """Parse various date formats commonly found on websites"""
        if not date_str:
            return None
        # Clean the date strings
        date_str = date_str.strip().lower()
        patterns = [
            r'(\d{1,2})/(\d{1,2})/(\d{4})',  # MM/DD/YYYY
            r'(\d{4})-(\d{1,2})-(\d{1,2})',  # YYYY-MM-DD
            r'(\d{1,2})-(\d{1,2})-(\d{4})',  # DD-MM-YYYY
            r'(\w+)\s+(\d{1,2}),?\s+(\d{4})',  # Month DD, YYYY
        ]
        
        for pattern in patterns:
            match = re.search(pattern, date_str)
            if match:
                try:
                    if 'month' in pattern:
                        month_name, day, year = match.groups()
                        month_dict = {
                            'january': 1, 'february': 2, 'march': 3, 'april': 4,
                            'may': 5, 'june': 6, 'july': 7, 'august': 8,
                            'september': 9, 'october': 10, 'november': 11, 'december': 12,
                            'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                            'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                        }
                        month = month_dict.get(month_name.lower())
                        if month:
                            return datetime(int(year), month, int(day))
                    else:
                        parts = match.groups()
                        if len(parts) == 3:
                            if len(parts[0]) == 4:  # YYYY-MM-DD
                                return datetime(int(parts[0]), int(parts[1]), int(parts[2]))
                            else:  # MM/DD/YYYY or DD-MM-YYYY
                                return datetime(int(parts[2]), int(parts[0]), int(parts[1]))
                except (ValueError, TypeError):
                    continue
        
        return None
    
    def is_recent(self, date_obj):
        if not date_obj:
            return True  # Include articles without clear dates for manual review
        return date_obj.year == self.target_year and date_obj.month == self.target_month
    
    def make_request(self, url, max_retries=3):
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=30)  # Increased timeout
                response.raise_for_status()
                return response
            except requests.exceptions.Timeout:
                if attempt < max_retries - 1:
                    time.sleep(5)  # Wait before retry
                continue
            except requests.exceptions.RequestException as e:
                if attempt < max_retries - 1:
                    time.sleep(5)
                continue
        return None
    
    def scrape_edn_network(self, url):
        articles = []
        try:
            response = self.make_request(url)
            if not response:
                return articles
                
            soup = BeautifulSoup(response.content, 'html.parser')
            print(f"  Page title: {soup.title.string if soup.title else 'No title found'}")
            selectors = [
                'article',
                '.post',
                '.entry',
                '.search-result',
                '.article-item',
                '[class*="post"]',
                '[class*="article"]'
            ]
            article_elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    article_elements = elements
                    print(f"  Found {len(elements)} elements with selector: {selector}")
                    break
            
            if not article_elements:
                article_elements = soup.find_all('a', href=True)
                print(f"  Fallback: Found {len(article_elements)} links")
            for element in article_elements[:30]:  # Increased limit
                title = ""
                link = ""
                date_str = ""
                if hasattr(element, 'get_text'):
                    title_elem = element.find(['h1', 'h2', 'h3', 'h4', 'h5'])
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                    elif element.name == 'a':
                        title = element.get_text(strip=True)
                    else:
                        title = element.get_text(strip=True)[:100]
                if element.name == 'a' and element.get('href'):
                    link = urljoin(url, element['href'])
                else:
                    link_elem = element.find('a', href=True)
                    if link_elem:
                        link = urljoin(url, link_elem['href'])
                date_elem = element.find(['time', 'span', 'div'], class_=re.compile(r'date|time|publish', re.I))
                if date_elem:
                    date_str = date_elem.get_text(strip=True)
                if title and link and len(title) > 10:  
                    date_obj = self.parse_date(date_str)
                    
                    if self.is_recent(date_obj):
                        articles.append({
                            'title': title,
                            'link': link,
                            'date': date_str or "Date not found",
                            'source': 'EDN Network'
                        })
                        print(f"    Added: {title[:50]}...")  
        except Exception as e:
            print(f"Error scraping EDN Network: {e}")
        return articles
    
    def scrape_ee_times(self, url):
        """Scrape EE Times articles"""
        articles = []
        try:
            response = self.make_request(url)
            if not response:
                return articles
            soup = BeautifulSoup(response.content, 'html.parser')
            print(f"  Page title: {soup.title.string if soup.title else 'No title found'}")
            
            # Multiple selectors to try
            selectors = [
                'article',
                '.story',
                '.post',
                '.search-result',
                '.article-item',
                '[class*="story"]',
                '[class*="post"]'
            ]
            
            article_elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    article_elements = elements
                    print(f"  Found {len(elements)} elements with selector: {selector}")
                    break
            
            if not article_elements:
                article_elements = soup.find_all('a', href=True)
                print(f"  Fallback: Found {len(article_elements)} links")
            
            for element in article_elements[:30]:
                title = ""
                link = ""
                date_str = ""
                
                # Extract title
                if hasattr(element, 'get_text'):
                    title_elem = element.find(['h1', 'h2', 'h3', 'h4', 'h5'])
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                    elif element.name == 'a':
                        title = element.get_text(strip=True)
                    else:
                        title = element.get_text(strip=True)[:100]
                
                # Extract link
                if element.name == 'a' and element.get('href'):
                    link = urljoin(url, element['href'])
                else:
                    link_elem = element.find('a', href=True)
                    if link_elem:
                        link = urljoin(url, link_elem['href'])
                
                # Extract date
                date_elem = element.find(['time', 'span', 'div'], class_=re.compile(r'date|time|publish', re.I))
                if date_elem:
                    date_str = date_elem.get_text(strip=True)
                if title and link and len(title) > 10:
                    date_obj = self.parse_date(date_str)
                    if self.is_recent(date_obj):
                        articles.append({
                            'title': title,
                            'link': link,
                            'date': date_str or "Date not found",
                            'source': 'EE Times'
                        })
                        print(f"    Added: {title[:50]}...")
                            
        except Exception as e:
            print(f"Error scraping EE Times: {e}")  
        return articles
    
    def scrape_electronic_design(self, url):
        """Scrape Electronic Design articles"""
        articles = []
        try:
            response = self.make_request(url)
            if not response:
                return articles   
            soup = BeautifulSoup(response.content, 'html.parser')
            print(f"  Page title: {soup.title.string if soup.title else 'No title found'}")
            # Multiple selectors to try
            selectors = [
                '.search-result',
                '.result-item',
                'article',
                '.post',
                '.item',
                '[class*="result"]',
                '[class*="item"]'
            ]
            
            article_elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    article_elements = elements
                    print(f"  Found {len(elements)} elements with selector: {selector}")
                    break
            
            if not article_elements:
                article_elements = soup.find_all('a', href=True)
                print(f"  Fallback: Found {len(article_elements)} links")
            for element in article_elements[:30]:
                title = ""
                link = ""
                date_str = ""
                if hasattr(element, 'get_text'):
                    title_elem = element.find(['h1', 'h2', 'h3', 'h4', 'h5'])
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                    elif element.name == 'a':
                        title = element.get_text(strip=True)
                    else:
                        title = element.get_text(strip=True)[:100]
                
                # Extract link
                if element.name == 'a' and element.get('href'):
                    link = urljoin(url, element['href'])
                else:
                    link_elem = element.find('a', href=True)
                    if link_elem:
                        link = urljoin(url, link_elem['href'])
                
                # Extract date
                date_elem = element.find(['time', 'span', 'div'], class_=re.compile(r'date|time|publish', re.I))
                if date_elem:
                    date_str = date_elem.get_text(strip=True)
                
                if title and link and len(title) > 10:
                    date_obj = self.parse_date(date_str)
                    
                    if self.is_recent(date_obj):
                        articles.append({
                            'title': title,
                            'link': link,
                            'date': date_str or "Date not found",
                            'source': 'Electronic Design'
                        })
                        print(f"    Added: {title[:50]}...")
                            
        except Exception as e:
            print(f"Error scraping Electronic Design: {e}") 
        return articles
    
    def scrape_electronics_weekly(self, url):
        """Scrape Electronics Weekly articles"""
        articles = []
        try:
            response = self.make_request(url)
            if not response:
                return articles
            soup = BeautifulSoup(response.content, 'html.parser')
            print(f"  Page title: {soup.title.string if soup.title else 'No title found'}")
            
            # Multiple selectors to try
            selectors = [
                'article',
                '.post',
                '.story',
                '.search-result',
                '.article-item',
                '[class*="post"]',
                '[class*="story"]'
            ]
            
            article_elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    article_elements = elements
                    print(f"  Found {len(elements)} elements with selector: {selector}")
                    break
            
            if not article_elements:
                article_elements = soup.find_all('a', href=True)
                print(f"  Fallback: Found {len(article_elements)} links")
            
            for element in article_elements[:30]:
                title = ""
                link = ""
                date_str = ""
                if hasattr(element, 'get_text'):
                    title_elem = element.find(['h1', 'h2', 'h3', 'h4', 'h5'])
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                    elif element.name == 'a':
                        title = element.get_text(strip=True)
                    else:
                        title = element.get_text(strip=True)[:100]
                
                # Extract link
                if element.name == 'a' and element.get('href'):
                    link = urljoin(url, element['href'])
                else:
                    link_elem = element.find('a', href=True)
                    if link_elem:
                        link = urljoin(url, link_elem['href'])
                
                # Extract date
                date_elem = element.find(['time', 'span', 'div'], class_=re.compile(r'date|time|publish', re.I))
                if date_elem:
                    date_str = date_elem.get_text(strip=True)
                
                if title and link and len(title) > 10:
                    date_obj = self.parse_date(date_str)
                    
                    if self.is_recent(date_obj):
                        articles.append({
                            'title': title,
                            'link': link,
                            'date': date_str or "Date not found",
                            'source': 'Electronics Weekly'
                        })
                        print(f"    Added: {title[:50]}...")
                            
        except Exception as e:
            print(f"Error scraping Electronics Weekly: {e}")
        return articles
    
    def scrape_all_sources(self, csv_file_path):
        """Main function to scrape all sources from CSV"""
        all_articles = []
        
        try:
            df = pd.read_csv(csv_file_path)
            
            for index, row in df.iterrows():
                source_name = row['Article'].strip()
                url = row['Link'].strip()
                if 'EDN Network' in source_name:
                    articles = self.scrape_edn_network(url)
                elif 'EE Times' in source_name:
                    articles = self.scrape_ee_times(url)
                elif 'Electronic Design' in source_name:
                    articles = self.scrape_electronic_design(url)
                elif 'Electronics Weekly' in source_name:
                    articles = self.scrape_electronics_weekly(url)
                else:
                    print(f"Unknown source: {source_name}")
                    continue
                
                all_articles.extend(articles)
                print(f"Found {len(articles)} articles from May 2025 from {source_name}")
                
                # Be respectful - add delay between requests
                time.sleep(3)  # Increased delay
                
        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return []
        
        return all_articles
    
    def save_results(self, articles, output_file='./intermediate_data/Scraped_Article_Links.csv'):
        if not articles:
            print("No articles found to save.")
            return
        df = pd.DataFrame(articles)
        df.to_csv(output_file, index=False)
        print(f"Saved {len(articles)} articles to {output_file}")

def main():
    scraper = ArticleScraper()
    articles = scraper.scrape_all_sources('./data/Article_Links.csv')
    print(f"\n--- SCRAPING RESULTS ---")
    print(f"Total articles found: {len(articles)}")
    if articles:
        scraper.save_results(articles)
        by_source = {}
        for article in articles:
            source = article['source']
            if source not in by_source:
                by_source[source] = 0
            by_source[source] += 1
        print("Articles by source:")
        for source, count in by_source.items():
            print(f"  {source}: {count} articles")
    else:
        print("No recent articles found. This might be due to:")
        
if __name__ == "__main__":
    main()

Found 0 articles from May 2025 from EDN Network
Found 0 articles from May 2025 from EE Times


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


  Page title: No title found
  Fallback: Found 0 links
Found 0 articles from May 2025 from Electronic Design
  Page title: Search Results for :: Electronics Weekly
  Found 10 elements with selector: article
    Added: Elektra Awards 2025 open for entries...
    Added: NMI hosts industry conference in Glasgow with them...
    Added: DigiKey introduces own-brand DigiKey Standard prod...
    Added: CHIIPS podcast interview with industry veteran Ash...
    Added: Get Mannerisms, Gadget Master, the Daily and the W...
    Added: Elektra Awards 2025 looking for tech stars – compa...
    Added: IPC praises President Trump for defence industry s...
    Added: IPC sets the industry on the path to sustainabilit...
    Added: Imec launches Stuttgart advanced chip design accel...
    Added: Auto hi-voltage detector claims fastest response t...
Found 10 articles from May 2025 from Electronics Weekly

--- SCRAPING RESULTS ---
Total articles found: 10
Saved 10 articles to ./intermediate_data/Scraped_A