## 3. Retrieve Article Data

### This file collects the text of the scraped articles for future semantic matching.

In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin, urlparse
import logging

In [9]:
# Configure the root logger: INFO and above, with every record prefixed by
# its timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def extract_article_text(html_content, url):
    """Extract the main readable text from an article's HTML.

    Strategies are tried in order until one yields text:
      1. The first selector, from a fixed list of common article containers,
         whose matching elements contain non-empty text.
      2. Every ``<p>`` element whose stripped text is longer than 50
         characters.
      3. The full text of ``<body>``.

    Args:
        html_content: HTML markup handed to BeautifulSoup.
        url: URL the markup came from. Not used by the extraction itself;
            kept so existing callers keep working.

    Returns:
        The extracted text with surrounding whitespace removed, or an empty
        string when nothing could be extracted.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop scripts, styles and page chrome so they never leak into the text.
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()

    selectors = [
        'article',
        '[role="main"]',
        '.article-content',
        '.post-content',
        '.entry-content',
        '.content',
        '.main-content',
        '#content',
        '.article-body',
        '.story-body',
    ]
    text_content = ""

    for selector in selectors:
        elements = soup.select(selector)
        if not elements:
            continue

        # A match nested inside another match is already covered by the outer
        # element's text; keeping both would duplicate it. Compare by identity
        # because bs4 tags compare equal by content.
        matched_ids = {id(element) for element in elements}
        outermost = [
            element for element in elements
            if not any(id(parent) in matched_ids for parent in element.parents)
        ]

        pieces = [el.get_text(separator=' ', strip=True) for el in outermost]
        text_content = ' '.join(piece for piece in pieces if piece)

        # Only stop once a selector actually produced text; an empty
        # placeholder container must not block the remaining selectors.
        if text_content:
            break

    if not text_content.strip():
        # Fallback: substantial paragraphs only (short ones are usually
        # captions, bylines or boilerplate).
        paragraph_texts = (p.get_text(strip=True) for p in soup.find_all('p'))
        text_content = ' '.join(t for t in paragraph_texts if len(t) > 50)

    if not text_content.strip():
        # Last resort: whatever text is left in the body.
        body = soup.find('body')
        if body is not None:
            text_content = body.get_text(separator=' ', strip=True)

    return text_content.strip()

def scrape_article(url, max_retries=3):
    """Download an article page and return its extracted text.

    Failed attempts are retried with exponential backoff (1s, 2s, 4s, ...
    between attempts). Failures that a retry cannot fix -- a malformed URL,
    or a 4xx response other than 408/429 -- abort immediately instead of
    burning the remaining attempts.

    Args:
        url: Address of the article page.
        max_retries: Maximum number of download attempts.

    Returns:
        The text produced by ``extract_article_text`` (a warning is logged
        when it is shorter than 100 characters), or ``None`` when the page
        could not be downloaded.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Raised while preparing the request, before anything is sent; the same
    # URL will fail the same way on every attempt.
    invalid_url_errors = (
        requests.exceptions.MissingSchema,
        requests.exceptions.InvalidSchema,
        requests.exceptions.InvalidURL,
    )

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Extract text content
            article_text = extract_article_text(response.content, url)

            if len(article_text) < 100:  # Warn if content is too short
                logging.warning(f"Short content for {url}: {len(article_text)} characters")
            return article_text
        except requests.exceptions.RequestException as e:
            logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")

            # Only HTTP errors carry a response; for the others status is None.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            # 408 (request timeout) and 429 (too many requests) are worth
            # retrying; other client errors such as 404 are not.
            client_error = (
                status is not None
                and 400 <= status < 500
                and status not in (408, 429)
            )
            if client_error or isinstance(e, invalid_url_errors):
                logging.error(f"Not retrying {url}: the error is not recoverable")
                return None

            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                logging.error(f"Failed to scrape {url} after {max_retries} attempts")
                return None

    # Reached only when max_retries <= 0, i.e. no attempt was made.
    return None

In [11]:
def main():
    """Scrape the text of every article listed in the links CSV into JSON.

    Reads ``./intermediate_data/Scraped_Article_Links.csv`` (which must have
    the columns ``title``, ``link``, ``date`` and ``source``), downloads each
    article with ``scrape_article`` and writes one record per article to
    ``./intermediate_data/Scraped_Article_Data.json``. The ``text`` field is
    ``None`` for an article that could not be downloaded or has no link.
    Errors while reading the CSV or writing the JSON are logged and end the
    run without raising.
    """
    # Read the CSV file
    try:
        df = pd.read_csv('./intermediate_data/Scraped_Article_Links.csv')
        logging.info(f"Loaded {len(df)} articles from CSV")
    except Exception as e:
        logging.error(f"Error reading CSV: {str(e)}")
        return

    required_columns = ['title', 'link', 'date', 'source']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        logging.error(f"Missing required columns: {missing_columns}")
        return

    def _clean(value):
        # Empty CSV cells are read as NaN, which json.dump would emit as the
        # non-standard token NaN; store them as None (JSON null) instead.
        return None if pd.isna(value) else value

    total = len(df)
    articles_data = []
    # Count rows by position so the progress message does not depend on the
    # DataFrame's index labels.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        logging.info(f"Processing article {position}/{total}: {row['title']}")

        link = _clean(row['link'])
        if link is None:
            logging.warning(f"Article {position}/{total} has no link; skipping download")
            article_text = None
        else:
            article_text = scrape_article(link)

        articles_data.append({
            'title': _clean(row['title']),
            'source': _clean(row['source']),
            'date': _clean(row['date']),
            'link': link,
            'text': article_text,
        })

        # Pause between requests; nothing follows the last one.
        if link is not None and position < total:
            time.sleep(1)

    # Save to JSON file
    output_filename = './intermediate_data/Scraped_Article_Data.json'
    try:
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, indent=2, ensure_ascii=False)
        print(f"Output saved to: {output_filename}")
    except Exception as e:
        logging.error(f"Error saving JSON file: {str(e)}")


if __name__ == "__main__":
    main()

2025-06-06 14:52:58,196 - INFO - Loaded 10 articles from CSV
2025-06-06 14:52:58,218 - INFO - Processing article 1/10: Elektra Awards 2025 open for entries
2025-06-06 14:53:00,170 - INFO - Processing article 2/10: NMI hosts industry conference in Glasgow with theme of growth
2025-06-06 14:53:03,227 - INFO - Processing article 3/10: DigiKey introduces own-brand DigiKey Standard product line
2025-06-06 14:53:05,050 - INFO - Processing article 4/10: CHIIPS podcast interview with industry veteran Ash Madni
2025-06-06 14:53:06,719 - INFO - Processing article 5/10: Get Mannerisms, Gadget Master, the Daily and the Weekly, in newsletter form
2025-06-06 14:53:08,473 - INFO - Processing article 6/10: Elektra Awards 2025 looking for tech stars – companies, entrepreneurs and designers
2025-06-06 14:53:10,211 - INFO - Processing article 7/10: IPC praises President Trump for defence industry support
2025-06-06 14:53:11,811 - INFO - Processing article 8/10: IPC sets the industry on the path to sustai

Output saved to: ./intermediate_data/Scraped_Article_Data.json
