In [2]:
import json
import os
import time
from datetime import datetime
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

class EfsynScraper:
    """Scrape article teasers and article pages from the "/politiki" section.

    The scraper walks the paginated listing at ``<base_url>/politiki``,
    follows each teaser link, and collects one dictionary per article.
    State kept on the instance:

    * ``seen_urls`` -- article URLs already requested (duplicates are skipped)
    * ``article_count`` -- number of articles collected so far
    * ``MAX_ARTICLES`` -- upper bound at which scraping stops
    """

    def __init__(self, base_url: str = "https://www.efsyn.gr",
                 timeout: float = 30.0, max_articles: int = 500):
        """Create a scraper.

        Args:
            base_url: Site root; the listing is read from ``base_url + "/politiki"``.
            timeout: Seconds to wait for each HTTP request before giving up.
            max_articles: Stop once this many articles have been collected.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.seen_urls = set()  # Track processed URLs to avoid duplicates
        self.article_count = 0  # Track total articles processed
        self.MAX_ARTICLES = max_articles

    def _get_soup(self, url: str) -> "BeautifulSoup":
        """Fetch ``url`` and return the parsed document.

        Raises:
            requests.RequestException: on connection errors, timeouts, or a
                non-2xx status (via ``raise_for_status``).
        """
        # Without a timeout a stalled connection would block the scrape forever.
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def _parse_date(self, date_element) -> str:
        """Return the ``datetime`` attribute of a ``<time>`` element, or ""."""
        if date_element is None:
            return ""
        return date_element.get('datetime', '') or ""

    def _clean_text(self, text: str) -> str:
        """Collapse all whitespace runs to single spaces and trim the ends.

        ``None`` or an empty string yields "".
        """
        if not text:
            return ""
        return ' '.join(text.split())

    def _extract_article_content(self, soup: "BeautifulSoup") -> str:
        """Return the text of all ``<p>`` tags inside ``div.article__body``.

        Script/style tags and ``div.adv`` ad containers are removed first so
        their contents cannot leak into paragraph text. Returns "" when the
        article body is not found.
        """
        article_body = soup.find('div', class_='article__body')
        if not article_body:
            return ""

        # Scripts and styles carry no class, so they must be matched by name
        # alone (a combined name + class filter would miss them).
        for element in article_body.find_all(['script', 'style']):
            element.decompose()

        # Re-query after every removal: an ad container nested inside another
        # one is destroyed together with its parent and must not be touched.
        ad_container = article_body.find('div', class_='adv')
        while ad_container is not None:
            ad_container.decompose()
            ad_container = article_body.find('div', class_='adv')

        # Extract text from remaining paragraphs
        paragraphs = article_body.find_all('p')
        return ' '.join(self._clean_text(p.get_text()) for p in paragraphs)

    def _get_author(self, soup: "BeautifulSoup") -> str:
        """Return the cleaned text of ``span.article__author``, or ""."""
        author_element = soup.find('span', class_='article__author')
        return self._clean_text(author_element.get_text()) if author_element else ""

    def _get_next_page_url(self, soup: "BeautifulSoup") -> Optional[str]:
        """Return the absolute URL of the ``rel="next"`` link, or ``None``.

        ``None`` is also returned when the link has no usable ``href``;
        joining an empty href would yield the listing URL itself and make the
        caller re-scrape the same page.
        """
        next_link = soup.find('a', attrs={'rel': 'next'})
        if next_link is None:
            return None
        href = next_link.get('href', '')
        if not href:
            return None
        return urljoin(self.base_url + "/politiki", href)

    def extract_article_data(self, article_element) -> Optional[Dict]:
        """Build the article record for one teaser element.

        The teaser supplies the link, title (``h3``/``h4``) and date
        (``time.default-date``); the linked article page supplies author and
        body text.

        Returns:
            A dict with keys ``site_url``, ``issue_date``, ``author_name``,
            ``article_title``, ``article_content`` and ``article_url``, or
            ``None`` when the teaser has no link, the URL was already
            processed, or any error occurs while processing.
        """
        # Best-effort by design: one broken article must not abort the run,
        # so every error is reported and turned into a skipped article.
        try:
            # Find the link and get the relative URL
            link = article_element.find('a')
            if link is None:
                return None

            relative_url = link.get('href', '')
            if not relative_url:
                # An empty href would resolve to the site root, not an article.
                return None
            full_url = urljoin(self.base_url, relative_url)

            # Skip if we've already processed this URL
            if full_url in self.seen_urls:
                return None

            self.seen_urls.add(full_url)

            # Get article page content
            article_soup = self._get_soup(full_url)

            # Extract title (from h3 or h4)
            title_element = article_element.find(['h3', 'h4'])
            title = self._clean_text(title_element.get_text()) if title_element else ""

            # Extract date
            date_element = article_element.find('time', class_='default-date')
            issue_date = self._parse_date(date_element)

            return {
                "site_url": self.base_url + "/politiki",
                "issue_date": issue_date,
                "author_name": self._get_author(article_soup),
                "article_title": title,
                "article_content": self._extract_article_content(article_soup),
                "article_url": full_url  # Adding URL to help identify duplicates
            }

        except Exception as e:
            print(f"Error processing article: {e}")
            return None

    def scrape_politics_page(self) -> List[Dict]:
        """Scrape listing pages until ``MAX_ARTICLES`` articles are collected.

        Pagination stops when there is no next page, when the next page was
        already visited, or when a listing page cannot be fetched. In every
        case the articles gathered so far are returned.
        """
        articles = []
        current_url = f"{self.base_url}/politiki"
        page_number = 0
        visited_pages = set()

        while current_url and self.article_count < self.MAX_ARTICLES:
            # A "next" link that points back to a visited page would loop forever.
            if current_url in visited_pages:
                break
            visited_pages.add(current_url)

            print(f"Scraping page {page_number + 1}...")
            try:
                soup = self._get_soup(current_url)
            except requests.RequestException as e:
                # Keep what was already collected instead of losing the run.
                print(f"Error fetching page {current_url}: {e}")
                break

            # Find all article elements with different classes
            article_elements = soup.find_all('article', class_=[
                'squareb-teaser',
                'squares-teaser',
                'square-teaser'
            ])

            for article_element in article_elements:
                if self.article_count >= self.MAX_ARTICLES:
                    break

                seen_before = len(self.seen_urls)
                article_data = self.extract_article_data(article_element)
                if article_data:
                    articles.append(article_data)
                    self.article_count += 1
                    print(f"Scraped article {self.article_count}/{self.MAX_ARTICLES}")

                # seen_urls grows only when an article request was attempted;
                # skipped teasers made no request and need no politeness delay.
                if len(self.seen_urls) > seen_before:
                    time.sleep(1)

            # Get next page URL
            current_url = self._get_next_page_url(soup)
            page_number += 1

            # Delay between pages, but only when another page will be fetched
            if current_url and self.article_count < self.MAX_ARTICLES:
                time.sleep(2)

        return articles

def remove_duplicates(articles: List[Dict]) -> List[Dict]:
    """Return the articles with duplicates dropped, keeping first occurrences.

    An article is a duplicate when its ``article_url`` or its non-empty
    ``article_title`` was already seen on an earlier kept article. Empty or
    missing titles/URLs are never used as a duplicate key, so several
    untitled articles do not collapse into one.

    The input list and its dictionaries are left untouched; each returned
    dictionary is a shallow copy without the ``article_url`` key, which is
    only needed for deduplication.

    Args:
        articles: Article dictionaries as produced by the scraper.

    Returns:
        A new list of new dictionaries, in the original order.
    """
    seen_urls = set()
    seen_titles = set()
    unique_articles = []

    for article in articles:
        url = article.get('article_url')
        title = article.get('article_title')

        if url and url in seen_urls:
            continue
        if title and title in seen_titles:
            continue

        if url:
            seen_urls.add(url)
        if title:
            seen_titles.add(title)

        # Copy instead of `del` so the caller's data survives and a second
        # call on the same list does not raise KeyError.
        unique_articles.append(
            {key: value for key, value in article.items() if key != 'article_url'}
        )

    return unique_articles

def save_articles_to_json(articles: List[Dict], output_folder: str) -> Tuple[str, int]:
    """Deduplicate the articles and write them to a timestamped JSON file.

    The file is named ``efsyn_articles_<YYYYmmdd_HHMMSS>.json`` (local time)
    and is written as UTF-8 with non-ASCII characters kept verbatim.

    Args:
        articles: Article dictionaries; duplicates are removed with
            ``remove_duplicates`` before saving.
        output_folder: Destination directory; created if it does not exist.

    Returns:
        A ``(filepath, unique_count)`` tuple: the path of the written file
        and the number of articles it contains.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Remove duplicates before saving
    unique_articles = remove_duplicates(articles)

    # Create filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"efsyn_articles_{timestamp}.json"

    # Create full file path
    filepath = os.path.join(output_folder, filename)

    # Save the articles
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(unique_articles, f, ensure_ascii=False, indent=4)

    return filepath, len(unique_articles)

# Script entry point: scrape the politics section and store the result as JSON
if __name__ == "__main__":
    # Destination directory for the generated JSON file
    output_folder = "D:\\Web Scrapping Project\\efsyn_articles"

    # Collect the articles with a default-configured scraper
    scraper = EfsynScraper()
    articles = scraper.scrape_politics_page()

    # Persist them; the helper reports the file path and the unique count
    saved_file, unique_count = save_articles_to_json(articles, output_folder)

    # Summary: total scraped, unique saved, duplicates dropped
    print(
        f"\nScraped {len(articles)} total articles",
        f"Saved {unique_count} unique articles to: {saved_file}",
        f"Removed {len(articles) - unique_count} duplicates",
        sep="\n",
    )

Scraping page 1...
Scraped article 1/500
Scraped article 2/500
Scraped article 3/500
Scraped article 4/500
Scraped article 5/500
Scraped article 6/500
Scraped article 7/500
Scraped article 8/500
Scraped article 9/500
Scraped article 10/500
Scraped article 11/500
Scraped article 12/500
Scraped article 13/500
Scraped article 14/500
Scraped article 15/500
Scraped article 16/500
Scraped article 17/500
Scraped article 18/500
Scraped article 19/500
Scraped article 20/500
Scraped article 21/500
Scraped article 22/500
Scraped article 23/500
Scraped article 24/500
Scraped article 25/500
Scraped article 26/500
Scraped article 27/500
Scraped article 28/500
Scraped article 29/500
Scraped article 30/500
Scraped article 31/500
Scraping page 2...
Scraped article 32/500
Scraped article 33/500
Scraped article 34/500
Scraped article 35/500
Scraped article 36/500
Scraped article 37/500
Scraped article 38/500
Scraped article 39/500
Scraped article 40/500
Scraped article 41/500
Scraped article 42/500
Scrap