In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

In [3]:
def create_driver():
    """Build and return a Chrome WebDriver configured for headless scraping.

    Every switch in ``chrome_flags`` is passed to Chrome through
    ``Options.add_argument``, including a fixed desktop user-agent string.
    The driver ``Service`` is built from whatever
    ``ChromeDriverManager().install()`` returns.

    Returns:
        The ``webdriver.Chrome`` instance. Nothing in this function closes
        it, so the caller must call ``quit()`` on it.
    """
    chrome_flags = (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=chrome_options)

In [4]:
def scrape_article_content(driver, url, wait=3):
    """Load ``url`` in ``driver`` and return the text of its paragraphs.

    Args:
        driver: Selenium WebDriver-like object; only ``get``, ``title`` and
            ``page_source`` are used here.
        url: Address of the page to load.
        wait: Seconds to sleep after ``driver.get`` before the page source
            is read. Defaults to 3.

    Returns:
        ``None`` when the page title contains ``404`` or ``page not found``
        (case-insensitive). Otherwise a string, one of:

        * the non-empty ``<p>`` texts joined by single spaces, taken from the
          first ``<article>`` element if there is one, else from the whole
          document;
        * ``'No content found'`` when no paragraph holds any text;
        * ``'Error: <message>'`` when anything raises while loading or
          parsing. Errors are reported in-band rather than raised, so callers
          must inspect the returned string to detect failures.
    """
    try:
        driver.get(url)
        time.sleep(wait)

        page_title = driver.title.lower()
        # NOTE(review): a real article whose title merely contains "404" is
        # also treated as missing by this check.
        if '404' in page_title or 'page not found' in page_title:
            return None

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article = soup.find('article')
        # Prefer paragraphs inside <article>; fall back to the whole document.
        container = article if article else soup

        paragraph_texts = (p.get_text(strip=True) for p in container.find_all('p'))
        # Drop empty paragraphs: joining them would yield a whitespace-only
        # (truthy) string and bypass the 'No content found' fallback below.
        content = ' '.join(text for text in paragraph_texts if text)
        return content if content else 'No content found'
    except Exception as e:
        return f'Error: {str(e)}'

In [5]:
def scrape_articles_to_csv(input_csv, output_csv, limit=10, delay=2):
    """Scrape the articles listed in ``input_csv`` and write their text to ``output_csv``.

    Args:
        input_csv: Path of a CSV providing ``id``, ``title`` and ``url``
            columns.
        output_csv: Path the results CSV is written to (without the index).
        limit: Maximum number of leading rows to process; ``None`` processes
            every row.
        delay: Seconds to sleep after each article that is kept.

    Returns:
        DataFrame with ``Id``, ``Title`` and ``Content`` columns, one row per
        article for which ``scrape_article_content`` did not return ``None``.

    Raises:
        KeyError: If a required column is missing from ``input_csv``.
    """
    articles = pd.read_csv(input_csv)
    if limit is not None:
        articles = articles.head(limit)
    total = len(articles)
    results = []
    driver = create_driver()

    try:
        # Count positions explicitly so the progress line does not depend on
        # the frame's index labels.
        for position, (_, row) in enumerate(articles.iterrows(), start=1):
            article_id = row['id']
            title = row['title']
            url = row['url']

            # str() keeps the progress line working when the title cell is
            # missing (pandas reads it as a float NaN, which cannot be sliced).
            print(f'Scraping {position}/{total}: {str(title)[:60]}...')
            content = scrape_article_content(driver, url)
            if content is None:
                # Skipped pages continue immediately, without the delay below.
                print('  ⚠ 404 - Skipped')
                continue
            results.append({
                'Id': article_id,
                'Title': title,
                'Content': content
            })
            time.sleep(delay)
    finally:
        # Always release the browser, even if a row raised.
        driver.quit()

    # Explicit columns keep the CSV header present when nothing was scraped.
    result_df = pd.DataFrame(results, columns=['Id', 'Title', 'Content'])
    result_df.to_csv(output_csv, index=False)
    print(f'\nDone! Saved {len(result_df)} articles to {output_csv}')
    return result_df

In [None]:
# Run configuration: the article listing to read and the file that receives
# the scraped text. Both paths are relative to the working directory.
input_file = '../medium_data.csv'
output_file = 'articles.csv'

# Scrape the first 20 listed articles, sleeping 2 seconds after each kept one.
df_result = scrape_articles_to_csv(
    input_csv=input_file,
    output_csv=output_file,
    limit=20,
    delay=2,
)


Scraping 1/20: Not All Rainbows and Sunshine: The Darker Side of ChatGPT...
Scraping 2/20: Ethics in AI: Potential Root Causes for Biased Algorithms...
Scraping 3/20: Python Tuple, The Whole Truth and Only the Truth: Let’s Dig ...
Scraping 4/20: Dates and Subqueries in SQL...
  ⚠ 404 - Skipped
Scraping 5/20: Temporal Differences with Python: First Sample-Based Reinfor...
  ⚠ 404 - Skipped
Scraping 6/20: Going Under the Hood of Character-Level RNNs: A NumPy-based ...
Scraping 7/20: ChatGPT isn’t all it seems, read this before you use it....
Scraping 8/20: 10 Subtle Strategies I Wish I Knew When I Had 23 Email Subsc...
Scraping 9/20: How To Start A Niche Site in Under 3 Hours (With Checklist)...
Scraping 10/20: Don’t Become a Full-Time Content Creator If You Have Low-Ris...
Scraping 11/20: Why Storytelling Is A Critical Skill For Startup Founders...
Scraping 12/20: Why My Side Hustle Stuck After 3 Years and What I Want to Do...
Scraping 13/20: Kill Your Perfectionism with an Abundance Mi