# In [1]:
from urllib.parse import urljoin  
from selenium import webdriver  
from selenium.webdriver.chrome.service import Service  
from selenium.webdriver.common.by import By  
from selenium.webdriver.support.ui import WebDriverWait  
from selenium.webdriver.support import expected_conditions as EC  
from bs4 import BeautifulSoup  
import pandas as pd  
import time  

def setup_driver():
    """Configure and return a headless Chrome driver.

    Returns:
        A ``webdriver.Chrome`` instance started with headless, GPU-less,
        sandbox-less options and a 1920x1080 window.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Chrome expects "width,height" here; the "1920x1080" spelling is not
    # parsed as a size and leaves the window at its default dimensions.
    options.add_argument("--window-size=1920,1080")

    return webdriver.Chrome(options=options)



# In [None]:
# url = "https://www.news-medical.net/Clinical-and-Diagnostics"
from selenium.common.exceptions import TimeoutException
# Site root against which hrefs scraped from listing pages are resolved
# (passed as the base argument to urljoin).
BASE_URL = 'https://www.news-medical.net/'


def get_links_title(url):
    """Collect title, link and page text for every anchor on a listing page.

    Loads ``url`` with the module-level ``driver``, waits up to 20 seconds
    for an element with class ``expand-item`` to be present, then follows
    each anchor found inside the ``div.expand-item`` blocks and fetches its
    text with ``get_content``.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'`` and
        ``'content'``. Anchors without an ``href`` and pages whose content
        could not be retrieved are left out.

    Raises:
        TimeoutException: If no ``expand-item`` element appears in time.
    """
    driver.get(url)
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'expand-item')))
    # Parse a snapshot of the listing now: get_content() navigates the same
    # driver away from this page inside the loop below.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    data_entries = []
    for block in soup.find_all('div', class_='expand-item'):
        for item in block.find_all('a'):
            href = item.get('href')
            if not href:
                # Anchors without a target (e.g. named anchors) cannot be
                # followed; indexing item['href'] would raise KeyError.
                continue

            title = item.text.strip()
            link = urljoin(BASE_URL, href)

            content = get_content(link)
            # get_content reports failure with the string "NaN", which is
            # truthy, so it has to be excluded explicitly to keep only
            # valid entries.
            if content and content != "NaN":
                data_entries.append({
                    'title': title,
                    'link': link,
                    'content': content
                })
            time.sleep(1.5)  # Pause between page loads

    return data_entries

def get_content(url):
    """Return the text of the ``<main class="main-container">`` element.

    Navigates the module-level ``driver`` to ``url`` and waits up to
    20 seconds for an element matching ``.main-container`` to be present.

    Args:
        url: Address of the page to read.

    Returns:
        The stripped text of the main container, or the placeholder string
        ``"NaN"`` when the element is missing, the wait times out, or any
        other error occurs while loading or parsing the page.
    """
    try:
        # Navigation is inside the try so that a page which fails to load
        # yields the placeholder instead of aborting the whole crawl.
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".main-container"))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The wait accepts any element with the class; the text is taken
        # only from a <main> tag carrying it.
        content_div = soup.find('main', class_='main-container')
        return content_div.text.strip() if content_div else "NaN"

    except TimeoutException:
        print(f"Timeout at {url}")
        return "NaN"  # Placeholder string for pages that never rendered
    except Exception as e:
        # Deliberately broad: one bad page should not stop the scrape.
        print(f"Error at {url}: {str(e)}")
        return "NaN"

def main():
    """Scrape the Clinical-and-Diagnostics listing and save it as CSV.

    Starts the browser, stores it in the module-level ``driver`` that
    ``get_links_title`` and ``get_content`` read, writes the collected
    entries to ``news_medical_v4.csv`` and always closes the browser,
    even when scraping fails.
    """
    # The scraping helpers look up ``driver`` as a global; assigning it as a
    # plain local here would leave them with a NameError.
    global driver
    driver = setup_driver()
    try:
        data = get_links_title("https://www.news-medical.net/Clinical-and-Diagnostics")
        df = pd.DataFrame(data)
        df.to_csv('news_medical_v4.csv', index=False)
    finally:
        # Release the Chrome process regardless of how scraping ended.
        driver.quit()


if __name__ == "__main__":
    main()


