In [7]:
import requests
from bs4 import BeautifulSoup
import string
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

base_url = "http://www.diseasesdatabase.com/"
index_url = "disease_index_{}.asp"
diseases = []

# Set up a session with retry strategy
session = requests.Session()
retry = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

for letter in string.ascii_lowercase:
    url = base_url + index_url.format(letter)
    try:
        response = session.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the div with id 'page_specific_content'
        content_div = soup.find('div', id='page_specific_content')
        
        # Collect all <a> tags within this div
        if content_div:
            for a_tag in content_div.find_all('a'):
                href = a_tag.get('href')
                text = a_tag.text.strip()
                diseases.append((text, href))
        
        # Delay between requests
        time.sleep(1)
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")

# Print or save the diseases list
for disease in diseases:
    print(disease)

('Familial thrombotic microangiopathy', 'ddb33291.htm')
('AA amyloidosis', 'ddb16.htm')
('Cholestasis-oedema syndrome, Norwegian type', 'ddb32129.htm')
('Reticular erythrokeratoderma', 'ddb35259.htm')
('SHORT syndrome', 'ddb30068.htm')
('Aarskog syndrome', 'ddb29329.htm')
('Aarskog syndrome', 'ddb29329.htm')
('Pyridoxine-dependent epilepsy', 'ddb34502.htm')
('Aase syndrome', 'ddb29332.htm')
('Aase syndrome', 'ddb29332.htm')
('Abacavir', 'ddb29945.htm')
('Abaloparatide', 'ddb36626.htm')
('Abametapir', 'ddb61883.htm')
('Abarelix', 'ddb33155.htm')
('Abatacept', 'ddb34101.htm')
('Low phospholipid-associated cholelithiasis', 'ddb65213.htm')
('ABCC8-related permanent neonatal diabetes mellitus', 'ddb34545.htm')
('ABCD syndrome', 'ddb33683.htm')
('Abciximab', 'ddb30046.htm')
('Abdominal distension', 'ddb30819.htm')
('Abdominal distension', 'ddb30819.htm')
('Abdominal mass', 'ddb14326.htm')
('Abdominal pain', 'ddb14367.htm')
('Abdominal distension', 'ddb30819.htm')
('Abdominal pain', 'ddb14367

In [9]:
import requests
from bs4 import BeautifulSoup
import string
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from typing import Dict, List, Tuple

def get_disease_details(session: requests.Session, diseases: List[Tuple[str, str]]) -> Dict[str, str]:
    disease_details = {}
    
    for disease_name, href in diseases:
        try:
            url = base_url + href if not href.startswith('http') else href
            response = session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            content_div = soup.find('div', id='page_specific_content')
            if not content_div:
                continue
                
            first_child = content_div.find_next()
            if not first_child:
                continue
                
            description = ""
            if first_child.name == 'blockquote':
                # Get all p tags with class squeeze
                paragraphs = first_child.find_all('p', class_='squeeze')
                description = ' '.join(p.text.strip() for p in paragraphs)
            else:
                description = first_child.text.strip()
                
            disease_details[disease_name] = description
            time.sleep(1)  # Be nice to the server
            
        except requests.exceptions.RequestException as e:
            print(f"Error fetching details for {disease_name}: {e}")
            continue
            
    return disease_details

# Example usage:
# Assuming 'session' and 'diseases' are already defined
disease_info = get_disease_details(session, diseases)

# Print results
for disease, description in disease_info.items():
    print(f"\n{disease}:")
    print(description)

Error fetching details for Familial thrombotic microangiopathy: HTTPConnectionPool(host='www.diseasesdatabase.com', port=80): Max retries exceeded with url: /ddb33291.htm (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))
Error fetching details for AA amyloidosis: HTTPConnectionPool(host='www.diseasesdatabase.com', port=80): Max retries exceeded with url: /ddb16.htm (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))
Error fetching details for Cholestasis-oedema syndrome, Norwegian type: HTTPConnectionPool(host='www.diseasesdatabase.com', port=80): Max retries exceeded with url: /ddb32129.htm (Caused by ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')))
Error fetching details for Reticular erythrokeratoderma: HTTPConnectionPool(host='www.diseasesdatabase.com', port=80): Max retries exceeded with url: /ddb35259.htm (Caused by ProtocolError('C

KeyboardInterrupt: 

In [10]:
diseases

[('Familial thrombotic microangiopathy', 'ddb33291.htm'),
 ('AA amyloidosis', 'ddb16.htm'),
 ('Cholestasis-oedema syndrome, Norwegian type', 'ddb32129.htm'),
 ('Reticular erythrokeratoderma', 'ddb35259.htm'),
 ('SHORT syndrome', 'ddb30068.htm'),
 ('Aarskog syndrome', 'ddb29329.htm'),
 ('Aarskog syndrome', 'ddb29329.htm'),
 ('Pyridoxine-dependent epilepsy', 'ddb34502.htm'),
 ('Aase syndrome', 'ddb29332.htm'),
 ('Aase syndrome', 'ddb29332.htm'),
 ('Abacavir', 'ddb29945.htm'),
 ('Abaloparatide', 'ddb36626.htm'),
 ('Abametapir', 'ddb61883.htm'),
 ('Abarelix', 'ddb33155.htm'),
 ('Abatacept', 'ddb34101.htm'),
 ('Low phospholipid-associated cholelithiasis', 'ddb65213.htm'),
 ('ABCC8-related permanent neonatal diabetes mellitus', 'ddb34545.htm'),
 ('ABCD syndrome', 'ddb33683.htm'),
 ('Abciximab', 'ddb30046.htm'),
 ('Abdominal distension', 'ddb30819.htm'),
 ('Abdominal distension', 'ddb30819.htm'),
 ('Abdominal mass', 'ddb14326.htm'),
 ('Abdominal pain', 'ddb14367.htm'),
 ('Abdominal distension