# In [12]:
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Scrape doctor profiles linked from the South City Hospital listing pages
# named in south-city.txt and append them to a JSON file.

# Listing-page URLs, one per line. Each line is stripped because iterating a
# file keeps the trailing newline, and blank lines are dropped so they are
# never handed to requests.
with open('south-city.txt', 'r', encoding='utf-8') as file:
    urls = [line.strip() for line in file if line.strip()]

base_url = 'https://southcityhospital.org'
# Seconds to wait for the server; requests has no timeout by default and
# would otherwise block forever on an unresponsive host.
request_timeout = 30
hospital_name = "South City Hospital"

# Collected across ALL listing pages. This must live outside the loop below:
# re-creating it per URL would keep only the last page's doctors.
doctors = []

for url in urls:
    # Fetch the listing page; a failure skips this page instead of aborting
    # the whole run and losing what was already scraped.
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching listing page {url}: {e}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')

    # Links to the individual doctor pages. href=True ignores button anchors
    # that carry no href attribute (indexing them would raise KeyError).
    doctor_sections = [
        a['href']
        for a in soup.find_all('a', class_='gdlr-core-button', href=True)
    ]

    # Iterate over each doctor section link
    for section_link in doctor_sections:
        # urljoin handles root-relative hrefs the same way plain
        # concatenation did, and also copes with absolute hrefs.
        new_link = urljoin(base_url, section_link)
        try:
            response = requests.get(new_link, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching link {new_link}: {e}")
            continue
        new_soup = BeautifulSoup(response.text, 'html.parser')

        # The extraction below relies on fixed positions inside the first
        # three <p> elements. Any page that does not match (missing
        # paragraph, missing child, child that is a tag rather than text)
        # raises and is reported and skipped by the handler below.
        try:
            paragraphs = new_soup.find_all('p')

            # Doctor name: the <strong> inside the first paragraph, if any.
            name_tag = paragraphs[0].find('strong')
            name = name_tag.text.strip() if name_tag is not None else 'Not available'

            # Qualifications and position: second child of the first paragraph.
            qualifications_and_position = paragraphs[0].contents[1].strip()

            # Clinic details: third and ninth children of the second paragraph.
            clinic_section_tag = paragraphs[1]
            clinic_days = clinic_section_tag.contents[2].strip()
            clinic_timings = clinic_section_tag.contents[8].strip()

            # Appointment details: third child of the third paragraph.
            appointments = paragraphs[2].contents[2].strip()

            doctors.append({
                'name': name,
                'speciality': qualifications_and_position,  # Assuming 'speciality' maps to qualifications and position
                'education': qualifications_and_position,  # Add additional parsing if education needs to be separate
                'work_days': clinic_days,
                'timing': clinic_timings,
                'appointments': appointments,
                'hospital': hospital_name,
            })

        # Deliberately broad: this is the per-page boundary, and one
        # malformed page must not stop the remaining pages from being scraped.
        except Exception as e:
            print(f"Error processing link {new_link}: {e}")
            # Print the HTML content if an error occurs
            print(new_soup.prettify())

# Print extracted information
print(doctors)

# Save in JSON format
doctor_info_file = 'outputs/south-city-hospital.json'

# Make sure the target directory exists before reading or writing the file.
output_dir = os.path.dirname(doctor_info_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

if os.path.exists(doctor_info_file):
    with open(doctor_info_file, 'r', encoding='utf-8') as json_file:
        existing_data = json.load(json_file)
else:
    existing_data = []

# Append new data to existing data
existing_data.extend(doctors)

# Write updated data to the JSON file
with open(doctor_info_file, 'w', encoding='utf-8') as json_file:
    json.dump(existing_data, json_file, indent=4)



