In [9]:
import requests
from bs4 import BeautifulSoup
import re 
import os 
import json 


def extract_data(url ):
    # Request the main page
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the doctor articles
    doctors = soup.find_all('article', class_='docs-all')

    # Initialize a list to store all doctor data
    doctors_data = []

    for doctor in doctors:
        # Extract basic information
        name = doctor.find('h2').text.strip()
        position = doctor.find('h3').text.strip()
        qualifications = doctor.find('p', class_='qualification').text.strip()
        
        # Extract the doctor's detail page URL
        detail_page_url = doctor.find('a')['href']
        
        # Request the doctor's detail page
        detail_response = requests.get(detail_page_url)
        detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
        
        # Extract timing information from the detail page
        timing_div = detail_soup.find('div', class_='wpb_text_column')
        timing_text = timing_div.get_text(separator="\n").strip() if timing_div else ""

        # Initialize an empty dictionary for the timings
        timings = {}

        # Process the timing information
        for line in timing_text.split("\n"):
            # Match patterns like "Wednesday & Saturday : 04:00 PM To 06:00 PM"
            day_match = re.match(r"(\w+)\s*&amp;\s*(\w+)\s*:\s*(.+)", line)
            if day_match:
                days = [day_match.group(1).strip(), day_match.group(2).strip()]
                timing = day_match.group(3).strip()
                for day in days:
                    timings[day] = timing
            else:
                # Match patterns like "Wednesday : 04:00 PM To 06:00 PM"
                day_time = re.split(r"\s*:\s*", line, maxsplit=1)
                if len(day_time) == 2:
                    day, time = day_time
                    days = [d.strip() for d in re.split(r"\s*&amp;\s*|\s*,\s*|\s*and\s*", day)]
                    for day in days:
                        timings[day] = time.strip()
        # Create a dictionary for the doctor's information
        doctor_data = {
            "name": name,
            "image ": "",
            "position": position,
            "hospital": "Burhani Hospital",
            "qualifications": qualifications,
            "timings": timings,
            
        }
        
        # Add the doctor's information to the list
        doctors_data.append(doctor_data) 
    return doctors_data
# if os.path.exists("outputs/burhani-hospital.json"):
#     with open("doctors_info.json", 'r') as json_file:
#         existing_data = json.load(json_file)
# else:
#     existing_data = []

# # Append new data to existing data
# existing_data.extend(doctors_data)

# Write updated data to the JSON file
with open("burhani-hospital.txt", "r") as fs:
	urls = fs.readlines()

doctors = []

for url in urls:
    doctors.extend(extract_data(url))


with open("outputs/burhani-hospital.json", 'w') as json_file:
    # json.dump(existing_data, json_file, indent=4)
    json.dump(doctors,json_file,indent=4)
    print(doctors)





[{'name': 'DR. SALEEM PATEL', 'image ': '', 'position': 'Child Specialist', 'hospital': 'Burhani Hospital', 'qualifications': 'MBBS , MCPS , FCPS (Pediatrics)', 'timings': {'Tuesday & Friday': '05:30 PM To 06:30 PM', 'Sunday': '1:00 PM To 3:00 PM'}}, {'name': 'DR. HAJI SALEEM', 'image ': '', 'position': 'Child Specialist', 'hospital': 'Burhani Hospital', 'qualifications': 'MBBS , MCPS (Pediatrics)', 'timings': {'Monday': '12:00 PM To 2:00 PM', 'Wednesday & Friday': '12:00 PM To 2:00 PM'}}, {'name': 'DR. VED VASWANI', 'image ': '', 'position': 'Child Specialist', 'hospital': 'Burhani Hospital', 'qualifications': 'MBBS , MCPS (Pediatrics)', 'timings': {'Monday To Saturday': '08:00 PM To 10:00 PM', 'Sunday': '12:00 PM To 01:00 PM'}}, {'name': 'DR. BALRAM DAS', 'image ': '', 'position': 'Child Specialist', 'hospital': 'Burhani Hospital', 'qualifications': 'MBBS , Diploma in Child Health (Pediatrics)', 'timings': {'Tuesday': '07:00 PM To 08:00 PM', 'Thursday & Saturday': '07:00 PM To 08:00 