In [None]:
import requests
from bs4 import BeautifulSoup

# Function to extract doctor information from the main page
def extract_doctor_info(doctor_div):
    """Collect the summary fields shown on one doctor's listing card.

    Args:
        doctor_div: Parsed element for a single doctor card. It must offer a
            BeautifulSoup-style ``find`` method.

    Returns:
        dict: Keys ``name``, ``image``, ``specialty`` and ``profile_url``.
        A value is ``None`` when the matching element, or the attribute read
        from it, is absent from the card.
    """
    # Attributes are read with .get() rather than tag['attr'] so that a tag
    # which exists but lacks the attribute (for example an <img> without a
    # src) produces None instead of raising KeyError.
    img_tag = doctor_div.find('img')
    image_url = img_tag.get('src') if img_tag is not None else None

    name_tag = doctor_div.find('h4', class_='truncate-text')
    name = name_tag.get_text().strip() if name_tag is not None else None

    specialty_tag = doctor_div.find('em', class_='d-block g-font-style-normal g-font-size-10 text-uppercase g-color-primary')
    specialty = specialty_tag.get_text().strip() if specialty_tag is not None else None

    profile_link_tag = doctor_div.find('a', class_='btn btn-xs btn-xs-profil u-btn-primary g-mr-10 g-mb-1')
    profile_url = profile_link_tag.get('href') if profile_link_tag is not None else None

    return {
        'name': name,
        'image': image_url,
        'specialty': specialty,
        'profile_url': profile_url,
    }

# Function to extract additional information from the profile page
def extract_profile_info(profile_url, timeout=30):
    """Download a doctor's profile page and pull extra details from it.

    This is a best-effort lookup: any failure to obtain the page results in
    an empty dict rather than an exception, so one bad profile does not stop
    the caller from processing the remaining doctors.

    Args:
        profile_url: Address of the profile page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        dict: ``{'profile_image': <src or None>}`` when the page came back
        with HTTP 200; an empty dict when the request failed or returned any
        other status.
    """
    profile_info = {}

    try:
        response = requests.get(profile_url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to retrieve profile {profile_url}: {err}")
        return profile_info

    if response.status_code != 200:
        return profile_info

    profile_soup = BeautifulSoup(response.text, 'html.parser')

    # .get('src') keeps an <img> without a src from raising KeyError.
    big_img_tag = profile_soup.find('img', class_='Profile-big-img')
    profile_info['profile_image'] = (
        big_img_tag.get('src') if big_img_tag is not None else None
    )

    # Add other details as needed from the profile page
    # For example, extract the phone number, appointment button details, etc.

    return profile_info

# Main function to scrape the doctors list
def scrape_doctors_list(url, timeout=30):
    """Scrape every doctor card on a listing page, enriched with profile data.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        list[dict] | None: One dict per doctor card (the fields from
        ``extract_doctor_info`` merged with those from
        ``extract_profile_info`` when the card has a profile URL). An empty
        list when the page loaded but the doctors container is missing.
        ``None`` when the page could not be retrieved.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to retrieve the page: {err}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the page: Status code {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main div containing all doctors. find() returns None when the
    # container is absent, which would otherwise fail on .find_all below.
    doctors_list_div = soup.find('div', class_='col-md-12 col-xs-12 col-sm-12 myitemcontainershortcode-html')
    if doctors_list_div is None:
        print("No doctors container found on the page")
        return []

    doctors_info = []
    for doctor_div in doctors_list_div.find_all('div', class_='u-info-v1-2'):
        doctor_info = extract_doctor_info(doctor_div)
        if doctor_info['profile_url']:
            doctor_info.update(extract_profile_info(doctor_info['profile_url']))
        doctors_info.append(doctor_info)

    # Printed once after the loop; printing inside the loop re-emitted the
    # whole growing list on every iteration.
    print(doctors_info)

    return doctors_info

# Example usage: scrape the listing page for a single specialty. The
# specialty appears in the URL's ``Spec`` query parameter.
url = 'https://hospitals.aku.edu/pakistan/patientservices/pages/findadoctor.aspx?Spec=Paediatric%20Occupational%20Therapy'
doctors_info = scrape_doctors_list(url)

# Print the scraped information, one doctor per line (currently disabled).
# NOTE: scrape_doctors_list returns None when the page cannot be retrieved,
# so check for None before enabling this loop.
# for doctor in doctors_info:
#     print(doctor)


[{'name': 'Aisha Khan', 'image': 'https://hospitals.aku.edu/pakistan/patientservices/Lists/Faculty/Attachments/228/Aisha.jpg', 'specialty': 'Paediatric Occupational Therapy', 'profile_url': 'https://hospitals.aku.edu/pakistan/patientservices/pages/profiles.aspx?ProfileID=228&Name=Aisha Khan&page=findadoctor', 'profile_image': 'https://hospitals.aku.edu/pakistan/patientservices/Lists/Faculty/Attachments/228/Aisha.jpg'}]


In [12]:
import requests
from bs4 import BeautifulSoup
import re
import json

def extract_profile_data(detail_soup):
    """Read the name, qualifications and appointment number from a profile.

    Args:
        detail_soup: Parsed profile page offering a BeautifulSoup-style
            ``find`` method.

    Returns:
        tuple: ``(name, qualifications, appointment)``. Each item is the
        stripped text of its element, or ``None`` when that element is not
        present on the page.
    """
    def text_of(tag):
        # find() returns None for a missing element; map that to None
        # instead of failing on ``None.text``.
        return tag.text.strip() if tag is not None else None

    # Name and qualifications live in <span> elements whose id contains the
    # given fragment.
    name = text_of(detail_soup.find('span', id=re.compile(r'lblname')))
    qualifications = text_of(detail_soup.find('span', id=re.compile(r'qualification')))

    # Appointment number is the text of the phone button.
    appointment = text_of(detail_soup.find('button', class_="profile-btn-phone"))

    return name, qualifications, appointment

def extract_data(url, timeout=30):
    """Scrape every doctor listed on one listing page.

    For each doctor card the listing page supplies the position and image;
    the linked profile page supplies the name, qualifications and
    appointment number. Failures are reported with ``print`` and degrade to
    missing (``None``) fields or an empty result instead of aborting, so a
    single bad page does not stop a multi-URL run.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list[dict]: One dict per doctor card with the keys ``name``,
        ``image``, ``position``, ``hospital``, ``qualifications`` and
        ``appointment``. Empty when the listing page could not be retrieved
        or contains no doctor cards.
    """
    def text_of(tag):
        # find() returns None for a missing element.
        return tag.text.strip() if tag is not None else None

    # Request the main page. RequestException also covers malformed or empty
    # URLs, which requests rejects before sending anything.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to retrieve {url!r}: {err}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve {url!r}: Status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    doctors_data = []

    for doctor in soup.find_all('div', class_='u-info-v1-2'):
        # Basic information from the listing card.
        card_name = text_of(doctor.find('h4', class_='truncate-text'))
        position = text_of(doctor.find('em', class_='g-font-size-10'))
        img_tag = doctor.find('img')
        image_src = img_tag.get('src') if img_tag is not None else None
        image = image_src.strip() if image_src else None

        # Defaults used when the profile page is unavailable: keep the name
        # shown on the card and leave the profile-only fields empty.
        name, qualifications, appointment = card_name, None, None

        link_tag = doctor.find('a', href=True)
        if link_tag is not None:
            try:
                detail_response = requests.get(link_tag['href'], timeout=timeout)
                detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
                profile_name, qualifications, appointment = extract_profile_data(detail_soup)
            except (requests.RequestException, AttributeError) as err:
                # AttributeError: the profile page lacks an expected element.
                print(f"Could not read profile for {card_name!r}: {err}")
            else:
                # The profile page name takes precedence over the card name.
                name = profile_name or card_name

        doctors_data.append({
            "name": name,
            "image": image,
            "position": position,
            # NOTE(review): the input/output file names spell this "aga";
            # confirm the intended spelling before changing this value.
            "hospital": "Agha Khan University Hospital",
            "qualifications": qualifications,
            "appointment": appointment,
        })

    return doctors_data

import os

# Read the listing-page URLs, one per line. Blank lines (including a trailing
# newline at the end of the file) are skipped so that an empty string is
# never passed on as a URL.
with open("aga-khan-uni-hospital.txt", "r") as fs:
    urls = [line.strip() for line in fs if line.strip()]

# Accumulate the doctors from every listing page into one flat list.
doctors = []

for url in urls:
    doctors.extend(extract_data(url))

# Make sure the output directory exists; open() does not create it.
os.makedirs("outputs", exist_ok=True)
with open("outputs/aga-khan-hospital.json", 'w') as json_file:
    json.dump(doctors, json_file, indent=4)

# Echo the result after the file has been written and closed.
print(doctors)


[{'name': 'Aisha Khan', 'image': 'https://hospitals.aku.edu/pakistan/patientservices/Lists/Faculty/Attachments/228/Aisha.jpg', 'position': 'Paediatric Occupational Therapy', 'hospital': 'Agha Khan University Hospital', 'qualifications': 'Bachelors in Occupational Therapy', 'appointment': '021-111-911-911'}, {'name': 'Asim Fakhruddin Belgaumi', 'image': 'https://hospitals.aku.edu/pakistan/patientservices/Lists/Faculty/Attachments/176/22.%20AFBA.jpg', 'position': 'Paediatric Oncology', 'hospital': 'Agha Khan University Hospital', 'qualifications': 'MBBS, American Board of Paediatrics, American Board of Paediatric Haematology/Oncology', 'appointment': '021-111-911-911'}, {'name': 'Farrah Bashir', 'image': 'https://hospitals.aku.edu/pakistan/patientservices/Lists/Faculty/Attachments/186/21.%20FARB.jpg', 'position': 'Paediatric Oncology', 'hospital': 'Agha Khan University Hospital', 'qualifications': 'MBBS, BSc, FCPS Paediatric Medicine, FCPS Paediatric Haematology & Oncology, Fellowship Pa