In [2]:
import requests
from bs4 import BeautifulSoup
import json

# Define the URL of the page to scrape
url = "https://pih.com.pk/doctors"

try:
    response = requests.get(url)
    response.raise_for_status()  # Check for HTTP errors
    html = response.text
except requests.RequestException as e:
    print(f"Error fetching the page: {e}")
    exit()

soup = BeautifulSoup(html, 'html.parser')

doctors = []
seen_doctors = set()  # To track and remove duplicates

for element in soup.find_all(class_='elementor-widget-wrap'):
    # Extract name
    name_tag = element.find('h3')
    if name_tag:
        name = name_tag.text.strip()

        # Skip if the doctor has already been added
        if name in seen_doctors:
            continue
        seen_doctors.add(name)

        # Extract education and position
        text_editor = element.find(class_='elementor-widget-text-editor')
        if text_editor:
            lines = text_editor.text.strip().split('\n')
            education = lines[0].strip()
            position = lines[1].strip() if len(lines) > 1 else "N/A"

            # Extract days and timings
            clinic_info = lines[2:] if len(lines) > 2 else []
            days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

            doctor = {
                "name": name,
                "education": education,
                "hospital": "Pak International Hospital",  # Assuming this is static; update if dynamic
                "position": position,
                "clinic": {
                    "days": days,
                }
            }

            doctors.append(doctor)

path = "outputs/pak-international-hospital.json"
print(json.dumps(doctors, indent=2))

with open(path, 'w') as json_file:
    json.dump(doctors, json_file, indent=4)

[
  {
    "name": "Prof. Dr. M. Iqbal Afridi",
    "education": "MBBS, MCPS PSYCH, FCPS PSYCH (PAK),FACP (USA), FRCP (Ireland),Consultant Neuro-psychiatrist & AddictionologistMonday to Saturday 6:00 Pm to 10:00 Pm",
    "hospital": "Pak International Hospital",
    "position": "N/A",
    "clinic": {
      "days": [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday"
      ]
    }
  },
  {
    "name": "Rozeena Dharwarwala",
    "education": "Clinical Psychologist & HypnotherapistM.Phil (Clinical Psychology) ICD, KU.Forensic Mental Health Training SOFMH, NHS Scotland.Monday to Saturday \u2013 4 Pm to 10 Pm",
    "hospital": "Pak International Hospital",
    "position": "N/A",
    "clinic": {
      "days": [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday"
      ]
    }
  },
  {
    "name": "Dr. Abdul Majid",
    "education": "MBBS, FCPS (Medicine)Consultant Phy

In [19]:
import requests
from bs4 import BeautifulSoup
import json

# Define the URL of the page to scrape
url = "https://pih.com.pk/doctors"

try:
    response = requests.get(url)
    response.raise_for_status()  # Check for HTTP errors
    html = response.text
except requests.RequestException as e:
    print(f"Error fetching the page: {e}")
    exit()

# soup = BeautifulSoup(html, 'html.parser')

soup = BeautifulSoup(doctors_needed, 'html.parser')

doctors = []
seen_doctors = set()  # To track and remove duplicates

for element in soup.find_all(class_='elementor-widget-wrap'):
    # Extract name
    name_tag = element.find('h3')
    if name_tag:
        name = name_tag.text.strip()

        # Skip if the doctor has already been added
        if name in seen_doctors:
            continue
        seen_doctors.add(name)

        # Extract education and position
        text_editor = element.find(class_='elementor-widget-text-editor')
        if text_editor:
            paragraphs = text_editor.find_all('p')
            if len(paragraphs) >= 2:
                education = paragraphs[0].text.strip()
                position = paragraphs[1].text.strip()
            elif len(paragraphs) == 1:
                education = paragraphs[0].text.strip()
                position = "N/A"
            else:
                education = "N/A"
                position = "N/A"

            # Extract clinic information
            clinic_info = paragraphs[1].text.strip() if len(paragraphs) > 1 else ""
            days = []
            timings = []

            # Define possible days
            day_mapping = {
                'Mon': 'Monday',
                'Tue': 'Tuesday',
                'Wed': 'Wednesday',
                'Thu': 'Thursday',
                'Fri': 'Friday',
                'Sat': 'Saturday',
                'Sun': 'Sunday'
            }

            for short_day, full_day in day_mapping.items():
                if short_day in clinic_info:
                    days.append(full_day)

            # Extract timing from clinic_info
            timing_text = ""
            for line in clinic_info.split('\n'):
                if 'Pm' in line or 'Am' in line:
                    timing_text = line.strip()

            timings.append({
                'time': timing_text,
                'days': ", ".join(days)
            })

            doctor = {
                "name": name,
                "education": education,
                "hospital": "Pak International Hospital",  # Assuming this is static; update if dynamic
                "position": position,
                "clinic": {
                    "days": days,
                    "timings": timings
                }
            }

            doctors.append(doctor)

print(json.dumps(doctors, indent=2))

path = "outputs/pak-international-hospital.json"
with open(path, 'w') as json_file:
    json.dump(doctors, json_file, indent=4)


[
  {
    "name": "Rozeena Dharwarwala",
    "education": "Clinical Psychologist & HypnotherapistM.Phil (Clinical Psychology) ICD, KU.Forensic Mental Health Training SOFMH, NHS Scotland.Monday to Saturday \u2013 4 Pm to 10 Pm",
    "hospital": "Pak International Hospital",
    "position": "N/A",
    "clinic": {
      "days": [],
      "timings": [
        {
          "time": "",
          "days": ""
        }
      ]
    }
  },
  {
    "name": "Prof. Dr. M. Iqbal Afridi",
    "education": "MBBS, MCPS PSYCH, FCPS PSYCH (PAK),FACP (USA), FRCP (Ireland),Consultant Neuro-psychiatrist & AddictionologistMonday to Saturday 6:00 Pm to 10:00 Pm",
    "hospital": "Pak International Hospital",
    "position": "N/A",
    "clinic": {
      "days": [],
      "timings": [
        {
          "time": "",
          "days": ""
        }
      ]
    }
  },
  {
    "name": "Dr. Aurangzaib Abbasi",
    "education": "N/A",
    "hospital": "Pak International Hospital",
    "position": "N/A",
    "clinic": {


In [15]:
doctors_needed ="""<div class="elementor-widget-wrap elementor-element-populated">
								<div class="elementor-element elementor-element-1eb45de elementor-widget elementor-widget-image" data-id="1eb45de" data-element_type="widget" data-widget_type="image.default">
				<div class="elementor-widget-container">
															<img decoding="async" width="450" height="470" src="https://pih.com.pk/wp-content/uploads/2020/09/rozeena-dharwarwala.jpg" class="attachment-large size-large wp-image-497" alt="" loading="lazy" srcset="https://pih.com.pk/wp-content/uploads/2020/09/rozeena-dharwarwala.jpg 450w, https://pih.com.pk/wp-content/uploads/2020/09/rozeena-dharwarwala-287x300.jpg 287w" sizes="(max-width: 450px) 100vw, 450px">															</div>
				</div>
				<div class="elementor-element elementor-element-5ef6f035 elementor-widget elementor-widget-heading" data-id="5ef6f035" data-element_type="widget" data-widget_type="heading.default">
				<div class="elementor-widget-container">
			<h6 class="elementor-heading-title elementor-size-default">PSYCHOLOGIST</h6>		</div>
				</div>
				<div class="elementor-element elementor-element-7e900985 elementor-widget elementor-widget-heading" data-id="7e900985" data-element_type="widget" data-widget_type="heading.default">
				<div class="elementor-widget-container">
			<h3 class="elementor-heading-title elementor-size-default">Rozeena Dharwarwala</h3>		</div>
				</div>
				<div class="elementor-element elementor-element-24e02816 elementor-widget elementor-widget-text-editor" data-id="24e02816" data-element_type="widget" data-widget_type="text-editor.default">
				<div class="elementor-widget-container">
							<p>Clinical Psychologist &amp; Hypnotherapist<br>M.Phil (Clinical Psychology) ICD, KU.<br>Forensic Mental Health Training SOFMH, NHS Scotland.<br>Monday to Saturday – 4 Pm to 10 Pm</p>						</div>
				</div>
					</div>
                    <div class="elementor-widget-wrap elementor-element-populated">
								<div class="elementor-element elementor-element-1ecf534 elementor-widget elementor-widget-image" data-id="1ecf534" data-element_type="widget" data-widget_type="image.default">
				<div class="elementor-widget-container">
															<img decoding="async" width="450" height="470" src="https://pih.com.pk/wp-content/uploads/2020/09/dr-iqbal-afridi.jpg" class="attachment-large size-large wp-image-494" alt="" loading="lazy" srcset="https://pih.com.pk/wp-content/uploads/2020/09/dr-iqbal-afridi.jpg 450w, https://pih.com.pk/wp-content/uploads/2020/09/dr-iqbal-afridi-287x300.jpg 287w" sizes="(max-width: 450px) 100vw, 450px">															</div>
				</div>
				<div class="elementor-element elementor-element-68aae96e elementor-widget elementor-widget-heading" data-id="68aae96e" data-element_type="widget" data-widget_type="heading.default">
				<div class="elementor-widget-container">
			<h6 class="elementor-heading-title elementor-size-default">PSYCHIATRIST</h6>		</div>
				</div>
				<div class="elementor-element elementor-element-66f3e40e elementor-widget elementor-widget-heading" data-id="66f3e40e" data-element_type="widget" data-widget_type="heading.default">
				<div class="elementor-widget-container">
			<h3 class="elementor-heading-title elementor-size-default">Prof. Dr. M. Iqbal Afridi</h3>		</div>
				</div>
				<div class="elementor-element elementor-element-7588c1e4 elementor-widget elementor-widget-text-editor" data-id="7588c1e4" data-element_type="widget" data-widget_type="text-editor.default">
				<div class="elementor-widget-container">
							<p>MBBS, MCPS PSYCH, FCPS PSYCH (PAK),<br>FACP (USA), FRCP (Ireland),<br>Consultant Neuro-psychiatrist &amp; Addictionologist<br>Monday to Saturday 6:00 Pm to 10:00 Pm</p>						</div>
				</div>
					</div><div class="elementor-widget-wrap elementor-element-populated">
								<div class="elementor-element elementor-element-576933c elementor-widget elementor-widget-image" data-id="576933c" data-element_type="widget" data-widget_type="image.default">
				<div class="elementor-widget-container">
															<img decoding="async" width="450" height="470" src="https://pih.com.pk/wp-content/uploads/2023/01/Dr.Auranzeb.jpg" class="attachment-large size-large wp-image-1228" alt="" loading="lazy" srcset="https://pih.com.pk/wp-content/uploads/2023/01/Dr.Auranzeb.jpg 450w, https://pih.com.pk/wp-content/uploads/2023/01/Dr.Auranzeb-287x300.jpg 287w" sizes="(max-width: 450px) 100vw, 450px">															</div>
				</div>
				<div class="elementor-element elementor-element-c93c89b elementor-widget elementor-widget-heading" data-id="c93c89b" data-element_type="widget" data-widget_type="heading.default">
				<div class="elementor-widget-container">
			<h6 class="elementor-heading-title elementor-size-default">Neurosurgeon</h6>		</div>
				</div>
				<div class="elementor-element elementor-element-a043c14 elementor-widget elementor-widget-heading" data-id="a043c14" data-element_type="widget" data-widget_type="heading.default">
				<div class="elementor-widget-container">
			<h3 class="elementor-heading-title elementor-size-default">Dr. Aurangzaib Abbasi</h3>		</div>
				</div>
				<div class="elementor-element elementor-element-b74ea76 elementor-widget elementor-widget-text-editor" data-id="b74ea76" data-element_type="widget" data-widget_type="text-editor.default">
				<div class="elementor-widget-container">
							MBBS, FCPS<br>
Consultant Neurosurgeon<br>
Mon, Wed, Fri – 8:00 Pm to 10:00 Pm						</div>
				</div>
					</div><div class="elementor-widget-wrap elementor-element-populated">
								<div class="elementor-element elementor-element-82e5ffb elementor-widget elementor-widget-image" data-id="82e5ffb" data-element_type="widget" data-widget_type="image.default">
				<div class="elementor-widget-container">
															<img decoding="async" width="450" height="470" src="https://pih.com.pk/wp-content/uploads/2020/09/dr-list-ph-male.jpg" class="attachment-large size-large wp-image-511" alt="" loading="lazy" srcset="https://pih.com.pk/wp-content/uploads/2020/09/dr-list-ph-male.jpg 450w, https://pih.com.pk/wp-content/uploads/2020/09/dr-list-ph-male-287x300.jpg 287w" sizes="(max-width: 450px) 100vw, 450px">															</div>
				</div>
				<div class="elementor-element elementor-element-6ae2d1c elementor-widget elementor-widget-heading" data-id="6ae2d1c" data-element_type="widget" data-widget_type="heading.default">
				<div class="elementor-widget-container">
			<h6 class="elementor-heading-title elementor-size-default">Pediatrician</h6>		</div>
				</div>
				<div class="elementor-element elementor-element-2f5c1d3 elementor-widget elementor-widget-heading" data-id="2f5c1d3" data-element_type="widget" data-widget_type="heading.default">
				<div class="elementor-widget-container">
			<h3 class="elementor-heading-title elementor-size-default">Dr. Humera Shaikh​</h3>		</div>
				</div>
				<div class="elementor-element elementor-element-a1bd7ec elementor-widget elementor-widget-text-editor" data-id="a1bd7ec" data-element_type="widget" data-widget_type="text-editor.default">
				<div class="elementor-widget-container">
							<p>MBBS, MCPS<br>Consultant Pediatrician<br>Mon to Sat – 02:00 Pm to 04:00 Pm</p>						</div>
				</div>
					</div>"""