1. Specialities 

In [41]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

# Homepage URL
base_url = "https://www.pristyncare.com/"

# Request the homepage
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")

# We'll store only the treatment names, not the links
specialities = []

# Loop through all the anchor tags to find treatment categories
for a_tag in soup.find_all("a", href=True):
    href = a_tag['href']
    text = a_tag.get_text(strip=True)

    # We're only interested in URLs that include '/c/' (which is how categories are linked)
    if "/c/" in href:
        if text:  # Just in case the link has no visible text
            specialities.append([text])  # We store it as a list to match CSV row format

# Save the extracted specialties into a CSV file
with open("specialities.csv", "w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Speciality"])  # CSV header
    writer.writerows(specialities)

print(f"Saved {len(specialities)} specialities to 'specialities.csv'")


Saved 24 specialities to 'specialities.csv'


2. Treatments

In [47]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

# Function to get treatment names and URLs from a given category page
def get_treatments_from_category(category_url):
    try:
        response = requests.get(category_url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        treatments = {}

        # Check all links for treatments
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            text = a_tag.get_text(strip=True)

            if '/treatment/' in href and text:
                full_url = urljoin(category_url, href)
                treatments[text] = full_url  # dictionary to avoid duplicates

        return treatments

    except Exception as e:
        print(f"Failed to fetch from {category_url}: {e}")
        return {}

# Replace with desired category info
category_name = "Proctology"
category_url = "https://www.pristyncare.com/c/proctology/"

print(f"Scraping treatments under: {category_name}")
treatment_dict = get_treatments_from_category(category_url)

# Save to CSV
with open("treatments_in_category.csv", "w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Treatment", "URL"])
    for treatment, url in treatment_dict.items():
        writer.writerow([treatment, url])

print(f"Saved {len(treatment_dict)} unique treatments to 'treatments_in_category.csv'")


Scraping treatments under: Proctology
Saved 138 unique treatments to 'treatments_in_category.csv'


In [49]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin

# Load treatment URLs from the CSV
treatments = []

with open("treatments_in_category.csv", newline='', encoding="utf-8") as file:
    reader = csv.DictReader(file)
    for row in reader:
        treatment = row["Treatment"]
        url = row["URL"]
        treatments.append((treatment, url))

# Function to extract doctor names and URLs from a treatment page
def get_doctors_from_treatment(url):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")

        doctors = []

        for tag in soup.select("a[href*='/specialist/']"):
            name = tag.get_text(strip=True)
            href = tag['href']
            full_url = urljoin(url, href)

            if name and name.startswith("Dr."):
                doctors.append((name, full_url))

        return list(set(doctors))  # remove duplicates

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return []

# Go through each treatment and extract doctor names and URLs
output_rows = []

for treatment, url in treatments:
    print(f"🔍 Scraping doctors for: {treatment}")
    doctor_list = get_doctors_from_treatment(url)
    for name, doc_url in doctor_list:
        output_rows.append({
            "Treatment": treatment,
            "Doctor Name": name,
            "Doctor URL": doc_url
        })

# Save to CSV
with open("doctors_by_treatment.csv", "w", newline='', encoding="utf-8") as out_file:
    writer = csv.DictWriter(out_file, fieldnames=["Treatment", "Doctor Name", "Doctor URL"])
    writer.writeheader()
    writer.writerows(output_rows)

print(f"\n✅ Done! Saved results to 'doctors_by_treatment.csv'")


🔍 Scraping doctors for: Piles Treatment
🔍 Scraping doctors for: Fistula Treatment
🔍 Scraping doctors for: Fissure Treatment
🔍 Scraping doctors for: Pilonidal Sinus Treatment
🔍 Scraping doctors for: Rectal Prolapse
🔍 Scraping doctors for: Hernia Surgery
🔍 Scraping doctors for: Gallstones Treatment
🔍 Scraping doctors for: Appendicitis
🔍 Scraping doctors for: Inguinal Hernia Treatment
🔍 Scraping doctors for: Umbilical Hernia Treatment
🔍 Scraping doctors for: Surgical Abortion
🔍 Scraping doctors for: MTP
🔍 Scraping doctors for: Ectopic Pregnancy Treatment
🔍 Scraping doctors for: Molar Pregnancy Treatment
🔍 Scraping doctors for: Uterus Removal
🔍 Scraping doctors for: Ovarian Cyst
🔍 Scraping doctors for: Miscarriage Treatment
🔍 Scraping doctors for: Bartholin Cyst Treatment
🔍 Scraping doctors for: Endometriosis Treatment
🔍 Scraping doctors for: Adenomyosis Treatment
🔍 Scraping doctors for: PCOS-PCOD Treatment
🔍 Scraping doctors for: Pregnancy Care
🔍 Scraping doctors for: Laser Vaginal Tighte

In [51]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

# Load the CSV with treatment and doctor URL
df = pd.read_csv("doctors_by_treatment.csv")

# Drop exact duplicate URLs
df = df.drop_duplicates(subset=["Doctor URL"])

# Function to scrape doctor profile
def scrape_doctor_profile(doctor_url):
    try:
        response = requests.get(doctor_url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        data = {"url": doctor_url}

        # 1. Name
        h1 = soup.find("h1")
        data["name"] = h1.get_text(strip=True) if h1 else None

        # 2. About
        about_heading = soup.find("h3", class_="doctorDetailsHeading", string=lambda x: x and "About" in x)
        if about_heading:
            para = about_heading.find_next("p")
            about_text = para.get_text(separator=" ", strip=True) if para else None
            data["about"] = about_text.replace("Read More", "").strip() if about_text else None
        else:
            data["about"] = None

        # 3. Medical Registration
        reg_heading = soup.find("h3", string=lambda x: x and "Medical Registration" in x)
        if reg_heading:
            reg_div = reg_heading.find_next("div", class_="registrationDetailsContainer")
            reg_text = reg_div.get_text(strip=True) if reg_div else None
            reg_fixed = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", reg_text) if reg_text else None
            data["registration"] = reg_fixed
        else:
            data["registration"] = None

        # 4. Education
        education = []
        edu_ul = soup.find("ul", class_="doctorDetailsDataContainer")
        if edu_ul:
            for li in edu_ul.find_all("li"):
                span = li.find("span")
                if span:
                    education.append(span.get_text(strip=True))
        data["education"] = ", ".join(education)

        # 5. Treatments
        treatments = []
        treat_ul = soup.find("ul", class_="doctorInterlinkingDetails")
        if treat_ul:
            for li in treat_ul.find_all("li"):
                a = li.find("a")
                if a:
                    treatments.append(a.get_text(strip=True))
        data["treatments"] = ", ".join(treatments)

        return data

    except Exception as e:
        return {"url": doctor_url, "name": None, "about": None, "registration": None, "education": "", "treatments": "", "error": str(e)}

# Set to track unique doctor names
scraped_names = set()
scraped_urls = set()
scraped_profiles = []

for url in df["Doctor URL"]:
    if pd.isna(url):
        continue

    # Early skip if already scraped by URL
    if url in scraped_urls:
        continue

    print(f"Scraping: {url}")
    profile = scrape_doctor_profile(url)

    # Avoid re-scraping based on name as well
    if profile["name"] and profile["name"] not in scraped_names:
        scraped_profiles.append(profile)
        scraped_names.add(profile["name"])
        scraped_urls.add(url)
        time.sleep(1.5)  # Avoid hitting the server too fast
    else:
        print(f"Skipping duplicate doctor: {profile.get('name')}")

# Save final results
final_df = pd.DataFrame(scraped_profiles)
final_df.to_csv("unique_scraped_doctor_profiles.csv", index=False, encoding='utf-8')
print("Saved to unique_scraped_doctor_profiles.csv")


Scraping: https://www.pristyncare.com/specialist/dr-a-n-m-owais-danish-1avgsfk96e/
Scraping: https://www.pristyncare.com/specialist/dr-pravat-kumar-majumdar-vx6ahe6uav/?disease=Piles
Scraping: https://www.pristyncare.com/specialist/dr-sunil-gehlot-rcx3qjqfjw/?disease=Piles
Scraping: https://www.pristyncare.com/specialist/dr-dhamodhara-kumar-c-b-0ly84yrity/?disease=Piles
Scraping: https://www.pristyncare.com/specialist/dr-piyush-gulabrao-nikam-uympcwmsfn/
Scraping: https://www.pristyncare.com/specialist/dr-sunil-gehlot-rcx3qjqfjw/
Skipping duplicate doctor: Dr. Sunil Gehlot
Scraping: https://www.pristyncare.com/specialist/dr-amol-gosavi-y3amsnwuyd/?disease=Piles
Scraping: https://www.pristyncare.com/specialist/dr-pankaj-sareen-5njangbrma/
Scraping: https://www.pristyncare.com/specialist/dr-piyush-sharma-uk7nuj8gni/
Scraping: https://www.pristyncare.com/specialist/dr-amol-gosavi-y3amsnwuyd/?disease=Anal Fistula
Skipping duplicate doctor: Dr. Amol Gosavi
Scraping: https://www.pristyncare.