In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# HTTP headers passed to requests.get for each search page; a browser-like
# User-Agent is the only header set.
request_header = {
    "User-Agent": "Mozilla/5.0",
}

In [3]:
# One list per output column. Each doctor card appends exactly one entry to
# every list, so the seven lists stay index-aligned.
doctor_name = []
experience = []
consultation_fee = []
ratings = []
stories = []
cities = []
departments = []

# Patterns applied to the text of a single "info-section" card.
name_pattern = re.compile(r'Dr\.?\s+[A-Z][a-zA-Z.]*\s*(?:[A-Z][a-zA-Z.]*\s*)*')
experience_pattern = re.compile(r'(\d{1,2})\xa0years')
# The rupee sign is written as a \u escape so the pattern cannot be corrupted
# when this file is re-encoded, and the amount is not capped at three digits,
# so a fee of 1000 or more is captured in full (thousands separators allowed).
fee_pattern = re.compile(r'\u20b9\s*(\d[\d,]*)')
ratings_pattern = re.compile(r'(\d{1,3})%')
stories_pattern = re.compile(r'(\d+)\s*[\xa0\s]*Patient Stories')
city_pattern = re.compile(r',\s*(\w+)\xa0')
department_pattern = re.compile(r'(Dentist|General Physician|gynecologist|Dermatologist|Gynecologist|Homoeopath|Ayurveda|Neurologist|Pediatric Neurologist|Physiotherapist|Cardiologist|Urologist|Pediatrician| in [A-Za-z ]+)')


def first_match(pattern, text, default):
    """Return the first match of ``pattern`` in ``text``, or ``default`` if none.

    When the pattern has a capture group, the text of group 1 is returned;
    otherwise the whole matched string is returned.
    """
    found = pattern.search(text)
    if found is None:
        return default
    return found.group(1) if pattern.groups else found.group(0)


for city in ['Jaipur','Hyderabad','Bangalore', 'Mumbai','Delhi','Ranchi','Pune','Kolkata','Indore','Chennai','Lucknow','Noida','Ahmadabad','Chandigarh','Mysore','Nagpur','Gandhinagar','Srinagar','Sikkim','Goa']:
    for dpt in  ['Dentist','gynecologist','Dermatologist','General Physician','Ayurveda','Homoeopath','Neurologist','Pediatrician','pediatric neurologist','physiotherapist','cardiologist','urologist']:
        url = f'https://www.practo.com/search/doctors?results_type=doctor&q=%5B%7B%22word%22%3A%22{dpt}%22%2C%22autocompleted%22%3Atrue%2C%22category%22%3A%22subspeciality%22%7D%5D&city={city}'
        # A timeout keeps one stalled connection from hanging the whole crawl.
        page = requests.get(url, headers=request_header, timeout=30)
        if not page.ok:
            # Do not parse error responses; report the failure and move on.
            print(f'Skipped {dpt} / {city}: HTTP {page.status_code}')
            continue
        # Name the parser explicitly so results do not depend on which optional
        # parser libraries happen to be installed.
        soup = BeautifulSoup(page.text, 'html.parser')

        # Single pass over the cards: every field is read from the same card
        # text, and all seven lists grow by one entry per card.
        for card in soup.find_all("div", class_="info-section"):
            card_text = card.text

            # np.nan marks a missing name / experience / city / department;
            # 0 is used for the numeric fields so they stay castable to int.
            doctor_name.append(first_match(name_pattern, card_text, np.nan))
            experience.append(first_match(experience_pattern, card_text, np.nan))

            fee = first_match(fee_pattern, card_text, 0)
            if isinstance(fee, str):
                # Drop thousands separators so the value converts cleanly to int.
                fee = fee.replace(',', '')
            consultation_fee.append(fee)

            ratings.append(first_match(ratings_pattern, card_text, 0))
            stories.append(first_match(stories_pattern, card_text, 0))
            cities.append(first_match(city_pattern, card_text, np.nan))
            departments.append(first_match(department_pattern, card_text, np.nan))

In [17]:
# Print the length of each scraped list, one per line, in column order.
for scraped_values in (
    doctor_name,
    experience,
    consultation_fee,
    ratings,
    stories,
    cities,
    departments,
):
    print(len(scraped_values))

1813
1813
1813
1813
1813
1813
1813


In [18]:
# Build the frame from the first five scraped lists, one column per list.
df = pd.DataFrame(
    data={
        'Name': doctor_name,
        'Experience': experience,
        'Consultation Fee': consultation_fee,
        'Ratings': ratings,
        'Patient Stories': stories,
    }
)

In [19]:
# Append the two remaining scraped lists as columns, City first.
for column_name, column_values in (('City', cities), ('Department', departments)):
    df[column_name] = column_values

In [20]:
# Drop, in place, every row that is missing any of the required columns.
required_columns = ['Name', 'Experience', 'City']
df.dropna(subset=required_columns, inplace=True)

In [21]:
# Count the remaining nulls in the three columns used for the drop.
df.loc[:, ['Name', 'Experience', 'City']].isna().sum()

Name          0
Experience    0
City          0
dtype: int64

In [22]:
# Null count for every column of the frame.
df.isna().sum()

Name                0
Experience          0
Consultation Fee    0
Ratings             0
Patient Stories     0
City                0
Department          0
dtype: int64

In [23]:
# Remove the department label that can be left attached to the end of a
# scraped doctor name. The label list matches the departments extracted into
# the Department column, including 'Pediatrician', and trailing whitespace
# after the label is tolerated so the end-of-string anchor still applies.
department_suffix = (
    r'(?:Dentist|General Physician|gynecologist|Dermatologist|Gynecologist'
    r'|Homoeopath|Ayurveda|Neurologist|Pediatric Neurologist|Physiotherapist'
    r'|Cardiologist|Urologist|Pediatrician)\s*$'
)
df['Name'] = df['Name'].str.replace(department_suffix, '', regex=True).str.strip()

In [24]:
# Cast every column to its final dtype: text columns become pandas 'string',
# the numeric columns become 'int'.
column_dtypes = {
    'Name': 'string',
    'Experience': 'int',
    'Consultation Fee': 'int',
    'Ratings': 'int',
    'Patient Stories': 'int',
    'City': 'string',
    'Department': 'string',
}
for column_name, dtype_name in column_dtypes.items():
    df[column_name] = df[column_name].astype(dtype_name)

In [25]:
# Renumber the rows 0..n-1, discarding the old index instead of keeping it
# as a column.
df = df.reset_index(
    drop=True,
)

In [26]:
# Preview the first 15 rows.
df.iloc[:15]

Unnamed: 0,Name,Experience,Consultation Fee,Ratings,Patient Stories,City,Department
0,Dr. Ruby Ladha,18,200,98,389,Jaipur,Dentist
1,Dr. Yojna Shriwas,19,300,95,150,Jaipur,Dentist
2,Dr. Prachi Mital,16,300,100,23,Jaipur,Dentist
3,Dr. Shashank Gupta,13,400,98,368,Jaipur,Dentist
4,Dr. Amit Sharma,20,250,100,15,Jaipur,Dentist
5,Dr. Mishthu Solanki,22,400,99,483,Jaipur,Dentist
6,Dr. Akshay Garg,20,400,97,195,Jaipur,Dentist
7,Dr. Shrestha Singhania,13,300,100,32,Jaipur,Dentist
8,Dr. Pooja Goyal,19,300,99,188,Jaipur,Dentist
9,Dr. Anushri Ranjan,15,200,100,15,Jaipur,Dentist


In [27]:
# Show the rows whose Department value is null.
department_missing = df['Department'].isna()
df.loc[department_missing]

Unnamed: 0,Name,Experience,Consultation Fee,Ratings,Patient Stories,City,Department


In [28]:
# Write the frame to an Excel workbook without the index column.
df.to_excel(
    'Practo.xlsx',
    index=False,
)

In [30]:
# Write the frame to a CSV file without the index column.
df.to_csv(
    'Practo.csv',
    index=False,
)