Medication reviews extracted from WebMD for type 2 diabetes mellitus.

Source code used to extract the reviews from WebMD: 
'https://github.com/sepidehparhami/scraping-webmd-drug-reviews/tree/main'. 


In [11]:
# imports
import requests
import numpy as np
import pandas as pd
import regex as re
#Functions
# regular expressions for parsing data from a single review
# elements found in review-details div
def regex_date(review):
    '''Returns the first slash-separated numeric date in the review's date div

    Raises IndexError when the div's text holds no such date.
    '''
    date_text = review.find('div', class_='date').text
    date_matches = re.findall(r'\d+/\d+/\d+', date_text)
    return date_matches[0]

def regex_condition(review):
    '''Parses the condition for which the medication is used

    Parameters:
    review: parsed review element exposing a BeautifulSoup-style find()

    Returns:
    str: the text that follows "Condition:" in the review's
        <strong class="condition"> element, without surrounding whitespace;
        np.nan when the element is missing or nothing follows the label
    '''
    condition_element = review.find('strong', class_='condition')
    if condition_element is None:
        return np.nan
    # Capture everything after the label instead of only runs of word
    # characters, so names containing quotes, hyphens, parentheses, etc.
    # (for example '"Change of Life" Signs') come back whole rather than
    # being dropped or cut short at the first non-word character.
    condition_match = re.search(r'Condition:\s*(.*\S)', condition_element.text, flags=re.DOTALL)
    return condition_match.group(1) if condition_match else np.nan

def regex_rating_overall(review):
    '''Parses the overall rating shown in the review's overall-rating div

    Parameters:
    review: parsed review element exposing a BeautifulSoup-style find()

    Returns:
    list of str: every decimal number (digits, a literal dot, digits) found in
        the <strong> text of the overall-rating div; empty when there is none
    '''
    rating_overall_line = review.find('div', class_='overall-rating').strong.text
    # the dot is escaped so that only a real decimal point separates the digits;
    # unescaped it would match any character (e.g. the space in "12 34")
    return re.findall(r'\d+\.\d+', rating_overall_line)

def regex_rating_category(review, ind_cat):
    '''Returns, as an int, the rating of the category at position ind_cat

    ind_cat indexes the <section> elements of the review's categories div,
    in the order ['effectiveness', 'ease_of_use', 'satisfaction'].
    '''
    categories_div = review.find('div', class_='categories')
    section = categories_div.find_all('section')[ind_cat]
    # the numeric score is carried by the rating widget's aria-valuenow attribute
    rating_value = section.find('div', class_='webmd-rate on-mobile').get('aria-valuenow')
    return int(rating_value)

def regex_text(review):
    '''Returns the free-text body of the review, or np.nan when there is none'''
    paragraph = review.find('p', class_='description-text')
    if paragraph is None:
        return np.nan
    return paragraph.text
# regular expressions for parsing data from a single review
# elements found in details div
def regex_age(details):
    '''Returns the first "N-M" age range that follows a "|" separator, else np.nan'''
    # NOTE: the variable-width lookbehind relies on `re` being the `regex` package
    age_ranges = re.findall(r'(?<=\|\s+)\d+-\d+', details)
    if not age_ranges:
        return np.nan
    return age_ranges[0]

def regex_gender(details):
    '''Parses the gender of the medication user

    Parameters:
    details (str): text of the review's details line, fields separated by "|"

    Returns:
    str: 'Male' or 'Female', taken from the first field that starts with one
        of them directly after a "|" separator; np.nan when there is none
    '''
    # Both alternatives sit inside one group so the "|" separator is required
    # in front of either of them. Without the group the separator requirement
    # binds to 'Male' only and 'Female' is matched anywhere in the string.
    gender_match = re.findall(r'\|\s+(Male|Female)', details)
    return gender_match[0] if len(gender_match) > 0 else np.nan

def regex_time(details):
    '''Returns the words that follow "On medication for ", else np.nan'''
    durations = re.findall(r'(?<=On\smedication\sfor\s)\w+(?:\s\w+)*', details)
    if not durations:
        return np.nan
    return durations[0]

def regex_reviewer(details):
    '''Returns the last "|"-separated field of details (the reviewer type), else np.nan'''
    # NOTE: the variable-width lookbehind relies on `re` being the `regex` package
    trailing_fields = re.findall(r'(?<=\|\s+)\w+(?:\s\w+)*(?=\s*$)', details)
    if not trailing_fields:
        return np.nan
    return trailing_fields[0]
# parse the reviews on a single webpage
def parse_reviews_page(soup, reviews_df):
    '''Populates reviews_df data frame with records from 1 page's reviews
    
    Parameters:
    soup (BeautifulSoup): parsed HTML of one reviews page; its <title> is
        expected to read "<drug name> Reviews ..."
    reviews_df (pd.DataFrame): dataframe with one row per review
    
    Returns:
    pd.DataFrame: reviews_df dataframe with new records appended
        (reviews_df itself when the page holds no review elements)
    
    Raises:
    IndexError: when the page title does not contain " Reviews"
    
    '''
    reviews_html = soup.find_all('div', class_='review-details') # get elements that hold each review
    drug_name = re.findall(r'(.*)(?=\sReviews)', soup.title.text)[0] # page title is drug name
    
    # one single-row frame per review; they are concatenated once after the loop
    # so the accumulated reviews_df is copied once per page, not once per review
    page_rows = []
    for review in reviews_html:
        # `cols` is the module-level list of output column names
        to_append = pd.DataFrame([pd.Series([None]*len(cols), index=cols)])
        
        details = review.find('div', class_='details').text

        to_append['drug_name'] = drug_name
        to_append['date'] = regex_date(review)
        to_append['age'] = regex_age(details)
        to_append['gender'] = regex_gender(details)
        to_append['time_on_drug'] = regex_time(details)
        to_append['reviewer_type'] = regex_reviewer(details)
        to_append['condition'] = regex_condition(review)
        to_append['rating_overall'] = regex_rating_overall(review)
        
        for ind_cat, cat in enumerate(['effectiveness', 'ease_of_use', 'satisfaction']):
            to_append[f'rating_{cat}'] = regex_rating_category(review, ind_cat)
    
        to_append['text'] = regex_text(review)
        page_rows.append(to_append)
        
    if not page_rows:
        return reviews_df
    return pd.concat([reviews_df, *page_rows], ignore_index=True)
def get_soup(review_url, page):
    '''Downloads one page of a drug's reviews and parses it
    
    Parameters:
    review_url (str): URL of the drug's reviews, with or without a query string
    page (int): number of the page to request
    
    Returns:
    BeautifulSoup: the response body parsed with the lxml parser
    
    '''
    # the first query parameter has to be introduced by '?'; '&' only separates
    # later ones, so pick the separator based on what review_url already has
    separator = '&' if '?' in review_url else '?'
    curr_url = f'{review_url}{separator}page={page}'
    # without a timeout a stalled connection would block the crawl forever
    response = requests.get(curr_url, headers=headers, timeout=30).content
    return BeautifulSoup(response, 'lxml')
# crawl over the review pages for one drug
from tqdm import tqdm # progress bar

def crawl_reviews_pages(review_url, reviews_df):
    '''Crawls a drug's reviews page-by-page, saving each page's reviews into reviews_df
    
    Parameters:
    review_url (str): URL of the drug's reviews
    reviews_df (pd.DataFrame): dataframe with one row per review
    
    Returns:
    pd.DataFrame: reviews_df dataframe with new records appended
        (unchanged when the first page has no pagination element)
    
    '''
    # find how many review pages there are total for the drug by parsing it from the first page
    soup = get_soup(review_url, 1)
    pages = soup.find('ul', class_='pagination')
    
    # proceed only if there is at least 1 review
    if pages is not None:
        last_page = int(pages.find_all('li', class_='page-item')[-1].text.strip())

        for i in tqdm(range(1, last_page+1)):
            # page 1 is already downloaded above; every later page must be
            # requested by its own number, otherwise page 1 is parsed repeatedly
            if i > 1:
                soup = get_soup(review_url, i)
            reviews_df = parse_reviews_page(soup, reviews_df)

    return reviews_df
#Scrape Drugs by Condition from WebMD Search
# need to spoof a browser in order to not get blocked when making request
# https://bar.rady.ucsd.edu/Web_Scraping.html
from bs4 import BeautifulSoup

# start from the library's default headers and override only the User-Agent
agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
headers = requests.utils.default_headers()
headers['User-Agent'] = agent
# column layout of the reviews table: reviewer/drug fields, then the overall
# rating, the three per-category ratings, and finally the review's full text
cols = [
    'drug_name', 'date', 'age', 'gender', 'time_on_drug', 'reviewer_type', 'condition',
    'rating_overall',
    *[f'rating_{category}' for category in ('effectiveness', 'ease_of_use', 'satisfaction')],
    'text',
]

# empty frame (no rows yet) that the crawl fills with one row per review
reviews_df = pd.DataFrame(index=[], columns=cols)
# # list of depression drugs from WebMD
# url = 'https://www.webmd.com/depression/depression-medications-antidepressants'
# response = requests.get(url, headers=headers).content
# soup = BeautifulSoup(response, 'lxml')
# drugs_section = soup.find('div', class_='article-page active-page')
# WebMD drug-listing page for each condition of interest
conditions = {'type2diabetesmellitus': 'https://www.webmd.com/drugs/2/condition-594/type-2-diabetes-mellitus',
              #'gastric cancer': 'https://www.webmd.com/drugs/2/condition-514/gastric-cancer',
              'type1diabetesmellitus':'https://www.webmd.com/drugs/2/condition-595/type-1-diabetes-mellitus',
              'hypertension':'https://www.webmd.com/drugs/2/condition-1432/hypertension'
              }

# collect the link to every listed drug's reviews page, one condition at a time
review_urls = []
for url in conditions.values():
    response = requests.get(url, headers=headers).content
    soup = BeautifulSoup(response, 'lxml')
    drugs_section = soup.find('div', class_='medication-results-list')
    review_urls.extend(
        [link_holder.a.get('href') for link_holder in drugs_section.find_all('span', class_='reviews-url')]
    )

# the same medication can be listed under several conditions, so keep each URL once
review_urls = np.unique(review_urls)
review_urls

array(['https://reviews.webmd.com/drugs/drugreview-10094-glipizide',
       'https://reviews.webmd.com/drugs/drugreview-10094-glipizide-er',
       'https://reviews.webmd.com/drugs/drugreview-10095-glucotrol-xl',
       'https://reviews.webmd.com/drugs/drugreview-11273-adrucil-vial',
       'https://reviews.webmd.com/drugs/drugreview-11285-metformin-hcl',
       'https://reviews.webmd.com/drugs/drugreview-11285-metformin-hcl-er',
       'https://reviews.webmd.com/drugs/drugreview-11285-metformin-hcl-solution',
       'https://reviews.webmd.com/drugs/drugreview-11285-metformin-suspension-er-reconstituted-suspension-reconstituted',
       'https://reviews.webmd.com/drugs/drugreview-11294-glucophage-tablet',
       'https://reviews.webmd.com/drugs/drugreview-11668-dymelor-tablet',
       'https://reviews.webmd.com/drugs/drugreview-11773-glucotrol',
       'https://reviews.webmd.com/drugs/drugreview-11926-insulin-reg-human-buffered-solution',
       'https://reviews.webmd.com/drugs/drugrev

In [8]:
#only 1 drug- glipizide reviews
#response = requests.get('https://reviews.webmd.com/drugs/drugreview-10094-glipizide', headers=headers)  
#reviews_df = crawl_reviews_pages('https://reviews.webmd.com/drugs/drugreview-10094-glipizide', reviews_df)
#reviews_df.to_csv('C:/Users/Public/Data/Projects/webmdreviews/t2diabetesmellitus_glipizide_reviews.csv')

100%|██████████| 11/11 [00:21<00:00,  1.97s/it]


In [13]:
#all drugs
# Crawl every collected reviews URL and write the combined table to CSV.
# The write sits in `finally` so that the reviews of all drugs completed so far
# are still saved when the long crawl stops early (error or manual interrupt).
try:
    for review_url in tqdm(review_urls):
        # probe the URL first: missing drugs redirect to the site's 404 page
        response = requests.get(review_url, headers=headers)
        if response.url == 'https://www.webmd.com/404':
            print(f'skipping {review_url}: page not found')
        else:
            print(review_url)
            reviews_df = crawl_reviews_pages(review_url, reviews_df)
finally:
    reviews_df.to_csv('C:/Users/Public/Data/Projects/webmdreviews/multiple_reviews.csv')

  0%|          | 0/351 [00:00<?, ?it/s]

https://reviews.webmd.com/drugs/drugreview-10094-glipizide


100%|██████████| 11/11 [00:23<00:00,  2.14s/it]
  0%|          | 1/351 [00:27<2:43:05, 27.96s/it]

https://reviews.webmd.com/drugs/drugreview-10094-glipizide-er


100%|██████████| 11/11 [00:24<00:00,  2.18s/it]
  1%|          | 2/351 [00:55<2:41:30, 27.77s/it]

https://reviews.webmd.com/drugs/drugreview-10095-glucotrol-xl


100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
  1%|          | 3/351 [01:01<1:43:17, 17.81s/it]

https://reviews.webmd.com/drugs/drugreview-11273-adrucil-vial


100%|██████████| 1/1 [00:02<00:00,  2.93s/it]
  1%|          | 4/351 [01:08<1:17:16, 13.36s/it]

https://reviews.webmd.com/drugs/drugreview-11285-metformin-hcl


 10%|█         | 9/87 [00:26<03:46,  2.90s/it]
  1%|          | 4/351 [01:38<2:22:14, 24.60s/it]


KeyboardInterrupt: 