PHARMASTATE WEB SCRAPING

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

def clean_text(text):
    """Return *text* with every non-ASCII run removed and outer whitespace trimmed."""
    # Drop each run of characters outside the 7-bit ASCII range.
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)
    return ascii_only.strip()

def scrape_pharmastate_full(url):
    """Scrape a single course page and return its fields as a flat dict.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    Every field is looked up by CSS class / tag; a field whose element is
    missing is returned as an empty string rather than raising.

    Args:
        url: Address of the course page to fetch.

    Returns:
        dict with the keys 'Course Name', 'Course Link', 'About Course',
        'Selling Price', 'Offer Price', 'Duration', 'Duration Time',
        'Rounded Duration', 'Fee Structure', 'Language', 'Course Level',
        'Certificate', 'Certificate Image', 'Educator',
        'Who should take it', 'What I will learn' and 'Course Syllabus'.
        All values are strings.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx HTTP status (``requests.HTTPError``).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=10)
    # Do not parse an error page as if it were a course: surface 4xx/5xx
    # to the caller instead of returning a row full of blanks.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    # Course Name
    course_name_div = soup.find('div', class_='elementor-widget-etlms-course-title')
    course_name = clean_text(course_name_div.get_text(strip=True)) if course_name_div else ''

    # About Course: description text up to the first "section heading" keyword.
    desc_div = soup.find('div', class_='tutor-single-course-segment etlms-course-description')
    about_course = ''
    if desc_div:
        raw_text = desc_div.get_text(separator=' ', strip=True)
        # Keywords are matched against the lower-cased text, so they must be
        # lower-case themselves (mixed-case entries could never match).
        stop_keywords = [
            'what will you learn', 'why should you attend', 'what will your learn',
            'who should enroll', 'what will i learn', 'language:',
            'objective of sop', 'how to download course certificate'
        ]
        lowered = raw_text.lower()
        hits = (lowered.find(keyword) for keyword in stop_keywords)
        stop_index = min((idx for idx in hits if idx != -1), default=len(raw_text))
        about_course = raw_text[:stop_index].strip()
        if about_course.lower().startswith("description"):
            about_course = about_course[len("description"):].strip(": ").strip()

    # What I will learn: prefer the dedicated widget list ...
    what_learn = ''
    learn_ul = soup.find('ul', class_='etlms-course-widget-list-items')
    if learn_ul:
        items = learn_ul.find_all('span', class_='tutor-list-label')
        what_learn = '\n'.join(item.get_text(strip=True) for item in items)

    # ... otherwise fall back to the <ul>/<p> siblings that directly follow a
    # "what will you learn" heading inside the description.
    if not what_learn and desc_div:
        for tag in desc_div.find_all(['strong', 'h3', 'h2', 'b', 'p']):
            if 'what will you learn' in tag.get_text(strip=True).lower():
                next_node = tag.find_next_sibling()
                result = []
                while next_node and next_node.name in ['ul', 'p']:
                    text = next_node.get_text(strip=True)
                    if text:
                        result.append(text)
                    next_node = next_node.find_next_sibling()
                what_learn = '\n'.join(result)
                break

    # Remove boilerplate lines that the fallback may have swept up.
    for phrase in ['Language: English', 'How to Download Course certificate: Watch Video']:
        what_learn = what_learn.replace(phrase, '').strip()

    # Prices: first amount found is the selling price, second the offer price.
    prices = []
    for span in soup.find_all('span', class_='woocommerce-Price-amount'):
        bdi = span.find('bdi')
        if bdi is not None:
            prices.append(bdi.get_text(strip=True).replace("₹", "").replace(",", "").strip())
    selling_price, offer_price = '', ''
    if len(prices) >= 2:
        selling_price, offer_price = prices[0], prices[1]
    elif len(prices) == 1:
        offer_price = prices[0]
        selling_price = prices[0]
    # A zero/blank offer price falls back to the selling price.
    if offer_price in ['0', '0.00', '', None] and selling_price:
        offer_price = selling_price

    # Duration
    duration_span = soup.find('span', class_='tutor-meta-value tutor-color-secondary tutor-mr-4')
    duration = duration_span.get_text(strip=True) if duration_span else ''
    duration = duration.replace("hours", "Hours").replace("minutes", "Minutes")

    # Duration Time & Rounded: the first two 'tutor-meta-level' spans are
    # read as (hours, minutes); a single span is read as one number.
    parts = soup.find_all('span', class_='tutor-meta-level')
    try:
        numbers = [int(part.get_text(strip=True)) for part in parts[:2]]
    except ValueError:
        # Non-numeric text: leave the duration fields blank instead of
        # failing the whole course.
        numbers = []
    if len(numbers) >= 2:
        h, m = numbers
        duration_time = f"{h:02d}:{m:02d}"
        # Round to the nearest half hour: <15 min -> h, <45 min -> h+0.5, else h+1.
        rounded = str(h) if m < 15 else str(h + 0.5 if m < 45 else h + 1)
    elif len(numbers) == 1:
        m = numbers[0]
        duration_time = str(m)
        rounded = str(m)
    else:
        duration_time = ''
        rounded = ''

    # Fee Structure
    fee_structure = (
        f"{offer_price} \n -All other fees remain unchanged\n"
        "-Education loans are available through leading banks and NBFCs.\n"
    ) if offer_price else (
        "-All other fees remain unchanged\n"
        "-Education loans are available through leading banks and NBFCs.\n"
    )

    # Language: first paragraph containing "Language:".
    language = ''
    for p in soup.find_all('p'):
        if 'Language:' in p.get_text():
            language = p.get_text().replace("Language:", "").strip()
            break

    # Course Level
    course_level_span = soup.find('span', class_='tutor-fs-7 tutor-fw-medium tutor-color-black etlms-enrolled-label-value')
    course_level = course_level_span.get_text(strip=True) if course_level_span else ''
    if course_level.lower() == "all levels":
        course_level = "Beginner"

    # Certificate: first label value that mentions "certificate".
    certificate = ''
    spans = soup.find_all('span', class_='tutor-fs-7 tutor-fw-medium tutor-color-black etlms-enrolled-label-value')
    for span in spans:
        text = span.get_text(strip=True)
        if 'certificate' in text.lower():
            certificate = text
            break

    # Certificate Image
    cert_img_tag = soup.find('img', alt='selected template')
    certificate_image = cert_img_tag['src'] if cert_img_tag and 'src' in cert_img_tag.attrs else ''

    # Who should take it: the <ul> following a "Target Audience" <h3>.
    # If several headings match, the last one with a list wins.
    who_should_take_it = ''
    for h3 in soup.find_all('h3'):
        if 'Target Audience' in h3.get_text():
            ul = h3.find_next_sibling('ul')
            if ul:
                who_should_take_it = ' \n '.join(li.get_text(strip=True) for li in ul.find_all('li'))

    # Educator: instructor names, excluding the "pharmastate academy" entry.
    educator_tags = soup.find_all('a', class_='tutor-instructor-name tutor-fs-6 tutor-fw-bold tutor-color-black')
    educator_names = (tag.get_text(strip=True) for tag in educator_tags)
    educators = [name for name in educator_names if name.lower() != "pharmastate academy"]
    educator = ', '.join(educators)

    # Course Syllabus: one entry per accordion module, lessons listed below it.
    curriculum = []
    modules = soup.find_all('div', class_='tutor-accordion-item')
    for module in modules:
        module_title_tag = module.find('h4', class_='tutor-accordion-item-header')
        module_title = clean_text(module_title_tag.get_text(strip=True)) if module_title_tag else "Untitled Module"
        lesson_list = []
        for lesson in module.find_all('li', class_='tutor-course-content-list-item'):
            lesson_heading = lesson.find('h5')
            if lesson_heading is not None:
                lesson_list.append(f"- {clean_text(lesson_heading.get_text(strip=True))}")
        curriculum.append(f"🔸 {module_title}\n" + '\n'.join(lesson_list))

    course_syllabus = '\n\n'.join(curriculum)

    return {
        'Course Name': course_name,
        'Course Link': url,
        'About Course': about_course,
        'Selling Price': selling_price,
        'Offer Price': offer_price,
        'Duration': duration,
        'Duration Time': duration_time,
        'Rounded Duration': rounded,
        'Fee Structure': fee_structure,
        'Language': language,
        'Course Level': course_level,
        'Certificate': certificate,
        'Certificate Image': certificate_image,
        'Educator': educator,
        'Who should take it': who_should_take_it,
        'What I will learn': what_learn,
        'Course Syllabus': course_syllabus
    }

# === MAIN ===
if __name__ == "__main__":
    # Input workbook (must contain a 'Course Link' column) and output workbook.
    input_path = "C:\\Users\\taslim.siddiqui\\Downloads\\Courselink for pharma state cpp.xlsx"
    output_path = "C:\\Users\\taslim.siddiqui\\Downloads\\pharmastate_course_combined_FINAL_10_09_2025.xlsx"

    df_links = pd.read_excel(input_path)
    if 'Course Link' not in df_links.columns:
        raise ValueError("❌ 'Course Link' column not found in Excel file.")

    # De-duplicated, non-null course URLs in first-seen order.
    urls = df_links['Course Link'].dropna().unique()
    total = len(urls)

    # Column order of the fallback row written when a page cannot be scraped.
    blank_columns = (
        'Course Name', 'Course Link', 'About Course',
        'Selling Price', 'Offer Price', 'Duration',
        'Duration Time', 'Rounded Duration', 'Fee Structure',
        'Language', 'Course Level', 'Certificate',
        'Certificate Image', 'Educator', 'Who should take it',
        'What I will learn', 'Course Syllabus',
    )

    all_data = []
    for position, url in enumerate(urls, start=1):
        print(f"🔄 Scraping ({position}/{total}): {url}")
        try:
            data = scrape_pharmastate_full(url)
        except Exception as e:
            # Keep going: record an empty row that still carries the URL.
            print(f"❌ Error scraping {url}: {e}")
            data = dict.fromkeys(blank_columns, '')
            data['Course Link'] = url
        all_data.append(data)
        # Pause between requests so the site is not hit back-to-back.
        time.sleep(1)

    df_out = pd.DataFrame(all_data)
    df_out.to_excel(output_path, index=False)
    print(f"\n✅ All courses saved to Excel: {output_path}")


🔄 Scraping (1/388): https://pharmastate.academy/courses/5-essentials-tips-to-generate-prescription/
🔄 Scraping (2/388): https://pharmastate.academy/courses/dealing-with-pharma-industrial-emergency-conditions-safety-guidelines/
🔄 Scraping (3/388): https://pharmastate.academy/courses/sop-for-determination-of-shelf-life-of-solution-in-laboratory/
🔄 Scraping (4/388): https://pharmastate.academy/courses/trending-of-breakdowns-in-equipment/
🔄 Scraping (5/388): https://pharmastate.academy/courses/incident-investigation/
🔄 Scraping (6/388): https://pharmastate.academy/courses/sop-for-determination-of-leak-test-of-packed-material/
🔄 Scraping (7/388): https://pharmastate.academy/courses/liquid-particulate-measuring-system/
🔄 Scraping (8/388): https://pharmastate.academy/courses/how-to-define-incident-description/
🔄 Scraping (9/388): https://pharmastate.academy/courses/human-error-reduction/
🔄 Scraping (10/388): https://pharmastate.academy/courses/effective-sop/
🔄 Scraping (11/388): https://pharm