Certybox Web Scraping

In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

# Provider details: fixed text that is attached unchanged to every course row.
platform_name = "Certybox"
about_platform = (
    "Certybox continuously transforming careers through skills and certification training across the globe. "
    "Certybox is an Indian EdTech company registered with STARTUP INDIA & Ministry of MSME, Government of India. "
    "Our objective is to establish and run an online education system where we provide Online interactive platform with the best of the Skills + Technology."
)

# Spreadsheet locations: the workbook to read and the workbook to write.
input_path = "C:\\Users\\taslim.siddiqui\\Downloads\\Certybox.xlsx"
output_file = "C:\\Users\\taslim.siddiqui\\Downloads\\certybox_course_data.xlsx"

# Load the input workbook; its 'Course Link' column drives the scraping loop.
df = pd.read_excel(io=input_path)

# Collects one dict per scraped course, in input order.
course_data = []

# Matches an hour count such as "12 hours", "3 hrs" or "1.5 hours".  The
# optional fractional part keeps "1.5 hours" from being read as "5 hours".
# Compiled once here rather than on every loop iteration.
_DURATION_RE = re.compile(r'(\d+(?:\.\d+)?)\s*(?:hours?|hrs?)', re.IGNORECASE)

# Fixed notes that follow the price in every "Fee Structure" cell.
_FEE_NOTES = (
    "-All other fees remain unchanged",
    "-Education loans are available through leading banks and NBFCs.",
    "-EMI options are also available for your convenience.",
)


def _scrape_course(course_url: str) -> dict:
    """Download one course page and return its extracted fields.

    Args:
        course_url: Address of the course page to fetch.

    Returns:
        A dict with one entry per output column.  A field whose element is
        not found on the page is left as an empty string.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status code.
    """
    response = requests.get(course_url, timeout=15)
    # Without this check an error page (404, 500, ...) would be parsed like a
    # real course page and silently produce a row of empty fields.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Course Name
    course_name_tag = soup.find("h1", class_="tutor-course-header-h1")
    course_name = course_name_tag.text.strip() if course_name_tag else ""

    # About Course
    about_div = soup.find("div", class_="tutor-course-content-content")
    about_course = about_div.get_text(separator=" ", strip=True) if about_div else ""

    # Course Syllabus (modules + lessons)
    # NOTE(review): find_next_siblings() only returns elements that share a
    # parent with the module title <div>.  If the lesson <span>s are nested
    # elsewhere in the page, no lessons are captured -- confirm against the
    # live markup.
    syllabus = []
    for mod in soup.find_all("div", class_="tutor-course-title"):
        h4 = mod.find("h4")
        if h4:
            syllabus.append(h4.text.strip())
        syllabus.extend(
            sib.text.strip()
            for sib in mod.find_next_siblings("span", class_="lesson-preview-title")
        )
    course_syllabus = "\n".join(syllabus)

    # Certificate Image: first <img> inside the element with id
    # "tutor-course-tab-reviews".
    # NOTE(review): confirm that this container really holds the certificate.
    cert_img = ""
    cert_div = soup.find("div", id="tutor-course-tab-reviews")
    if cert_div:
        img_tag = cert_div.find("img")
        if img_tag and img_tag.get("src"):
            cert_img = img_tag["src"]

    # Course Duration: only hour counts are recognised; anything else leaves
    # all three duration fields empty.
    course_duration = ""
    duration_time = ""
    duration_medium = ""
    dur_div = soup.find("div", class_="tutor-course-duration")
    if dur_div:
        match = _DURATION_RE.search(dur_div.get_text(strip=True))
        if match:
            duration_time = match.group(1).strip()
            duration_medium = "Hours"
            course_duration = f"{duration_time} {duration_medium}"

    # Language
    # NOTE(review): this takes the first div.meta-value on the page --
    # confirm that it is always the language entry.
    lang_div = soup.find("div", class_="meta-value")
    language = " ".join(lang_div.stripped_strings) if lang_div else ""

    # Selling Price: drop thousands separators and the currency sign, then
    # keep only the whole-rupee part.
    selling_price = ""
    price_tag = soup.find("span", class_="woocommerce-Price-amount amount")
    if price_tag:
        price_text = price_tag.text.replace(",", "").replace("₹", "").strip()
        selling_price = f"₹{price_text.split('.')[0]}"

    # Fee Structure: the price line is included only when a price was found,
    # so a missing price no longer leaves a blank first line in the cell.
    fee_lines = list(_FEE_NOTES)
    if selling_price:
        fee_lines.insert(0, selling_price)
    fee_structure = "\n".join(fee_lines)

    # Target Audience
    target_audience = ""
    target_div = soup.find("div", class_="tutor-course-target-audience-content")
    if target_div:
        target_audience = "\n".join(
            li.text.strip() for li in target_div.find_all("li")
        )

    return {
        "Course Name": course_name,
        "Course Link": course_url,
        "About Course": about_course,
        "Course Syllabus": course_syllabus,
        "Certificate Image": cert_img,
        "Course Duration": course_duration,
        "Duration Medium": duration_medium,
        "Duration Time": duration_time,
        "Language": language,
        "Selling Price": selling_price,
        "Fee Structure": fee_structure,
        "Target Audience": target_audience,
        "Course provider platform Name": platform_name,
        "About Course provided platform": about_platform,
    }


# Only the link column is needed, so iterate it directly instead of building
# a full row object for every record.
for index, course_url in df['Course Link'].items():
    # An empty spreadsheet cell arrives as NaN; skip it with a clear message
    # rather than letting the HTTP client reject it as an invalid URL.
    if pd.isna(course_url) or not str(course_url).strip():
        print(f"⚠️ Skipping row {index}: no course link.")
        continue
    try:
        course_data.append(_scrape_course(course_url))
    except Exception as e:
        # Deliberate best-effort boundary: one bad page is reported and
        # skipped so the remaining courses are still processed.
        print(f"❌ Error processing {course_url}: {e}")

# Write the collected rows to the output workbook (one row per scraped
# course, without the DataFrame index column) and report how many were saved.
df_out = pd.DataFrame(data=course_data)
df_out.to_excel(excel_writer=output_file, index=False)
print(f"✅ Extracted {len(df_out)} course entries.\n📁 Saved to: {output_file}")


✅ Extracted 1 course entries.
📁 Saved to: C:\Users\taslim.siddiqui\Downloads\certybox_course_data.xlsx
