Course link Extraction

In [15]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

def extract_pw_courses(html_content, base_url="https://www.pw.live"):
    """Extract ALL course names and links from Physics Wallah HTML"""
    soup = BeautifulSoup(html_content, 'html.parser')
    courses = []
    
    # Look for course links in multiple potential locations
    # 1. Main navigation menu items
    nav_links = soup.select('nav a[href], .flex a[href], .flex-col a[href]')
    
    # 2. Course category sections
    course_sections = soup.select('a[href*="/batches"], a[href*="/online-course"]')
    
    # 3. Exam category links
    exam_links = soup.select('a[href*="/iit-jee"], a[href*="/neet"], a[href*="/upsc"], a[href*="/gate"]')
    
    # 4. Footer course links
    footer_links = soup.select('footer a[href]')
    
    # Combine all potential course links
    all_links = nav_links + course_sections + exam_links + footer_links
    
    for link in all_links:
        href = link.get('href', '').strip()
        text = link.get_text(strip=True)
        
        # Skip if no href or text is too short
        if not href or len(text) < 2:
            continue
            
        # Skip non-course links
        if (href.startswith(('#', 'mailto:', 'tel:')) or
            any(x in href for x in ['/about', '/contact', '/privacy', '/terms', 'facebook', 'twitter', 
                                   'instagram', 'youtube', 'linkedin', '.jpg', '.png', '.webp'])):
            continue
        
        # Make sure it's a course-related link
        if (any(x in href for x in ['/batches', '/online-course', '/iit-jee', '/neet', '/upsc', '/gate',
                                  '/ssc', '/banking', '/teaching', '/defence', '/ca', '/olympiad', '/mba',
                                  '/commerce', '/cuet', '/ae-je', '/law', '/ese-gate', '/ipmat', '/ielts']) or
            any(x in text.lower() for x in ['batch', 'course', 'program', 'coaching', 'exam', 'preparation'])):
            
            full_url = urljoin(base_url, href)
            
            courses.append({
                "Course Name": re.sub(r'\s+', ' ', text),
                "Course Link": full_url
            })
    
    # Remove duplicates
    df = pd.DataFrame(courses).drop_duplicates(subset=["Course Link", "Course Name"])
    return df.sort_values("Course Name").reset_index(drop=True)

def main():
    # Load HTML file
    with open(r"C:\Users\taslim.siddiqui\Downloads\Physics wallah Live Courses for JEE, NEET & Class 6,7,8,9,10,11,12 _ NCERT Solutions - Physics Wallah.html", "r", encoding="utf-8") as f:
        html = f.read()
    
    # Extract courses
    df = extract_pw_courses(html)
    
    # Save to Excel
    output_path = "C:\\Users\\taslim.siddiqui\\Downloads\\PhysicsWallah_Courses_Complete.xlsx"
    df.to_excel(output_path, index=False)
    
    print(f"✅ Extracted {len(df)} courses from Physics Wallah!")
    print(f"📂 Saved to: {output_path}\n")
    print("Sample Courses:")
    print(df.head(15).to_string(index=False))

if __name__ == "__main__":
    main()

✅ Extracted 78 courses from Physics Wallah!
📂 Saved to: C:\Users\taslim.siddiqui\Downloads\PhysicsWallah_Courses_Complete.xlsx

Sample Courses:
                     Course Name                                      Course Link
                       ACCA Exam                         https://www.pw.live/acca
                           AE/JE                        https://www.pw.live/ae-je
        Agniveer Online Coaching     https://www.pw.live/defence/agniveer/batches
Agriculture Exam Online Coaching          https://www.pw.live/agriculture/batches
               Agriculture Exams                  https://www.pw.live/agriculture
            BPSC Online Coaching       https://www.pw.live/state-psc/bpsc/batches
       Bank Exam Online Coaching              https://www.pw.live/banking/batches
                    Banking Exam                      https://www.pw.live/banking
      Bihar Exam Online Coaching                https://www.pw.live/bihar/batches
                         CA Exam    

In [1]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# -------------------- DRIVER SETUP --------------------
def get_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- SCRAPER --------------------
def scrape_course_data(driver, url):
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # wait for body
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # -------- Course Name --------
        course_name = "Course name not available"
        h1_tag = soup.find("h1")
        if h1_tag:
            course_name = h1_tag.get_text(strip=True)

        # -------- Learning Mode --------
        learning_mode = "Learning mode not available"
        try:
            mode_el = driver.find_element(
                By.XPATH,
                "//span[contains(text(),'Live') or contains(text(),'Online') or contains(text(),'Offline')]"
            )
            learning_mode = mode_el.text.strip()
        except:
            pass

        # -------- Features (like "Live session with PW Star Faculty") --------
        features = []
        try:
            feature_tags = soup.find_all("div", class_="Counselling_featureItem__791_G")
            for f in feature_tags:
                text_span = f.find("span")
                if text_span:
                    features.append(text_span.get_text(strip=True))
        except:
            pass

        return {
            "Course Name": course_name,
            "Learning Mode": learning_mode,
            "Features": "; ".join(features) if features else "Not available",
            "URL": url
        }

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return None

# -------------------- MAIN --------------------
if __name__ == "__main__":
    course_urls = [
        "https://www.pw.live/offline-centres/batches/vidyapeeth/kota-rj/kota-vidyapeeth-072989/neet-coaching/vidyapeeth-11th-neet--target-2026--969326",
        "https://www.pw.live/offline-centres/batches/vidyapeeth/agra-up/agra-vidyapeeth--pathshala-centre--375574/iit-jee-coaching/vidyapeeth-dropper-jee--target-2025--109274"
    ]

    print(f"🚀 Starting scraping process for {len(course_urls)} links...")
    driver = get_driver()
    results = []

    for url in course_urls:
        data = scrape_course_data(driver, url)
        if data:
            results.append(data)

    driver.quit()
    print("🚪 Browser closed")

    if results:
        df = pd.DataFrame(results)
        df.to_excel(r"C:\Users\taslim.siddiqui\Downloads\course_data.xlsx", index=False)
        print("\n✅ Data saved to course_data.xlsx")
        print(df)


🚀 Starting scraping process for 2 links...
🌐 Accessing URL: https://www.pw.live/offline-centres/batches/vidyapeeth/kota-rj/kota-vidyapeeth-072989/neet-coaching/vidyapeeth-11th-neet--target-2026--969326
🌐 Accessing URL: https://www.pw.live/offline-centres/batches/vidyapeeth/agra-up/agra-vidyapeeth--pathshala-centre--375574/iit-jee-coaching/vidyapeeth-dropper-jee--target-2025--109274
🚪 Browser closed

✅ Data saved to course_data.xlsx
                            Course Name  \
0    Vidyapeeth 11th NEET (Target 2026)   
1  Vidyapeeth DROPPER JEE (Target 2025)   

                                       Learning Mode  \
0  Smart Interactive Classroom Configuration for ...   
1  7. Vidyapeeth Centers Offline Counseling Services   

                                            Features  \
0  Live session with PW Star Faculty; Get tips & ...   
1  Live session with PW Star Faculty; Get tips & ...   

                                                 URL  
0  https://www.pw.live/offline-centres/ba

In [9]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# -------------------- DRIVER SETUP --------------------
def get_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    driver = uc.Chrome(options=options)  # auto detect Chrome version
    return driver

# -------------------- SCRAPER --------------------
def scrape_course_data(driver, url):
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # wait for body
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # -------- Course Name --------
        course_name = "Course name not available"
        h1_tag = soup.find("h1")
        if h1_tag:
            course_name = h1_tag.get_text(strip=True)

        # -------- Learning Mode --------
        learning_mode = "Learning mode not available"
        try:
            mode_els = driver.find_elements(
                By.XPATH,
                "//span[contains(text(),'Live') or contains(text(),'Online') or contains(text(),'Offline')]"
            )
            if mode_els:
                learning_mode = "; ".join([el.text.strip() for el in mode_els if el.text.strip()])
        except Exception as e:
            print(f"⚠️ Error extracting learning mode: {e}")

        # -------- Features (like "Live session with PW Star Faculty") --------
        features = []
        try:
            feature_tags = soup.select("div.Counselling_featureItem__791_G span")
            features = [f.get_text(strip=True) for f in feature_tags]
        except Exception as e:
            print(f"⚠️ Error extracting features: {e}")
        features = "; ".join(features) if features else "Not available"

        # -------- Price --------
        price = "Price not available"
        try:
            # Find any h4 that contains the rupee symbol
            price_tag = soup.find("h4", string=lambda t: t and "₹" in t)
            if price_tag:
                price = price_tag.get_text(strip=True)
        except Exception as e:
            print(f"⚠️ Error extracting price: {e}")

        return {
            "Course Name": course_name,
            "Learning Mode": learning_mode,
            "Features": features,
            "Price": price,   # e.g. "₹ 1,45,000"
            "URL": url
        }

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return None

# -------------------- MAIN --------------------
if __name__ == "__main__":
    course_urls = [
        "https://www.pw.live/offline-centres/batches/vidyapeeth/kota-rj/kota-vidyapeeth-072989/neet-coaching/vidyapeeth-11th-neet--target-2026--969326",
        "https://www.pw.live/offline-centres/batches/vidyapeeth/agra-up/agra-vidyapeeth--pathshala-centre--375574/iit-jee-coaching/vidyapeeth-dropper-jee--target-2025--109274"
    ]

    print(f"🚀 Starting scraping process for {len(course_urls)} links...")
    driver = get_driver()
    results = []

    for url in course_urls:
        data = scrape_course_data(driver, url)
        if data:
            results.append(data)

    driver.quit()
    print("🚪 Browser closed")

    if results:
        df = pd.DataFrame(results)
        df.to_excel(r"C:\Users\taslim.siddiqui\Downloads\course_data.xlsx", index=False)
        print("\n✅ Data saved to course_data.xlsx")
        print(df)


🚀 Starting scraping process for 2 links...
🌐 Accessing URL: https://www.pw.live/offline-centres/batches/vidyapeeth/kota-rj/kota-vidyapeeth-072989/neet-coaching/vidyapeeth-11th-neet--target-2026--969326
🌐 Accessing URL: https://www.pw.live/offline-centres/batches/vidyapeeth/agra-up/agra-vidyapeeth--pathshala-centre--375574/iit-jee-coaching/vidyapeeth-dropper-jee--target-2025--109274
🚪 Browser closed

✅ Data saved to course_data.xlsx
                            Course Name  \
0             Course name not available   
1  Vidyapeeth DROPPER JEE (Target 2025)   

                                       Learning Mode  \
0                        Learning mode not available   
1  7. Vidyapeeth Centers Offline Counseling Servi...   

                                            Features                Price  \
0                                      Not available  Price not available   
1  Live session with PW Star Faculty; Get tips & ...  Price not available   

                                 