In [1]:
pip install requests beautifulsoup4 pandas selenium webdriver-manager




In [None]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

url = "https://skillsbuild.org/college-students/course-catalog"
headers = {
    "User-Agent": "Mozilla/5.0"
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns 4xx/5xx answers into an immediate, readable error
# instead of letting an error page flow into the parser below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Locate the JSON data embedded in the page.
# soup.find returns None when the tag is absent, so check before dereferencing.
next_data_script = soup.find("script", id="__NEXT_DATA__")
if next_data_script is None or not next_data_script.string:
    raise RuntimeError(
        f"No <script id='__NEXT_DATA__'> payload found at {url}; the page layout may have changed."
    )
next_data_json = json.loads(next_data_script.string)

In [None]:
# Show the top-level keys of the embedded page payload, one per line.
for top_level_key in next_data_json.keys():
    print(top_level_key)

In [None]:
# Pretty-print the pageProps subtree and show only its first 3000 characters.
page_props_pretty = json.dumps(next_data_json["props"]["pageProps"], indent=2)
print(page_props_pretty[:3000])

In [None]:
# The path to the badge sections is indexed strictly on purpose: if the page
# structure changes, a KeyError here is the clearest possible signal.
badges = next_data_json["props"]["pageProps"]["page"]["rebrandBadgePageFields"]["badgeSection"]

# Extract course info.
# `x.get(key) or {}` is used instead of `x.get(key, {})` because the payload is
# JSON: a key that is present with a null value would return None from
# `.get(key, {})` and break the next `.get` in the chain.
courses = []
for section in badges or []:
    for badge in (section or {}).get("badges") or []:
        nodes = (badge.get("collegeStudentBadge") or {}).get("nodes") or []
        for node in nodes:
            fields = node.get("rebrandBadgeFields") or {}
            image_node = (fields.get("image") or {}).get("node") or {}
            courses.append({
                "title": node.get("title"),
                "slug": node.get("slug"),
                "description": fields.get("description"),
                "duration": fields.get("duration"),
                "link": fields.get("link"),
                "image": image_node.get("sourceUrl"),
            })

# Preview one course (guarded so an empty result does not raise IndexError)
if courses:
    print(courses[0])
else:
    print("No courses were found in the page data.")


In [None]:
import csv

# Assuming `courses` is your list of course dictionaries from before
csv_filename = "skillsbuild_courses.csv"

# Column order for the CSV; these are the keys of each course dictionary.
# Deliberately NOT named `headers`: that name already holds the HTTP request
# headers dict used by requests.get(), and rebinding it to a list would break
# any later re-run of the request.
csv_fieldnames = ["title", "slug", "description", "duration", "link", "image"]

# Write to CSV (newline='' lets the csv module control line endings itself)
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_fieldnames)
    writer.writeheader()
    writer.writerows(courses)

print(f"Courses exported to {csv_filename}")


In [None]:
import pandas as pd
import re # Import regex module for more flexible string operations

# Mojibake sequences and HTML fragments seen in the scraped text, paired with
# the text that should replace them. Applied in this order.
_TEXT_REPLACEMENTS = (
    ('â„¢', 'TM'),
    ('â€™', "'"),
    ('Â®', '®'),
    ('<p>', ''),
    ('</p>', ''),
    ('&#8217;', "'"),
)

# Full unit words searched for anywhere in a duration string.
_UNIT_KEYWORDS = (
    ('minute', 'minutes'),
    ('hour', 'hours'),
    ('day', 'days'),
    ('week', 'weeks'),
    ('month', 'months'),
    ('year', 'years'),
)

# Abbreviation prefixes, used only for the word that directly follows a number.
_UNIT_ABBREVIATIONS = (
    ('hr', 'hours'),
    ('min', 'minutes'),
    ('wk', 'weeks'),
    ('day', 'days'),
    ('mo', 'months'),
    ('yr', 'years'),
)


def _apply_replacements(text):
    """Return `text` with every known mojibake/HTML fragment replaced; non-strings pass through."""
    if isinstance(text, str):
        for broken, fixed in _TEXT_REPLACEMENTS:
            text = text.replace(broken, fixed)
    return text


def _extract_duration_unit(duration_str):
    """Return the normalized time unit of a duration string, or None if there is none.

    When several unit words are present, the one that appears FIRST in the
    string wins, so the unit lines up with the first number (which is what the
    numeric `duration` column keeps): "2 hours 30 minutes" -> "hours".
    """
    lowered = str(duration_str).lower()
    hits = [(lowered.find(keyword), unit) for keyword, unit in _UNIT_KEYWORDS if keyword in lowered]
    if hits:
        return min(hits)[1]  # smallest position == earliest occurrence

    # No full unit word: fall back to the word right after the first number.
    match = re.search(r'\d+\s*([a-zA-Z]+)', lowered)
    if match:
        token = match.group(1)
        for prefix, unit in _UNIT_ABBREVIATIONS:
            if token.startswith(prefix):
                return unit
        return token  # unknown word: keep it as-is rather than dropping it
    return None


def _extract_duration_digits(duration_str):
    """Return the first number in the string, truncated to its first 3 digits, or None."""
    match = re.search(r'\d+', str(duration_str))
    return int(match.group()[:3]) if match else None


def clean_skillsbuild_courses(input_csv_path="skillsbuild_courses.csv", output_csv_path="skillsbuild_courses_cleaned.csv"):
    """
    Cleans the SkillsBuild course data from a CSV file and writes the result
    to a new CSV file.

    Steps: drop rows without a title or slug, blank out missing text fields,
    replace known mojibake / HTML fragments, strip whitespace, drop duplicate
    (title, link) pairs, derive a `unit_duration` column, reduce `duration` to
    an integer (first number, at most 3 digits, 0 when absent), blank out
    links that are not http(s), and drop rows whose title is empty.

    Args:
        input_csv_path (str): Path to the input CSV file.
        output_csv_path (str): Path to save the cleaned CSV file.

    Returns:
        pandas.DataFrame | None: The cleaned frame, or None when the input
        file is missing or cleaning failed (the error is printed, not raised).
    """
    try:
        df = pd.read_csv(input_csv_path)
        print(f"Original DataFrame shape: {df.shape}")
        print("Original DataFrame head:")
        print(df.head())
        print("\nOriginal DataFrame info:")
        df.info()

        # --- Cleaning Steps ---

        # 1. Handle missing values. `.copy()` gives an independent frame so the
        #    column assignments below never hit SettingWithCopyWarning.
        df = df.dropna(subset=['title', 'slug']).copy()
        for col in ('description', 'duration', 'link', 'image'):
            df[col] = df[col].fillna('')
        # Keep the raw duration text: both the unit and the number are parsed from it.
        df['original_duration_str'] = df['duration'].astype(str)
        print(f"\nDataFrame shape after dropping rows with missing 'title' or 'slug': {df.shape}")

        # 2. Replace specific characters / HTML fragments in the text columns.
        #    No dtype gate: _apply_replacements already ignores non-strings, and
        #    a `dtype == 'object'` check would skip pandas' dedicated string dtypes.
        for col in ['title', 'slug', 'description', 'original_duration_str', 'link', 'image']:
            if col in df.columns:
                df[col] = df[col].apply(_apply_replacements)
        print("\nApplied specific character and HTML tag replacements.")

        # 3. Strip whitespace AFTER the replacements (removing <p> can expose
        #    padding) and BEFORE de-duplication (so " A" and "A" match).
        df['title'] = df['title'].astype(str).str.strip()
        df['description'] = df['description'].astype(str).str.strip()

        # 4. Remove duplicates based on 'title' and 'link'.
        df = df.drop_duplicates(subset=['title', 'link']).copy()
        print(f"DataFrame shape after dropping duplicates: {df.shape}")

        # 5. New column: unit_duration.
        df['unit_duration'] = df['original_duration_str'].apply(_extract_duration_unit)
        print("\nCreated 'unit_duration' column.")

        # 6. Numeric duration: first number, first 3 digits only, 0 when missing.
        duration_numbers = df['original_duration_str'].apply(_extract_duration_digits)
        df['duration'] = pd.to_numeric(duration_numbers, errors='coerce').fillna(0).astype('Int64')
        print("\nCleaned 'duration' column to keep only first 3 digits and convert to number.")

        # 7. Validate 'link' format: anything that is not http(s) is blanked.
        df['link'] = df['link'].astype(str).apply(lambda x: x if x.startswith(('http://', 'https://')) else '')

        # 8. Remove rows where 'title' is empty after stripping.
        df = df[df['title'] != '']
        print(f"DataFrame shape after removing rows with empty 'title': {df.shape}")

        # --- Remove the temporary 'original_duration_str' column ---
        df = df.drop(columns=['original_duration_str'])
        print("\nDropped temporary 'original_duration_str' column.")

        # --- Save the Cleaned Data ---
        df.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"\nCleaned data exported to {output_csv_path}")
        print("\nCleaned DataFrame head:")
        print(df.head())
        print("\nCleaned DataFrame info:")
        df.info()
        return df

    except FileNotFoundError:
        print(f"Error: The file '{input_csv_path}' was not found. Please ensure the CSV file exists.")
    except Exception as e:
        print(f"An error occurred during cleaning: {e}")
    return None


**Data Science Courses**

In [2]:
pip install webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [3]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import re
import json
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome options
# No arguments are added here, so Chrome is launched with its default settings.
options = Options()

# Use ChromeDriverManager to automatically handle the ChromeDriver executable
# install() returns the path of the driver binary, which Service wraps for Selenium.
s = Service(ChromeDriverManager().install())

# Initialize the Chrome driver (only once)
# This single browser session is shared by the login step and every page scraped below.
driver = webdriver.Chrome(service=s, options=options)

# Add a WebDriverWait instance
# Shared explicit wait: each wait.until(...) call polls for up to 30 seconds
# before raising TimeoutException.
wait = WebDriverWait(driver, 30)

def handle_cookie_consent(driver, wait, button_labels=("Accept all", "Required only"), timeout=None):
    """
    Attempts to dismiss a cookie consent banner by clicking the first available button.

    Buttons are tried in the order given by `button_labels`; matching is
    case-insensitive on the button's text content.

    Args:
        driver: Selenium WebDriver showing the page.
        wait: WebDriverWait used to wait for each button when `timeout` is None.
        button_labels: Button captions to try, in priority order. Labels must
            not contain a single quote, because they are embedded in an XPath
            string literal.
        timeout: Optional number of seconds to wait per button. When given, a
            dedicated WebDriverWait(driver, timeout) is used instead of `wait`,
            so pages without a banner do not cost the full shared wait per label.

    Returns:
        bool: True if a button was clicked, False otherwise. Failures are
        printed, never raised, because a missing banner is a normal situation.
    """
    print("Attempting to handle cookie consent banner...")
    banner_wait = wait if timeout is None else WebDriverWait(driver, timeout)
    # XPath 1.0 has no lower-case(); translate() is the standard workaround.
    lowercase_text = "translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"

    for label in button_labels:
        locator = (By.XPATH, f"//button[contains({lowercase_text}, '{label.lower()}')]")
        try:
            banner_wait.until(EC.element_to_be_clickable(locator)).click()
        except TimeoutException:
            print(f"No '{label}' button found or it was not clickable within timeout.")
        except Exception as e:
            print(f"Error clicking '{label}' button: {e}")
        else:
            print(f"Clicked '{label}' on cookie banner.")
            time.sleep(2)  # give the banner time to disappear before the page is used
            return True

    print("Cookie banner not handled (may not be present or different structure).")
    return False

# --- Login Process ---
# Log in once; the authenticated browser session is then reused for every course page.
from getpass import getpass  # prompts without echoing the typed text into the notebook output

# NOTE(review): this first navigation looks like a browser warm-up only; nothing
# below reads anything from this page.
driver.get('http://www.google.com/')
time.sleep(1)

print("Navigating to initial IBM SkillsBuild login page...")
driver.get("https://sb-auth.skillsbuild.org/login?ngo-id=0302")
time.sleep(3)
handle_cookie_consent(driver, wait)

print("Attempting to click 'Log in with Email' button...")
try:
    email_login_button_locator = (By.XPATH, "//a[@data-attribute1='Log in with Email']")
    email_login_button = wait.until(EC.element_to_be_clickable(email_login_button_locator))
    email_login_button.click()
    print("Clicked 'Log in with Email' button.")
    time.sleep(5)
except Exception as e:
    print(f"Failed to click 'Log in with Email' button: {e}")
    driver.quit()
    # Re-raise instead of exit(): in Jupyter, exit() shuts the kernel down,
    # while a raised exception just stops execution and shows the traceback.
    raise

user_email = input("Please enter your email: ")
# getpass instead of input(): input() would leave the password visible in the
# saved cell output.
user_password = getpass("Please enter your password: ")

try:
    print("Attempting to find and fill email field...")
    email_field = wait.until(EC.presence_of_element_located((By.ID, "ibm-label-0")))
    email_field.send_keys(user_email)
    print("Email entered.")

    print("Attempting to find and fill password field...")
    password_field = wait.until(EC.presence_of_element_located((By.ID, "ibm-label-1")))
    password_field.send_keys(user_password)
    print("Password entered.")

    print("Attempting to find and click submit button...")
    submit_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@type='submit']")))
    submit_button.click()
    print("Submit button clicked.")

    # The form submission is followed by a fixed pause only; success of the
    # login itself is not verified here.
    time.sleep(5)
    print("Login process complete. Proceeding to target pages.")

except Exception as e:
    print(f"An error occurred during login form submission: {e}")
    driver.quit()
    raise


# --- Learning plans to scrape, in visiting order ---
target_urls = list((
    # Getting Started with Data
    "https://skills.yourlearning.ibm.com/activity/PLAN-14F2691E3A32?ngo-id=0302",
    # Data Science Foundations
    "https://skills.yourlearning.ibm.com/activity/PLAN-BC0FAEE8E439?ngo-id=0302",
    # Data Analytics
    "https://skills.yourlearning.ibm.com/activity/PLAN-0D62D9A52C35?ngo-id=0302&_gl=1*rcdm7y*_ga*NjUyOTg1MDUyLjE3NDkxMjA0MTI.*_ga_FYECCCS21D*czE3NDkxMjc2ODMkbzIkZzAkdDE3NDkxMjc2ODMkajYwJGwwJGgw",
    # AI Foundations
    "https://skills.yourlearning.ibm.com/activity/PLAN-D8E7C82C1D76?ngo-id=0302&_gl=1*1ijk7w0*_ga*NjUyOTg1MDUyLjE3NDkxMjA0MTI.*_ga_FYECCCS21D*czE3NDkxMjc2ODMkbzIkZzAkdDE3NDkxMjc2ODQkajU5JGwwJGgw",
))

# --- Accumulators shared by all learning plans (four independent, initially empty lists) ---
all_general_info_data, all_courses_data, all_comments_data, all_rating_breakdown_data = [], [], [], []

# --- Loop through each URL and scrape ---
for i, url in enumerate(target_urls):
    print(f"\n--- Scraping Course {i+1} of {len(target_urls)}: {url} ---")
    driver.get(url)
    handle_cookie_consent(driver, wait) # Re-check for cookies on new page load

    scraped_data = {} # Reset scraped_data for each URL
    
    # Add a unique identifier for each course, e.g., its URL or title
    scraped_data['course_url'] = url

    try:
        # Wait for a prominent element on the page to ensure it's loaded
        wait.until(EC.presence_of_element_located((By.XPATH, "//h1[contains(@class, 'FullPageHeader_fullPageHeader__title__')]/span")))
        print("Activity page loaded successfully.")
        time.sleep(5) # Give it a bit more time for dynamic content
    except TimeoutException:
        print(f"Timed out waiting for activity page {url} to load. Skipping this URL.")
        continue # Skip to the next URL
    except Exception as e:
        print(f"An unexpected error occurred while loading page {url}: {e}. Skipping this URL.")
        continue

    # --- Start scraping individual elements for the current URL ---

    # 1. Scrape Learning Plan Title
    try:
        title_element = driver.find_element(By.XPATH, "//h1[contains(@class, 'FullPageHeader_fullPageHeader__title__')]/span")
        scraped_data['learning_plan_title'] = title_element.text.strip()
        print(f"Scraped Title: {scraped_data['learning_plan_title']}")
    except NoSuchElementException:
        print("Could not find Learning Plan Title.")
        scraped_data['learning_plan_title'] = "N/A"

    # 2. Scrape overall duration, learners amount, and overall rating from header
    try:
        header_info_div = driver.find_element(By.XPATH, "//div[contains(@class, 'FullPageHeader_fullPageHeader__info__')]")
        try:
            duration_element = header_info_div.find_element(By.XPATH, ".//div[contains(@class, 'Time_container__')]/div[contains(@id, 'a11y-undefined-time')]/span[not(contains(@class, 'sr-only'))]")
            scraped_data['overall_duration'] = duration_element.text.strip()
        except NoSuchElementException:
            scraped_data['overall_duration'] = "N/A"

        try:
            learners_element = header_info_div.find_element(By.XPATH, ".//div[contains(@class, 'LearnersAmount_learnersAmount__')]")
            learners_title = learners_element.get_attribute('title')
            scraped_data['overall_learners_amount_text'] = learners_title.strip() if learners_title else "N/A"
            match = re.search(r'(\d[\d,\.]*) learners', scraped_data['overall_learners_amount_text'])
            if match:
                scraped_data['overall_learners_amount_numeric'] = match.group(1).replace('.', '').replace(',', '')
            else:
                scraped_data['overall_learners_amount_numeric'] = "N/A"
        except NoSuchElementException:
            scraped_data['overall_learners_amount_text'] = "N/A"
            scraped_data['overall_learners_amount_numeric'] = "N/A"

        try:
            overall_rating_element = header_info_div.find_element(By.XPATH, ".//div[contains(@class, 'Stars_starRating__C3-hw')]")
            overall_rating_title = overall_rating_element.get_attribute('title')
            scraped_data['overall_rating_header_text'] = overall_rating_title.strip() if overall_rating_title else "N/A"
            rating_match = re.search(r'Average rating of ([\d\.]+) stars by (\d+) learners', scraped_data['overall_rating_header_text'])
            if rating_match:
                scraped_data['overall_average_rating_from_header'] = rating_match.group(1)
                scraped_data['overall_learners_rated_from_header'] = rating_match.group(2)
            else:
                scraped_data['overall_average_rating_from_header'] = "N/A"
                scraped_data['overall_learners_rated_from_header'] = "N/A"
        except NoSuchElementException:
            scraped_data['overall_rating_header_text'] = "N/A"
            scraped_data['overall_average_rating_from_header'] = "N/A"
            scraped_data['overall_learners_rated_from_header'] = "N/A"

        print(f"Scraped Overall Duration: {scraped_data.get('overall_duration')}")
        print(f"Scraped Overall Learners: {scraped_data.get('overall_learners_amount_text')} (Numeric: {scraped_data.get('overall_learners_amount_numeric')})")
        print(f"Scraped Overall Rating (Header): {scraped_data.get('overall_rating_header_text')}")
    except Exception as e:
        print(f"Error scraping header info: {e}")
        scraped_data['overall_duration'] = "N/A"
        scraped_data['overall_learners_amount_text'] = "N/A"
        scraped_data['overall_learners_amount_numeric'] = "N/A"
        scraped_data['overall_rating_header_text'] = "N/A"
        scraped_data['overall_average_rating_from_header'] = "N/A"
        scraped_data['overall_learners_rated_from_header'] = "N/A"

    # 3. Scrape "About this learning plan" description
    try:
        description_content_div = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//div[contains(@class, 'FullPageDescription_wrapper__')]/div[contains(@class, 'FullPageDescription_content__')]")
        ))
        scraped_data['about_learning_plan'] = description_content_div.text.strip()
        print(f"Scraped 'About' description: {scraped_data['about_learning_plan'][:100]}...")
    except NoSuchElementException:
        print("Could not find 'About this learning plan' description content.")
        scraped_data['about_learning_plan'] = "N/A"
    except Exception as e:
        print(f"Error scraping 'About this learning plan' description: {e}")
        scraped_data['about_learning_plan'] = "N/A"

    # 4. Scrape Courses and their details
    current_course_list = [] # Store courses for the current URL
    try:
        print("\n--- Attempting to scrape Courses section ---")
        courses_section_parent_locator = (By.ID, "learningPlanSectionSECTION-B")
        
        # New: More direct locator for the button based on the HTML you provided
        # The button has role="button" implicitly or explicitly, and contains the title "Course: ..."
        # Let's try to find it by its title or direct relation to the parent li
        courses_accordion_button_locator = (By.XPATH, 
            "//div[@id='learningPlanSectionSECTION-B']//li[contains(@class, 'bx-yl--accordion__item')]//button[contains(@class, 'bx-yl--accordion__heading')]"
        )

        courses_section_found_and_handled = False 

        try:
            print(f"Waiting for courses section parent '{courses_section_parent_locator[1]}' to be present...")
            courses_section_parent = wait.until(EC.presence_of_element_located(courses_section_parent_locator))
            print("Found courses section parent.")
            driver.execute_script("arguments[0].scrollIntoView(true);", courses_section_parent)
            print("Scrolled entire 'Courses' section parent into view (top alignment).")
            time.sleep(3)

            print("Attempting to find 'Courses' accordion button...")
            courses_accordion_button = wait.until(EC.presence_of_element_located(courses_accordion_button_locator))
            print("Found 'Courses' accordion button.")
            
            # Check aria-expanded state directly
            if courses_accordion_button.get_attribute("aria-expanded") == "false":
                print("Accordion is collapsed, attempting to click to expand.")
                # Ensure it's clickable before clicking
                wait.until(EC.element_to_be_clickable(courses_accordion_button_locator)).click()
                print("Clicked to expand 'Courses' accordion.")
                time.sleep(5)
            else:
                print("'Courses' accordion is already expanded.")
                time.sleep(2)
            
            courses_section_found_and_handled = True

        except TimeoutException:
            print("Timeout: 'Courses' accordion button not found within timeout.")
            # Even if the button isn't clickable, the content might still be there if it's already expanded
            # So, we don't necessarily set courses_section_found_and_handled to False here yet.
            # We will try to find content anyway.
            pass # Keep it False for now if button isn't clickable
        except StaleElementReferenceException:
            print("Stale element reference for 'Courses' accordion button. Retrying to find.")
            try:
                courses_accordion_button = wait.until(EC.presence_of_element_located(courses_accordion_button_locator))
                driver.execute_script("arguments[0].scrollIntoView(true);", courses_accordion_button)
                time.sleep(3)
                if courses_accordion_button.get_attribute("aria-expanded") == "false":
                    wait.until(EC.element_to_be_clickable(courses_accordion_button_locator)).click()
                    print("Clicked to expand 'Courses' accordion after stale error.")
                    time.sleep(5)
                courses_section_found_and_handled = True
            except Exception as e:
                print(f"Failed to handle stale element for Courses accordion on retry: {e}")
                # If retry also fails, then it truly wasn't handled.
                scraped_data['courses_section_status'] = "Failed to handle stale accordion button"
                courses_section_found_and_handled = False # Explicitly set to False here
        except Exception as e:
            print(f"An unexpected error occurred while handling Courses accordion button: {e}")
            scraped_data['courses_section_status'] = "Error during accordion button handling"
            courses_section_found_and_handled = False # Explicitly set to False here

        # Regardless of whether the accordion was clicked or not, if the parent section was found,
        # we should attempt to scrape the course cards, as they might be visible by default.
        if courses_section_parent: # Check if the parent div was found
            try:
                print("Attempting to find 'Complete X required' text...")
                # It's inside the same structure as the accordion content
                num_courses_text_element = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//div[@id='learningPlanSectionSECTION-B']//div[contains(@class, 'LearningPlanItems_description__')]/span[contains(text(), 'Complete')]")
                ))
                scraped_data['total_courses_required_text'] = num_courses_text_element.text.strip()
                print(f"Scraped Total Courses Text: {scraped_data['total_courses_required_text']}")
            except TimeoutException:
                print("Timeout: Could not find 'Complete X required' text after accordion expansion/check.")
                scraped_data['total_courses_required_text'] = "N/A"
            except Exception as e:
                print(f"Error finding 'Complete X required' text: {e}")
                scraped_data['total_courses_required_text'] = "N/A"

            try:
                print("Waiting for course cards to be visible...")
                # Use a more specific locator for the cards within the section
                course_card_containers = wait.until(EC.visibility_of_all_elements_located(
                    (By.XPATH, "//div[@id='learningPlanSectionSECTION-B']//div[contains(@class, 'ItemCard_itemCardContainer__')]")
                ))
                print(f"Found {len(course_card_containers)} course elements.")

                for j, card_container in enumerate(course_card_containers):
                    course_info = {}
                    course_info['parent_course_url'] = url # Link course to its parent URL
                    course_info['parent_course_title'] = scraped_data.get('learning_plan_title', 'N/A')
                    # ... (rest of your course scraping logic for name, duration, learners, rating) ...
                    try:
                        course_info['name'] = card_container.find_element(By.XPATH, ".//div[contains(@class, 'ItemCardComponents_searchTitle__')]/span").text.strip()
                    except NoSuchElementException:
                        course_info['name'] = "N/A"
                    
                    try:
                        duration_element = card_container.find_element(By.XPATH, ".//div[contains(@class, 'ActivityDuration_activityDuration__')]")
                        duration_text = duration_element.text.strip()
                        course_info['duration'] = re.sub(r'Duration is\s*', '', duration_text, flags=re.IGNORECASE).strip()
                    except NoSuchElementException:
                        course_info['duration'] = "N/A"

                    try:
                        learners_amount_div = card_container.find_element(By.XPATH, ".//div[contains(@class, 'LearnersAmount_learnersAmount__')]")
                        learners_text_raw = learners_amount_div.get_attribute('title')
                        course_info['learners_text'] = learners_text_raw.strip() if learners_text_raw else "N/A"
                        learners_count_span = learners_amount_div.find_element(By.XPATH, ".//span[@aria-hidden='true']")
                        course_info['learners_numeric'] = learners_count_span.text.strip().replace('.', '').replace(',', '')
                    except (NoSuchElementException, StaleElementReferenceException):
                        print(f"Warning: Could not scrape learners for course {j+1} in {url}. Skipping or setting N/A.")
                        course_info['learners_text'] = "N/A"
                        course_info['learners_numeric'] = "N/A"

                    try:
                        rating_element_div = card_container.find_element(By.XPATH, ".//div[contains(@class, 'Stars_starRating__')]")
                        rating_text_raw = rating_element_div.get_attribute('title')
                        course_info['rating_text'] = rating_text_raw.strip() if rating_text_raw else "N/A"
                        num_ratings_span = rating_element_div.find_element(By.XPATH, ".//span[contains(@class, 'Stars_numRatings__')]")
                        course_info['num_learners_rated_course'] = num_ratings_span.text.strip()
                        rating_match = re.search(r'Average rating of ([\d\.]+) stars', course_info['rating_text'])
                        if rating_match:
                            course_info['average_rating_course'] = rating_match.group(1)
                        else:
                            course_info['average_rating_course'] = "N/A"
                    except (NoSuchElementException, StaleElementReferenceException):
                        print(f"Warning: Could not scrape rating for course {j+1} in {url}. Skipping or setting N/A.")
                        course_info['rating_text'] = "N/A"
                        course_info['num_learners_rated_course'] = "N/A"
                        course_info['average_rating_course'] = "N/A"
                        
                    current_course_list.append(course_info)
                    print(f"  - Course {j+1}: Name='{course_info['name']}', Duration='{course_info['duration']}'...")
            except TimeoutException:
                print("Timeout: No course cards found after accordion expansion/check.")
            except Exception as e:
                print(f"Error iterating and scraping individual course cards: {e}")
        else:
            print("Skipping course card scraping as the parent section was not found.")

    except Exception as e:
        print(f"An unexpected error occurred during course section handling for {url}: {e}")
    
    scraped_data['courses'] = current_course_list # Assign current courses to scraped_data

    # 5. Scrape Tags
    current_tags = [] # Store tags for the current URL
    try:
        print("\n--- Attempting to scrape Tags section ---")
        tags_section_header = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//div[contains(@class, 'TagGroup_tagGroupContainer__')]")
        ))
        driver.execute_script("arguments[0].scrollIntoView(true);", tags_section_header)
        print("Scrolled Tags section into view.")
        time.sleep(2)

        tag_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'TagGroup_tagGroupContainer__')]//div[contains(@class, 'TagLabel_labelContainer__')]/span[not(contains(@class, 'sr-only')) and not(contains(@class, 'TagLabel_chevronIcon__'))]")
        
        for tag_el in tag_elements:
            tag_text = tag_el.text.strip()
            if tag_text:
                current_tags.append(tag_text)
        print(f"Scraped Tags: {current_tags}")
    except NoSuchElementException:
        print("Could not find Tags section or individual tags.")
    except Exception as e:
        print(f"Error scraping tags for {url}: {e}")
    scraped_data['tags'] = current_tags # Assign current tags to scraped_data


    # 6. Scrape Overall Rating & Reviews
    # Seed every key this section is responsible for *before* touching the page.
    # The aggregation step that follows reads 'rating_breakdown' and 'comments'
    # from scraped_data, so they must exist even when this section aborts early.
    # The dict/list objects are shared by reference, so filling them in below is
    # automatically visible through scraped_data.
    scraped_data['overall_rating_reviews_section'] = "N/A"
    scraped_data['overall_average_rating_reviews_section'] = "N/A"
    scraped_data['num_learners_rated_reviews_section'] = "N/A"
    current_rating_breakdown = {} # Store breakdown for the current URL
    current_comments = [] # Store comments for the current URL
    scraped_data['rating_breakdown'] = current_rating_breakdown
    scraped_data['comments'] = current_comments

    try:
        print("\n--- Attempting to scrape Ratings & Reviews section ---")
        ratings_header_element = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//h2[contains(@class, 'RatingsAndComments_title__')]")
        ))
        driver.execute_script("arguments[0].scrollIntoView(true);", ratings_header_element)
        print("Scrolled Ratings & Reviews section into view.")
        time.sleep(2)

        # --- Overall "X out of 5" summary text ---
        # wait.until() signals a missing element with TimeoutException, not
        # NoSuchElementException, so both are handled here; otherwise a missing
        # summary would abort the breakdown and comment scraping below.
        try:
            overall_rating_text_element = wait.until(EC.presence_of_element_located(
                (By.XPATH, "//div[contains(@class, 'RatingSummary_description__Dthqd')]/span")
            ))
            scraped_data['overall_rating_reviews_section'] = overall_rating_text_element.text.strip()
            rating_match = re.search(r'([\d\.]+) out of 5', scraped_data['overall_rating_reviews_section'])
            if rating_match:
                scraped_data['overall_average_rating_reviews_section'] = rating_match.group(1)
            else:
                scraped_data['overall_average_rating_reviews_section'] = "N/A"
            print(f"Scraped Overall Average Rating: {scraped_data['overall_average_rating_reviews_section']}")

        except (NoSuchElementException, TimeoutException):
            print("Could not find overall 'X out of 5' rating text.")
            scraped_data['overall_rating_reviews_section'] = "N/A"
            scraped_data['overall_average_rating_reviews_section'] = "N/A"

        # --- "Average rating by X learners" text ---
        try:
            num_learners_rated_element = wait.until(EC.presence_of_element_located(
                (By.XPATH, "//div[contains(@class, 'RatingSummary_averageRating__8z9lv')]/span")
            ))
            scraped_data['num_learners_rated_reviews_section'] = num_learners_rated_element.text.strip()
            print(f"Scraped Number of Learners Rated: {scraped_data['num_learners_rated_reviews_section']}")
        except (NoSuchElementException, TimeoutException):
            print("Could not find 'Average rating by X learners' text.")
            scraped_data['num_learners_rated_reviews_section'] = "N/A"

        # --- Per-star rating breakdown ---
        # find_elements returns an empty list when nothing matches, so the loop
        # simply does not run in that case.
        rating_breakdown_bars = driver.find_elements(By.XPATH, "//div[contains(@class, 'RatingBreakDown_container__UuLvg')]")
        for bar in rating_breakdown_bars:
            try:
                star_rating_div = bar.find_element(By.XPATH, ".//div[contains(@class, 'RatingBreakDown_starsAndTitle__EL5nE')]//div[contains(@class, 'Stars_starRating__')]")
                # The star label is read from the element's 'title' attribute.
                star_count = star_rating_div.get_attribute('title')
                review_count_element = bar.find_element(By.XPATH, ".//div[contains(@class, 'RatingBreakDown_starsAndTitle__EL5nE')]/span[not(contains(@class, 'Stars_starRating__'))]")
                review_count = review_count_element.text.strip() if review_count_element.text else "N/A reviews"
                if star_count and review_count:
                    current_rating_breakdown[star_count] = review_count
            except NoSuchElementException:
                # A bar without the expected children is skipped silently.
                continue
            except Exception as e:
                print(f"Error scraping a specific rating bar: {e}. Skipping this bar.")
                continue
        print(f"Scraped Rating Breakdown: {current_rating_breakdown}")

        # --- SCRAPING INDIVIDUAL COMMENTS ---
        print("\n--- Attempting to scrape individual comments ---")

        try:
            comments_section_parent = wait.until(EC.presence_of_element_located(
                (By.XPATH, "//div[@id='YL_ratings_and_comments_component']")
            ))
            # Only used as an existence check: a missing container raises and is
            # reported by the generic handler at the end of this try block.
            comments_list_container = comments_section_parent.find_element(By.XPATH, ".//div[contains(@class, 'Comments_commentsContainer__')]")
            print("Found comments list container.")

            # Keep clicking "Load more comments" until the button stops being
            # clickable within the wait timeout.
            while True:
                try:
                    load_more_button_locator = (By.XPATH, "//button[contains(text(), 'Load more comments')]")
                    load_more_button = wait.until(EC.element_to_be_clickable(load_more_button_locator))
                    driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
                    print("Clicking 'Load more comments'...")
                    load_more_button.click()
                    time.sleep(3)
                except TimeoutException:
                    print("No more 'Load more comments' button found or all comments loaded.")
                    break
                except StaleElementReferenceException:
                    print("Stale 'Load more comments' button, retrying...")
                    time.sleep(1)
                    continue

            comment_elements_locator = (By.XPATH, "//div[contains(@class, 'Comment_commentContainer__caKxU')]")
            all_comments_on_page = wait.until(EC.presence_of_all_elements_located(comment_elements_locator))
            print(f"Found {len(all_comments_on_page)} individual comments for {url}.")

            for k, comment_el in enumerate(all_comments_on_page):
                comment_data = {}
                comment_data['parent_course_url'] = url # Link comment to its parent URL
                comment_data['parent_course_title'] = scraped_data.get('learning_plan_title', 'N/A')
                try:
                    reviewer_img = comment_el.find_element(By.XPATH, ".//div[contains(@class, 'ProfileIcon_container__NIYfG')]//img")
                    # get_attribute returns None when the attribute is absent;
                    # guard it so one avatar without 'alt' cannot abort the
                    # whole comment loop with an AttributeError.
                    reviewer_alt = reviewer_img.get_attribute('alt')
                    comment_data['reviewer_name'] = reviewer_alt.strip() if reviewer_alt is not None else "N/A"
                except NoSuchElementException:
                    comment_data['reviewer_name'] = "N/A"

                try:
                    comment_data['review_date'] = comment_el.find_element(By.XPATH, ".//div[contains(@class, 'Comment_date__ZCDZK')]").text.strip()
                except NoSuchElementException:
                    comment_data['review_date'] = "N/A"

                try:
                    comment_data['comment_title'] = comment_el.find_element(By.XPATH, ".//h3[contains(@class, 'Comment_title__')]").text.strip()
                except NoSuchElementException:
                    comment_data['comment_title'] = "N/A"

                try:
                    comment_data['comment_text'] = comment_el.find_element(By.XPATH, ".//div[contains(@class, 'Comment_content__uYufg')]").text.strip()
                except NoSuchElementException:
                    comment_data['comment_text'] = "N/A"

                try:
                    comment_rating_sr_only = comment_el.find_element(By.XPATH, ".//div[contains(@class, 'Stars_starRating__')]//span[contains(@class, 'sr-only')]").text.strip()
                    rating_match = re.search(r'rated ([\d\.]+) out of 5 stars', comment_rating_sr_only)
                    if rating_match:
                        comment_data['comment_rating'] = rating_match.group(1)
                    else:
                        # Fall back to the raw text when it does not match the
                        # expected "rated X out of 5 stars" wording.
                        comment_data['comment_rating'] = comment_rating_sr_only
                except NoSuchElementException:
                    comment_data['comment_rating'] = "N/A"

                current_comments.append(comment_data)
                print(f"  - Comment {k+1} by {comment_data['reviewer_name']}: Rating='{comment_data['comment_rating']}', Date='{comment_data['review_date']}'...")

        except TimeoutException:
            print("Timeout: Could not find the comments section or individual comment elements.")
        except Exception as e:
            print(f"An unexpected error occurred during individual comment scraping for {url}: {e}")

    except (NoSuchElementException, TimeoutException):
        # Reached when the section header never appears within the wait timeout.
        print("Could not find main Ratings & Reviews section parent.")
    except Exception as e:
        print(f"An unexpected error occurred during overall ratings and reviews section processing for {url}: {e}")
    
    # Append the scraped data for the current URL to the overall lists.
    # Every lookup goes through .get() with a default: some keys are only
    # assigned inside try-blocks of the scraping sections, and a KeyError here
    # would end the run before any of the collected data is written to disk.

    # General Info (one row per learning plan)
    all_general_info_data.append({
        'Course URL': scraped_data.get('course_url'),
        'Learning Plan Title': scraped_data.get('learning_plan_title'),
        'Overall Duration': scraped_data.get('overall_duration'),
        'Total Learners (Numeric)': scraped_data.get('overall_learners_amount_numeric'),
        'Total Learners (Text)': scraped_data.get('overall_learners_amount_text'),
        'Overall Average Rating (Header)': scraped_data.get('overall_average_rating_from_header'),
        'Overall Learners Rated (Header)': scraped_data.get('overall_learners_rated_from_header'),
        'About Learning Plan': scraped_data.get('about_learning_plan'),
        'Total Courses Required Text': scraped_data.get('total_courses_required_text'),
        'Overall Rating (Reviews Section)': scraped_data.get('overall_average_rating_reviews_section'),
        'Number of Learners Rated (Reviews Section)': scraped_data.get('num_learners_rated_reviews_section'),
        'Tags': ", ".join(scraped_data.get('tags') or [])
    })

    # Courses (each course within a plan is a row)
    all_courses_data.extend(scraped_data.get('courses') or [])

    # Comments (each comment is a row)
    all_comments_data.extend(scraped_data.get('comments') or [])

    # Rating Breakdown (each breakdown item is a row, linked to the course URL)
    for stars, count in (scraped_data.get('rating_breakdown') or {}).items():
        all_rating_breakdown_data.append({
            'Course URL': scraped_data.get('course_url'),
            'Learning Plan Title': scraped_data.get('learning_plan_title'),
            'Stars': stars,
            'Review Count': count
        })

    print(f"\n--- Finished scraping for {scraped_data.get('learning_plan_title', url)} ---")


# --- Convert to DataFrame and Save All Collected Data ---
# Each accumulator list holds one dict per output row; every non-empty list is
# written to its own CSV. The browser is closed in a `finally` so a failed write
# (e.g. a locked or read-only file) cannot leave the WebDriver session running.
print("\n--- Consolidating and Saving All Scraped Data ---")

try:
    # 1. Create a single DataFrame for general learning plan info from all URLs
    if all_general_info_data:
        final_general_info_df = pd.DataFrame(all_general_info_data)
        final_general_info_df.to_csv('ds_general_info.csv', index=False)
        print("\nSaved 'ds_general_info.csv'")
    else:
        print("No general info data to save.")

    # 2. Create a single DataFrame for Courses from all URLs
    if all_courses_data:
        final_courses_df = pd.DataFrame(all_courses_data)
        final_courses_df.to_csv('ds_courses.csv', index=False)
        print("Saved 'ds_courses.csv'")
    else:
        print("No course data to save.")

    # 3. Create a single DataFrame for Comments from all URLs
    if all_comments_data:
        final_comments_df = pd.DataFrame(all_comments_data)
        final_comments_df.to_csv('ds_comments.csv', index=False)
        print("Saved 'ds_comments.csv'")
    else:
        print("No comment data to save.")

    # 4. Create a single DataFrame for the per-star Rating Breakdown.
    # These rows are accumulated for every URL during scraping; persist them
    # alongside the other outputs instead of discarding them with the kernel.
    if all_rating_breakdown_data:
        final_rating_breakdown_df = pd.DataFrame(all_rating_breakdown_data)
        final_rating_breakdown_df.to_csv('ds_rating_breakdown.csv', index=False)
        print("Saved 'ds_rating_breakdown.csv'")
    else:
        print("No rating breakdown data to save.")
finally:
    # Keep the browser open for a few seconds, then close
    time.sleep(10)
    driver.quit()

Navigating to initial IBM SkillsBuild login page...
Attempting to handle cookie consent banner...
Clicked 'Accept all' on cookie banner.
Attempting to click 'Log in with Email' button...
Clicked 'Log in with Email' button.


Please enter your email:  [REDACTED]
Please enter your password:  [REDACTED]


Attempting to find and fill email field...
Email entered.
Attempting to find and fill password field...
Password entered.
Attempting to find and click submit button...
Submit button clicked.
Login process complete. Proceeding to target pages.

--- Scraping Course 1 of 4: https://skills.yourlearning.ibm.com/activity/PLAN-14F2691E3A32?ngo-id=0302 ---
Attempting to handle cookie consent banner...
Clicked 'Accept all' on cookie banner.
Activity page loaded successfully.
Scraped Title: Getting Started with Data (Earn a credential!)
Scraped Overall Duration: About 3 hours
Scraped Overall Learners: 5860 learners have completed this activity in the past 12 months. (Numeric: 5860)
Scraped Overall Rating (Header): Average rating of 4.5 stars by 176 learners in the past 12 months.
Scraped 'About' description: Are you curious to discover the type of insights we can extract from data? Organizations use data to...

--- Attempting to scrape Courses section ---
Waiting for courses section parent 'lear

In [13]:
import pandas as pd
import re # Import the regular expression module

print("--- Starting data cleaning process ---")


def extract_numeric_duration(duration_str):
    """Convert a free-text duration such as "About 3 hours" into hours.

    Args:
        duration_str: Raw value from the 'Overall Duration' column; may be a
            string, a number, or a missing value (NaN/None).

    Returns:
        The duration in hours as a float, or None when the value is missing or
        contains no number.

    Numbers followed by an hour unit (hour/hours/hr/hrs) or a minute unit
    (minute/minutes/min/mins) are converted to hours and summed, so
    "1 hour 30 minutes" gives 1.5 and "About 45 minutes" gives 0.75. Decimal
    values are kept ("1.5 hours" gives 1.5). When no recognised unit is present,
    the first number in the text is returned unchanged and treated as hours.
    """
    if pd.isna(duration_str):
        return None
    text = str(duration_str)

    unit_matches = re.findall(
        r'(\d+(?:\.\d+)?)\s*(hours?|hrs?|minutes?|mins?)\b',
        text,
        flags=re.IGNORECASE,
    )
    if unit_matches:
        total_hours = 0.0
        for amount, unit in unit_matches:
            value = float(amount)
            if unit.lower().startswith('min'):
                value /= 60  # minutes -> hours
            total_hours += value
        return total_hours

    # No recognised unit: fall back to the first number found, as hours.
    bare_number = re.search(r'\d+(?:\.\d+)?', text)
    return float(bare_number.group(0)) if bare_number else None


# --- 1. Clean ds_general_info.csv ---
try:
    df_general_info = pd.read_csv('ds_general_info.csv')
    print("\nCleaning 'ds_general_info.csv'...")

    # Overall duration – derive a numeric column expressed in hours
    # (e.g., 10.0 from "About 10 hours").
    df_general_info['Overall Duration (in hours)'] = df_general_info['Overall Duration'].apply(extract_numeric_duration)
    print("  - Applied 'Overall Duration' cleaning (extracted numeric part).")

    # Drop the redundant text/duplicate columns. The raw 'Overall Duration'
    # goes too, since 'Overall Duration (in hours)' now replaces it.
    # errors='ignore' keeps this step re-runnable when a column is already gone.
    columns_to_delete_general = [
        'Total Learners (Text)',
        'Overall Rating (Reviews Section)',
        'Number of Learners Rated (Reviews Section)',
        'Total Courses Required Text',
        'Overall Duration'
    ]
    df_general_info = df_general_info.drop(columns=columns_to_delete_general, errors='ignore')
    print(f"  - Deleted columns from general info: {columns_to_delete_general}")

    # Save cleaned general info
    df_general_info.to_csv('ds_general_info_cleaned.csv', index=False)
    print("  - Saved 'ds_general_info_cleaned.csv'")

except FileNotFoundError:
    print("Error: 'ds_general_info.csv' not found. Skipping cleaning for this file.")
except Exception as e:
    print(f"An error occurred while cleaning 'ds_general_info.csv': {e}")


# --- 2. Clean ds_courses.csv ---
# Reads the scraped course rows, removes the raw-text helper columns and writes
# the result to a separate "_cleaned" file; the input CSV is left untouched.
try:
    df_courses = pd.read_csv('ds_courses.csv')
    print("\nCleaning 'ds_courses.csv'...")

    # Columns to remove; any that are absent are skipped via errors='ignore'.
    columns_to_delete_courses = ['Learners_text', 'rating_text']
    df_courses = df_courses.drop(
        columns=columns_to_delete_courses,
        errors='ignore',
    )
    print(f"  - Deleted columns from courses: {columns_to_delete_courses}")

    # Persist the trimmed frame without the index column.
    df_courses.to_csv('ds_courses_cleaned.csv', index=False)
    print("  - Saved 'ds_courses_cleaned.csv'")

except FileNotFoundError:
    print("Error: 'ds_courses.csv' not found. Skipping cleaning for this file.")
except Exception as e:
    print(f"An error occurred while cleaning 'ds_courses.csv': {e}")


# --- 3. Clean ds_comments.csv ---
# Reads the scraped comment rows, removes the comment title column and writes
# the result to a separate "_cleaned" file; the input CSV is left untouched.
try:
    df_comments = pd.read_csv('ds_comments.csv')
    print("\nCleaning 'ds_comments.csv'...")

    # Column to remove; skipped via errors='ignore' if it is absent.
    columns_to_delete_comments = ['comment_title']
    df_comments = df_comments.drop(
        columns=columns_to_delete_comments,
        errors='ignore',
    )
    print(f"  - Deleted columns from comments: {columns_to_delete_comments}")

    # Persist the trimmed frame without the index column.
    df_comments.to_csv('ds_comments_cleaned.csv', index=False)
    print("  - Saved 'ds_comments_cleaned.csv'")

except FileNotFoundError:
    print("Error: 'ds_comments.csv' not found. Skipping cleaning for this file.")
except Exception as e:
    print(f"An error occurred while cleaning 'ds_comments.csv': {e}")

print("\n--- Data cleaning process complete ---")

--- Starting data cleaning process ---

Cleaning 'ds_general_info.csv'...
  - Applied 'Overall Duration' cleaning (extracted numeric part).
  - Deleted columns from general info: ['Total Learners (Text)', 'Overall Rating (Reviews Section)', 'Number of Learners Rated (Reviews Section)', 'Total Courses Required Text', 'Overall Duration']
  - Saved 'ds_general_info_cleaned.csv'

Cleaning 'ds_courses.csv'...
  - Deleted columns from courses: ['Learners_text', 'rating_text']
  - Saved 'ds_courses_cleaned.csv'

Cleaning 'ds_comments.csv'...
  - Deleted columns from comments: ['comment_title']
  - Saved 'ds_comments_cleaned.csv'

--- Data cleaning process complete ---
