In [None]:
"""
StudyBuddy Hub Resource Scraping Notebook
Author: Sarah Kayembe
Course: COS 557 Database Systems (Phase II)
Date: November 10, 2025

Project Purpose
This notebook demonstrates the process of scraping publicly available educational resources for integration into the StudyBuddy Hub system.
The data collected will support the Course Resources module, allowing students and instructors to:
- Access verified learning materials (articles, videos, exercises).
- Contribute their own study resources.
- Support gamified learning and course management features.

Technologies Used
- Python (Requests, BeautifulSoup, Pandas)
- Khan Academy Open API / GraphQL endpoints
- CSV output for MySQL data import
"""

In [20]:
# Scrapes the GeeksforGeeks "Data Structures" page to extract
# topic titles, short descriptions, and resource URLs. The data is structured
# into a CSV file for integration into the resources table.

import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
import os

def scrape_introduction_to_ds(url: str) -> pd.DataFrame:
    """
    Scrape topic links and their descriptions from a GeeksforGeeks overview page.

    A topic is an <a rel="noopener"> element that has an href and contains a
    <b> tag. Its description is the first <p dir="ltr"> among the siblings that
    follow the link (or the link's parent, when that parent is an <h2>); the
    search stops at the next <h2>. Topics without a name or description are
    skipped.

    Args:
        url: Page to fetch. Relative hrefs are resolved against it.

    Returns:
        DataFrame with the columns CourseName, TopicName, ContentSnippet,
        ResourceURL and ResourceType. It is empty (but still carries those
        columns) when the request fails or nothing matches.
    """
    # Local import: this cell's import block does not bring in urljoin.
    from urllib.parse import urljoin

    print(f"-> Starting precise scrape on: {url}")

    columns = ['CourseName', 'TopicName', 'ContentSnippet', 'ResourceURL', 'ResourceType']
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    scraped_topics: List[Dict[str, str]] = []

    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f" ERROR: Failed to fetch {url}. Reason: {e}")
        return pd.DataFrame(scraped_topics, columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    # href=True drops anchors without a target: a row with no URL is useless
    # as a resource.
    for topic_link in soup.find_all('a', rel="noopener", href=True):
        # Only links that contain bold text belong to the core topic structure.
        if not topic_link.find('b'):
            continue

        topic_name = topic_link.get_text(strip=True)
        # Resolve relative hrefs so every stored URL is absolute.
        resource_url = urljoin(url, topic_link['href'])

        # The description follows the link's <h2> container when there is one,
        # otherwise the link itself.
        parent = topic_link.parent
        start_element = parent if parent is not None and parent.name == 'h2' else topic_link

        content_snippet = ''
        for sibling in start_element.next_siblings:
            sibling_name = getattr(sibling, 'name', None)
            if sibling_name == 'h2':
                # Reached the next major section without finding a description.
                break
            if sibling_name == 'p' and sibling.get('dir') == 'ltr':
                content_snippet = sibling.get_text(strip=True)
                break

        if topic_name and content_snippet:
            scraped_topics.append({
                'CourseName': 'Data Structures',
                'TopicName': topic_name,
                'ContentSnippet': content_snippet,
                'ResourceURL': resource_url,
                'ResourceType': 'LINK'
            })

    return pd.DataFrame(scraped_topics, columns=columns)

# EXECUTION
TARGET_URL = "https://www.geeksforgeeks.org/dsa/introduction-to-data-structures/"

# Scrape the page; an empty frame means nothing usable came back.
df_topics = scrape_introduction_to_ds(TARGET_URL)

if df_topics.empty:
    print("\n No topics were successfully scraped.")
else:
    # The CSV goes to ../data/Raw_data, relative to the working directory.
    output_dir = os.path.join("..", "data", "Raw_data")
    output_filename = "course_resources.csv"
    output_path = os.path.join(output_dir, output_filename)

    # Create the folder on first use, then write the CSV.
    os.makedirs(output_dir, exist_ok=True)
    df_topics.to_csv(output_path, index=False, encoding='utf-8')
    print(f"\n Data saved successfully to {output_path}")

-> Starting precise scrape on: https://www.geeksforgeeks.org/dsa/introduction-to-data-structures/

 Data saved successfully to ../data/Raw_data/course_resources.csv


In [21]:
# Scrapes a GeeksforGeeks article and creates a manual PDF entry for 'Algorithms in Programming',
# appending both to the course resources CSV.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from typing import List, Dict
from pathlib import Path

# Configuration

# Base folder: the script's own directory when __file__ is defined, otherwise
# the current working directory. The project root is its parent.
if "__file__" in locals():
    scrapers_dir = Path(__file__).resolve().parent
else:
    scrapers_dir = Path(os.getcwd())
project_root = scrapers_dir.parent

# The shared CSV lives in <project_root>/data/Raw_data/; create the folder if needed.
CSV_FILENAME = "course_resources.csv"
output_dir = project_root.joinpath("data", "Raw_data")
output_dir.mkdir(parents=True, exist_ok=True)
CSV_PATH = output_dir.joinpath(CSV_FILENAME)

# Sources for this course: one article to scrape and one PDF to list by hand.
COURSE_NAME = 'Algorithms in Programming'
ALGORITHMS_GfG_URL = 'https://www.geeksforgeeks.org/dsa/introduction-to-algorithms/'
ALGORITHMS_PDF_URL = 'https://www.montclair.edu/computer-science-education/wp-content/uploads/sites/253/2024/02/3-5-8.1.5-Algorithms-Programming.pdf'

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

def scrape_gfg_main_article_fixed(url: str) -> Dict[str, str]:
    """
    Build one resource row for a GeeksforGeeks article.

    The title comes from the <h1> inside <div class="article-title">, and the
    snippet from the first <p dir="ltr"> inside <div class="text">. Anything
    that cannot be extracted keeps its placeholder text. Any exception is
    printed rather than raised, so a row is always returned.
    """
    print(f"-> Scraping main article with fixed selectors: {url}")

    # Placeholder row; fields are overwritten as each piece is found.
    topic_data = dict(
        CourseName=COURSE_NAME,
        TopicName='Scrape Failed: Introduction to Algorithms',
        ContentSnippet='Scrape failed. Check selectors or URL.',
        ResourceURL=url,
        ResourceType='LINK',
    )

    try:
        page = requests.get(url, headers=HEADERS, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # Title: <div class="article-title"> -> <h1>
        title_box = soup.find('div', class_='article-title')
        heading = title_box.find('h1') if title_box else None
        if heading:
            topic_data['TopicName'] = heading.get_text(strip=True)

        # Snippet: <div class="text"> -> first <p dir="ltr">
        body = soup.find('div', class_='text')
        if body:
            lead = body.find('p', dir='ltr')
            if not lead:
                topic_data['ContentSnippet'] = "Main description paragraph not found in <div class='text'>."
            else:
                # Keep the text before " Or ", reword the opening and drop quote characters.
                summary = lead.get_text(strip=True).split(" Or ")[0]
                summary = summary.replace("The word Algorithm means", "An algorithm is defined as")
                summary = summary.replace("\"", "").replace("'", "")
                if len(summary) > 250:
                    summary = summary[:250] + "..."
                topic_data['ContentSnippet'] = summary

    except Exception as e:
        print(f" ERROR scraping {url}: {e}")

    return topic_data

def create_pdf_entry(url: str) -> Dict[str, str]:
    """Return a hand-written resource row (ResourceType 'PDF') that points at ``url``."""
    print(f"-> Creating manual entry for PDF: {url}")

    # Nothing is fetched: the title and description are fixed text.
    pdf_entry = {}
    pdf_entry['CourseName'] = COURSE_NAME
    pdf_entry['TopicName'] = 'Algorithms and Programming Concepts (PDF)'
    pdf_entry['ContentSnippet'] = 'A comprehensive, multi-page PDF document covering core concepts in algorithms, programming, and computer science.'
    pdf_entry['ResourceURL'] = url
    pdf_entry['ResourceType'] = 'PDF'
    return pdf_entry

def append_to_csv(new_data: List[Dict[str, str]], filename: str):
    """
    Append rows to a CSV file, creating the file when it does not exist.

    The file is read in full, the new rows are concatenated after the existing
    ones, and the result is written back over the same path. Existing data is
    reindexed to the columns of the new rows before concatenation.

    Args:
        new_data: Rows to append, one dict per row. When empty, nothing is
            read or written.
        filename: Path of the CSV file (a str or path-like object).
    """
    # Guard: an empty batch yields a frame with no columns, and reindexing the
    # existing data against it would rewrite the file with every column gone.
    if not new_data:
        print("\n No new data scraped. File not updated.")
        return

    df_new = pd.DataFrame(new_data)

    # Read existing data, or start from an empty frame with the same columns.
    if os.path.exists(filename):
        df_existing = pd.read_csv(filename)
        print(f" Found existing file '{filename}' with {len(df_existing)} entries.")
    else:
        df_existing = pd.DataFrame(columns=df_new.columns)
        print(f" Creating new file '{filename}'.")

    # Align the existing data to the new rows' columns before concatenating.
    df_existing = df_existing.reindex(columns=df_new.columns, fill_value='')

    df_combined = pd.concat([df_existing, df_new], ignore_index=True)

    # Save the combined data back to the original file.
    df_combined.to_csv(filename, index=False, encoding='utf-8')

    print("\n" + "="*70)
    print(f" Data successfully appended to '{filename}'.")
    print(f"Total entries now: {len(df_combined)}")
    print("="*70)


# MAIN EXECUTION
if __name__ == "__main__":
    # Build the two rows for this course: the scraped article first, then the
    # hand-written PDF entry.
    gfg_entry = scrape_gfg_main_article_fixed(ALGORITHMS_GfG_URL)
    pdf_entry = create_pdf_entry(ALGORITHMS_PDF_URL)
    new_entries = [gfg_entry, pdf_entry]

    # Add both rows to the shared CSV.
    append_to_csv(new_entries, CSV_PATH)

-> Scraping main article with fixed selectors: https://www.geeksforgeeks.org/dsa/introduction-to-algorithms/
-> Creating manual entry for PDF: https://www.montclair.edu/computer-science-education/wp-content/uploads/sites/253/2024/02/3-5-8.1.5-Algorithms-Programming.pdf
 Found existing file '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv' with 10 entries.

 Data successfully appended to '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv'.
Total entries now: 12


In [22]:
# Scrapes the GeeksforGeeks 'Web Technology' article, extracting the main topic
# and sub-topics (Frontend/Backend) into separate entries, and appends the data to the course resources CSV file.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from typing import List, Dict

# Configuration

# Path is imported here so this cell runs on its own, without relying on an
# earlier cell having imported it.
from pathlib import Path

# Base folder: the script's own directory when __file__ is defined, otherwise
# the current working directory. The project root is its parent.
scrapers_dir = Path(__file__).resolve().parent if "__file__" in locals() else Path(os.getcwd())
project_root = scrapers_dir.parent

# Build output directory path inside data/Raw_data/
output_dir = project_root / "data" / "Raw_data"
output_dir.mkdir(parents=True, exist_ok=True)

# Define full CSV save path
CSV_FILENAME = "course_resources.csv"
CSV_PATH = output_dir / CSV_FILENAME

# Source URL
WEB_TECH_URL = 'https://www.geeksforgeeks.org/web-tech/web-technology/'
# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
COURSE_NAME = 'Web Applications Development'

def scrape_web_tech_article(url: str) -> List[Dict[str, str]]:
    """
    Scrape the GeeksforGeeks Web Technology article into resource rows.

    Produces one row for the article itself, then one row each for the
    Backend Development and Frontend Development sections, in that order,
    when their <h2> headings (ids ``backend-development`` and
    ``frontend-development``) are present. Snippets longer than 250
    characters are cut and marked with "...".

    Args:
        url: Article URL. Section rows link to it with a ``#<id>`` fragment.

    Returns:
        List of row dicts with the keys CourseName, TopicName, ContentSnippet,
        ResourceURL and ResourceType. The list is empty when the page cannot
        be fetched or an unexpected error occurs; the error is printed.
    """
    def _shorten(text: str) -> str:
        """Cap text at 250 characters, marking a cut with an ellipsis."""
        return text[:250].strip() + "..." if len(text) > 250 else text

    # (heading id, snippet used when no <p> follows the heading)
    sections = [
        ('backend-development', "Description not found for Backend Development."),
        ('frontend-development', "Description not found for Frontend Development."),
    ]

    new_entries = []

    print(f"-> Starting scrape for Web Applications Development: {url}")

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Main title: prefer <h1 class="main-heading">, then the <h1> inside
        # <div class="article-title">, else keep the default.
        main_title = "Web Technology | Complete Overview"
        title_element = soup.find('h1', class_='main-heading')
        if title_element is None:
            title_box = soup.find('div', class_='article-title')
            # The container may be absent; calling .find on None would raise
            # and abort the whole scrape instead of using the default title.
            title_element = title_box.find('h1') if title_box is not None else None
        if title_element is not None:
            main_title = title_element.get_text(strip=True)

        # Main description: first <p dir="ltr"> in <div class="text">.
        main_snippet = "Definition of Web Technology not clearly found."
        content_container = soup.find('div', class_='text')
        if content_container is not None:
            first_paragraph = content_container.find('p', dir='ltr')
            if first_paragraph is not None:
                main_snippet = _shorten(first_paragraph.get_text(strip=True))

        new_entries.append({
            'CourseName': COURSE_NAME,
            'TopicName': main_title,
            'ContentSnippet': main_snippet,
            'ResourceURL': url,
            'ResourceType': 'LINK'
        })

        # Sub-topic rows: heading text (colons removed) plus the next <p> sibling.
        found_section = False
        for section_id, missing_text in sections:
            heading = soup.find('h2', id=section_id)
            if heading is None:
                continue
            found_section = True

            paragraph = heading.find_next_sibling('p')
            snippet = paragraph.get_text(strip=True) if paragraph is not None else missing_text

            new_entries.append({
                'CourseName': COURSE_NAME,
                'TopicName': heading.get_text(strip=True).replace(':', ''),
                'ContentSnippet': _shorten(snippet),
                'ResourceURL': url + "#" + section_id,
                'ResourceType': 'LINK'
            })

        if not found_section:
            print(" WARNING: Neither Backend nor Frontend Development sections were found.")

    except requests.exceptions.RequestException as e:
        print(f" ERROR: Failed to fetch {url}. Reason: {e}")
    except Exception as e:
        print(f" ERROR: An unexpected error occurred: {e}")

    return new_entries

def append_to_csv(new_data: List[Dict[str, str]], filename: str):
    """
    Add rows to the CSV at ``filename``, creating the file if it is missing.

    Both the stored rows and the new rows are aligned to a fixed five-column
    layout before being concatenated and written back. An empty ``new_data``
    leaves the file untouched.
    """
    if not new_data:
        print("\n No new data scraped. File not updated.")
        return

    # Fixed column layout used for everything written by this function.
    columns = ['CourseName', 'TopicName', 'ContentSnippet', 'ResourceURL', 'ResourceType']
    incoming = pd.DataFrame(new_data).reindex(columns=columns)

    if not os.path.exists(filename):
        stored = pd.DataFrame(columns=columns)
        print(f" Creating new file '{filename}'.")
    else:
        stored = pd.read_csv(filename)
        print(f" Found existing file '{filename}' with {len(stored)} entries.")

    # Columns the stored data lacks are filled with '' rather than NaN.
    stored = stored.reindex(columns=columns, fill_value='')

    merged = pd.concat([stored, incoming], ignore_index=True)
    merged.to_csv(filename, index=False, encoding='utf-8')

    rule = "=" * 70
    print("\n" + rule)
    print(f" Data successfully appended to '{filename}'.")
    print(f"Total entries now: {len(merged)}")
    print(rule)

# MAIN EXECUTION
if __name__ == "__main__":
    # Scrape the article, then hand its rows to the CSV writer.
    web_tech_entries = scrape_web_tech_article(WEB_TECH_URL)
    append_to_csv(new_data=web_tech_entries, filename=CSV_PATH)

-> Starting scrape for Web Applications Development: https://www.geeksforgeeks.org/web-tech/web-technology/
 Found existing file '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv' with 12 entries.

 Data successfully appended to '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv'.
Total entries now: 15


In [23]:
# Scrapes the W3Schools Python tutorial sidebar to find all course links,
# then scrapes each page for its title and first-sentence description, appending all data to the course resources CSV.

import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import os
import re
from typing import List, Dict

# Configuration

# Path is imported here so this cell runs on its own, without relying on an
# earlier cell having imported it.
from pathlib import Path

# Base folder: the script's own directory when __file__ is defined, otherwise
# the current working directory. The project root is its parent.
scrapers_dir = Path(__file__).resolve().parent if "__file__" in locals() else Path(os.getcwd())
project_root = scrapers_dir.parent

# Build output directory path inside data/Raw_data/
output_dir = project_root / "data" / "Raw_data"
output_dir.mkdir(parents=True, exist_ok=True)

# Define full CSV save path
CSV_FILENAME = "course_resources.csv"
CSV_PATH = output_dir / CSV_FILENAME

# Source URLs
START_URL = "https://www.w3schools.com/python/python_intro.asp"
# Sidebar hrefs are joined onto this base to form absolute URLs.
BASE_URL_PATH = "https://www.w3schools.com/python/"
# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
COURSE_NAME = "Python Programming"
# Column layout this cell writes to the CSV.
REQUIRED_COLUMNS = ['CourseName', 'TopicName', 'ResourceURL', 'ContentSnippet', 'ResourceType']

# Core Functions
def get_all_course_links(start_url: str) -> List[str]:
    """
    Return the sorted, de-duplicated tutorial URLs linked from the sidebar.

    The sidebar is the element with id ``leftmenuinnerinner``. Only hrefs that
    end in ``.asp`` are kept, each joined onto BASE_URL_PATH. A failed request
    or a missing sidebar is printed and yields an empty list.
    """
    print(f" Fetching all course links from: {start_url}")
    link_urls = set()

    try:
        page = requests.get(start_url, headers=HEADERS, timeout=10)
        page.raise_for_status()
        sidebar = BeautifulSoup(page.content, 'html.parser').find(id='leftmenuinnerinner')

        if not sidebar:
            print(" ERROR: Could not find the main navigation container.")
        else:
            # A set collapses links that appear more than once in the menu.
            link_urls.update(
                urljoin(BASE_URL_PATH, anchor['href'])
                for anchor in sidebar.find_all('a', href=True)
                if anchor['href'].endswith('.asp')
            )

    except requests.exceptions.RequestException as e:
        print(f" ERROR: Failed to fetch links from {start_url}. Reason: {e}")

    sorted_urls = sorted(link_urls)
    print(f" Found {len(sorted_urls)} unique tutorial pages.")
    return sorted_urls

def scrape_page_details(url: str) -> Dict[str, str]:
    """
    Fetch one tutorial page and build its resource row.

    TopicName is the text of the first <h1> inside <div id="main">, cut before
    the first "Example" and then before the first "Tutorial". ContentSnippet
    is the first sentence of the first <p> in that div, capped at 250
    characters plus "...". Fields that cannot be found keep their
    "... Not Found" placeholders.

    Args:
        url: Page to fetch; stored as ResourceURL.

    Returns:
        Row dict with the keys CourseName, TopicName, ResourceURL,
        ContentSnippet and ResourceType. Nothing is raised: on a request
        failure TopicName is "ERROR: Fetch Failed", on any other exception
        "ERROR: Scrape Failed", and ContentSnippet holds the error text.
    """
    data = {
        'CourseName': COURSE_NAME,
        'TopicName': "Title Not Found",
        'ResourceURL': url,
        'ContentSnippet': "Content Not Found",
        'ResourceType': 'LINK'
    }

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look the container up once. A page without it keeps the placeholders
        # instead of failing on None and being reported as a scrape error.
        main_content_div = soup.find('div', id='main')
        if main_content_div is not None:
            # Extract Topic Name (Title)
            main_h1 = main_content_div.find('h1')
            if main_h1 is not None:
                data['TopicName'] = main_h1.get_text(strip=True).split('Example')[0].split('Tutorial')[0].strip()

            # Extract Description Snippet (first sentence of the first <p>)
            first_p = main_content_div.find('p')
            raw_text = first_p.get_text(strip=True) if first_p is not None else ""

            if raw_text:
                # A sentence ends at the first ., ! or ? that is followed by
                # whitespace; with no such point the whole paragraph is used.
                sentence_end = re.search(r'[.!?](?=\s)', raw_text)
                snippet = raw_text[:sentence_end.end()] if sentence_end else raw_text
                snippet = snippet.strip()

                data['ContentSnippet'] = snippet[:250] + "..." if len(snippet) > 250 else snippet

    except requests.exceptions.RequestException as e:
        data['TopicName'] = "ERROR: Fetch Failed"
        data['ContentSnippet'] = str(e)
    except Exception as e:
        data['TopicName'] = "ERROR: Scrape Failed"
        data['ContentSnippet'] = str(e)

    return data

def append_to_csv(new_data: List[Dict[str, str]], filename: str):
    """
    Add rows to the CSV at ``filename``, creating the file if it is missing.

    Stored and new rows are both aligned to REQUIRED_COLUMNS, concatenated and
    written back over the same path. An empty ``new_data`` leaves the file
    untouched. If the existing file cannot be read, a warning is printed and
    the file is rebuilt from the new rows only.
    """
    if not new_data:
        print("\n No new data scraped. File not updated.")
        return

    incoming = pd.DataFrame(new_data)

    def _blank() -> pd.DataFrame:
        """Empty frame carrying the required columns."""
        return pd.DataFrame(columns=REQUIRED_COLUMNS)

    if not os.path.exists(filename):
        stored = _blank()
        print(f" Creating new file '{filename}'.")
    else:
        try:
            stored = pd.read_csv(filename)
        except Exception as e:
            print(f" WARNING: Failed to read existing CSV (Error: {e}). Starting a new file.")
            stored = _blank()
        print(f" Found existing file '{filename}' with {len(stored)} entries.")

    # Align both sides to the same columns; gaps in stored data become ''.
    frames = [
        stored.reindex(columns=REQUIRED_COLUMNS, fill_value=''),
        incoming.reindex(columns=REQUIRED_COLUMNS),
    ]
    df_combined = pd.concat(frames, ignore_index=True)
    df_combined.to_csv(filename, index=False, encoding='utf-8')

    rule = "=" * 80
    print("\n" + rule)
    print(f" Scraping complete! Data successfully appended to '{filename}'.")
    print(f"Total entries now: {len(df_combined)}")
    print(rule)

# MAIN EXECUTION
if __name__ == "__main__":
    # 1. Discover the tutorial pages, then scrape them one at a time.
    all_urls = get_all_course_links(START_URL)
    scraped_data = []

    if all_urls:
        print(f"\n Starting scrape of {len(all_urls)} pages...")
        for url in all_urls:
            scraped_data.append(scrape_page_details(url))
            time.sleep(0.1)  # short pause between requests

    # 2. Add everything that was scraped to the shared CSV.
    append_to_csv(scraped_data, CSV_PATH)

 Fetching all course links from: https://www.w3schools.com/python/python_intro.asp
 Found 232 unique tutorial pages.

 Starting scrape of 232 pages...
 Found existing file '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv' with 15 entries.

 Scraping complete! Data successfully appended to '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv'.
Total entries now: 247


In [24]:
# Scrapes the W3Schools Java tutorial sidebar to find all course links, then scrapes
# each page for its title and first sentence description, appending all data to the course resources CSV.

import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import os
import re
from typing import List, Dict

# Configuration

# Path is imported here so this cell runs on its own, without relying on an
# earlier cell having imported it.
from pathlib import Path

# Base folder: the script's own directory when __file__ is defined, otherwise
# the current working directory. The project root is its parent.
scrapers_dir = Path(__file__).resolve().parent if "__file__" in locals() else Path(os.getcwd())
project_root = scrapers_dir.parent

# Build output directory path inside data/Raw_data/
output_dir = project_root / "data" / "Raw_data"
output_dir.mkdir(parents=True, exist_ok=True)

# Define full CSV save path
CSV_FILENAME = "course_resources.csv"
CSV_PATH = output_dir / CSV_FILENAME

# Source URLs
START_URL = "https://www.w3schools.com/java/default.asp"
# Sidebar hrefs are joined onto this base to form absolute URLs.
BASE_URL_PATH = "https://www.w3schools.com/java/"
# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
COURSE_NAME = "Structured Problem Solving"
# Column layout this cell writes to the CSV.
REQUIRED_COLUMNS = ['CourseName', 'TopicName', 'ResourceURL', 'ContentSnippet', 'ResourceType']

# Core Functions (Same reliable logic as before)
def get_all_course_links(start_url: str) -> List[str]:
    """
    Collect the tutorial page URLs linked from the sidebar of ``start_url``.

    Only hrefs ending in ``.asp`` inside the element with id
    ``leftmenuinnerinner`` are kept; each is joined onto BASE_URL_PATH.
    Duplicates are removed and the result is sorted. A failed request or a
    missing sidebar is printed and yields an empty list.
    """
    print(f" Fetching all course links from: {start_url}")
    link_urls = set()

    try:
        response = requests.get(start_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Sidebar navigation container.
        nav_container = soup.find(id='leftmenuinnerinner')

        if nav_container:
            hrefs = [anchor['href'] for anchor in nav_container.find_all('a', href=True)]
            for href in hrefs:
                # Skip anything that is not an .asp page.
                if not href.endswith('.asp'):
                    continue
                link_urls.add(urljoin(BASE_URL_PATH, href))
        else:
            print(" ERROR: Could not find the main navigation container.")

    except requests.exceptions.RequestException as e:
        print(f" ERROR: Failed to fetch links from {start_url}. Reason: {e}")

    sorted_urls = list(link_urls)
    sorted_urls.sort()
    print(f" Found {len(sorted_urls)} unique tutorial pages.")
    return sorted_urls

def scrape_page_details(url: str) -> Dict[str, str]:
    """
    Fetch one tutorial page and build its resource row.

    TopicName is the text of the first <h1> inside <div id="main">, cut before
    the first "Example" and then before the first "Tutorial". ContentSnippet
    is the text of the first <p> in that div up to and including its first
    ., ! or ?, capped at 250 characters. Fields that cannot be found keep
    their "... Not Found" placeholders.

    Args:
        url: Page to fetch; stored as ResourceURL.

    Returns:
        Row dict with the keys CourseName, TopicName, ResourceURL,
        ContentSnippet and ResourceType. On a request failure TopicName is
        "ERROR: Fetch Failed" and ContentSnippet holds the error text.
    """
    data = {
        'CourseName': COURSE_NAME,
        'TopicName': "Title Not Found",
        'ResourceURL': url,
        'ContentSnippet': "Content Not Found",
        'ResourceType': 'LINK'
    }

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look the container up once. Without this guard a page lacking
        # <div id="main"> raises AttributeError, which nothing here catches,
        # so one odd page would stop the whole scraping loop.
        main_content_div = soup.find('div', id='main')
        if main_content_div is not None:
            # Extract Topic Name (Title)
            main_h1 = main_content_div.find('h1')
            if main_h1 is not None:
                data['TopicName'] = main_h1.get_text(strip=True).split('Example')[0].split('Tutorial')[0].strip()

            # Extract Description Snippet
            first_p = main_content_div.find('p')
            raw_text = first_p.get_text(strip=True) if first_p is not None else ""

            if raw_text:
                # Anchor at the start of the text and let '.' cross newlines,
                # so a paragraph spanning several lines keeps its first line.
                # NOTE(review): the cut is at the first ., ! or ? even when no
                # whitespace follows, so text like "1.8" is split at the dot
                # -- confirm this is intended.
                first_sentence = re.match(r'.*?[.!?]', raw_text, re.DOTALL)
                snippet = first_sentence.group(0).strip() if first_sentence else raw_text.strip()

                data['ContentSnippet'] = snippet[:250]

    except requests.exceptions.RequestException as e:
        data['TopicName'] = "ERROR: Fetch Failed"
        data['ContentSnippet'] = f"Request Error: {e}"

    return data

def append_to_csv(new_data: List[Dict[str, str]], filename: str):
    """
    Append rows to a CSV file, creating the file when it does not exist.

    Existing and new rows are both aligned to REQUIRED_COLUMNS, concatenated
    and written back over the same path.

    Args:
        new_data: Rows to append, one dict per row. When empty, nothing is
            read or written.
        filename: Path of the CSV file (a str or path-like object).

    If the existing file cannot be read, a warning is printed and the file is
    rebuilt from the new rows only.
    """
    if not new_data:
        print("\n No new data scraped. File not updated.")
        return

    df_new = pd.DataFrame(new_data)

    # Read existing data or create a new DataFrame
    if os.path.exists(filename):
        try:
            df_existing = pd.read_csv(filename, header=0, encoding='utf-8')
        except Exception as e:
            # Report the failure instead of hiding it: the unreadable file is
            # about to be replaced by the new rows.
            print(f" WARNING: Failed to read existing CSV (Error: {e}). Starting a new file.")
            df_existing = pd.DataFrame(columns=REQUIRED_COLUMNS)
        else:
            # Only claim the file was found when it was actually read.
            print(f" Found existing file '{filename}' with {len(df_existing)} entries.")
    else:
        df_existing = pd.DataFrame(columns=REQUIRED_COLUMNS)
        print(f" Creating new file '{filename}'.")

    # Ensure consistent columns before concatenating
    df_existing = df_existing.reindex(columns=REQUIRED_COLUMNS, fill_value='')
    df_new = df_new.reindex(columns=REQUIRED_COLUMNS, fill_value='')

    # Append new data
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)

    # Save the combined data back to the original file
    df_combined.to_csv(filename, index=False, encoding='utf-8')

    print("\n" + "="*80)
    print(f" Scraping complete! Java data successfully appended to '{filename}'.")
    print(f"Total entries now: {len(df_combined)}")
    print("="*80)

# MAIN EXECUTION
if __name__ == "__main__":
    # Discover the tutorial pages, then scrape them one at a time.
    all_urls = get_all_course_links(START_URL)
    scraped_data = []

    if all_urls:
        total = len(all_urls)
        print(f"\n Starting scrape of {total} pages...")
        for index, url in enumerate(all_urls):
            # Report progress on every 20th page, starting with the first.
            if not index % 20:
                print(f"   -> Progress: {index}/{total}")
            scraped_data.append(scrape_page_details(url))
            time.sleep(0.05)  # short pause between requests

    # Add everything that was scraped to the shared CSV.
    append_to_csv(scraped_data, CSV_PATH)

 Fetching all course links from: https://www.w3schools.com/java/default.asp
 Found 433 unique tutorial pages.

 Starting scrape of 433 pages...
   -> Progress: 0/433
   -> Progress: 20/433
   -> Progress: 40/433
   -> Progress: 60/433
   -> Progress: 80/433
   -> Progress: 100/433
   -> Progress: 120/433
   -> Progress: 140/433
   -> Progress: 160/433
   -> Progress: 180/433
   -> Progress: 200/433
   -> Progress: 220/433
   -> Progress: 240/433
   -> Progress: 260/433
   -> Progress: 280/433
   -> Progress: 300/433
   -> Progress: 320/433
   -> Progress: 340/433
   -> Progress: 360/433
   -> Progress: 380/433
   -> Progress: 400/433
   -> Progress: 420/433
 Found existing file '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv' with 247 entries.

 Scraping complete! Java data successfully appended to '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv'.
Total entries now: 680


In [25]:
# Scrapes three diverse sources (Maine Legislature, APUS, edX) related to 'Criminal Law'
# to extract the title and a brief content snippet, appending the data to the course resources CSV.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import os
import re
from typing import List, Dict

# Configuration

# Path is used below but is missing from this cell's import list; importing it
# here keeps the cell runnable on a fresh kernel instead of relying on a name
# leaked from an earlier cell.
from pathlib import Path

# Folder containing this scraper: the script's own directory when run as a file,
# otherwise the current working directory (__file__ is not defined in a notebook).
scrapers_dir = Path(__file__).resolve().parent if "__file__" in locals() else Path(os.getcwd())
project_root = scrapers_dir.parent

# Build output directory path inside data/Raw_data/
output_dir = project_root / "data" / "Raw_data"
output_dir.mkdir(parents=True, exist_ok=True)

# Define full CSV save path
CSV_FILENAME = "course_resources.csv"
CSV_PATH = output_dir / CSV_FILENAME

# Course label written to the CourseName column of every row from this cell
COURSE_NAME = "Criminal Law"
# Browser-like User-Agent sent with every HTTP request
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# Column layout of course_resources.csv
REQUIRED_COLUMNS = ['CourseName', 'TopicName', 'ResourceURL', 'ContentSnippet', 'ResourceType']

# Source URLs: the three Criminal Law pages scraped by this cell
CRIMINAL_LAW_URLS = [
    "https://legislature.maine.gov/statutes/17-a/title17-ach0sec0.html",
    "https://www.apu.apus.edu/area-of-study/security-and-global-studies/resources/what-is-criminal-law-and-why-does-it-matter/",
    "https://www.edx.org/learn/criminal-law"
]

# Core Functions
def _normalized_text(element) -> str:
    """Return an element's text with every run of whitespace collapsed to one space."""
    # get_text(strip=True) strips each text node and joins them with "", which
    # glues words across inline tags ("What is <a>law</a> and" -> "What islawand").
    # Taking the raw text and collapsing whitespace keeps the page's own spacing.
    return " ".join(element.get_text().split())


def _first_sentence(text: str) -> str:
    """Return the first sentence of `text`, or all of `text` if no sentence end is found."""
    # A terminator only ends the sentence when followed by whitespace or the end
    # of the text (optionally after a closing quote/bracket), so "3.5" or
    # "17-A.1" does not cut the snippet short.
    match = re.match(r'.+?[.!?]["\')\]]*(?=\s|$)', text, re.DOTALL)
    return match.group(0) if match else text


def scrape_page_details(url: str) -> Dict[str, str]:
    """
    Visits a single URL and extracts the page title and the first full sentence
    of its content, using generic selectors that work across diverse sources.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the keys CourseName, TopicName, ResourceURL, ContentSnippet
        and ResourceType. The function never raises: on a failed request or a
        parsing error, TopicName and ContentSnippet carry an error marker instead.
    """
    data = {
        'CourseName': COURSE_NAME,
        'TopicName': "Title Not Found",
        'ResourceURL': url,
        'ContentSnippet': "Content Not Found",
        'ResourceType': 'LINK'
    }

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract Topic Name (Title): first <h1>/<h2> with text, else the <title> tag
        title_element = soup.find(['h1', 'h2'])
        title_text = _normalized_text(title_element) if title_element else ""
        if title_text:
            data['TopicName'] = title_text
        elif soup.title:
            # Keep only the part before the first "|" (text after it is dropped)
            data['TopicName'] = soup.title.get_text(strip=True).split('|')[0].strip()

        # Extract Description Snippet: prefer a recognised content container,
        # fall back to the whole <body>.
        content_container = soup.find(['article', 'main'], class_=['content', 'entry-content', 'main-content']) or soup.body

        if content_container:
            # Use the first paragraph that actually has text; stopping at the very
            # first <p> loses the snippet whenever that paragraph is empty.
            for paragraph in content_container.find_all('p'):
                paragraph_text = _normalized_text(paragraph)
                if paragraph_text:
                    data['ContentSnippet'] = _first_sentence(paragraph_text)[:250]
                    break

        # Special handling for the edX course listing page: its meta description
        # is cleaner than body text, so prefer it whenever it is present. This
        # includes the case where no paragraph was found at all.
        if "edx.org/learn/criminal-law" in url:
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                data['ContentSnippet'] = meta_desc.get('content')

    except requests.exceptions.RequestException as e:
        # Network / HTTP failure: record it in the row rather than aborting the run
        data['TopicName'] = "ERROR: Fetch Failed"
        data['ContentSnippet'] = f"Request Error: {e}"
    except Exception as e:
        # Any parsing problem is recorded the same way
        data['TopicName'] = "ERROR: Scrape Failed"
        data['ContentSnippet'] = f"Scraping Error: {e}"

    return data

def append_to_csv(new_data: List[Dict[str, str]], filename: str):
    """
    Reads the existing CSV, appends the new rows, and writes the result back.

    Args:
        new_data: Scraped rows, one dict per row. Keys outside REQUIRED_COLUMNS
            are dropped; missing ones are filled with ''.
        filename: Path of the CSV file to update (created if it does not exist).

    Returns:
        None. The file is left untouched when `new_data` is empty or when the
        existing file cannot be read.
    """

    if not new_data:
        print("\n No new data scraped. File not updated.")
        return

    df_new = pd.DataFrame(new_data)

    # Read existing data or create a new DataFrame
    if os.path.exists(filename):
        try:
            df_existing = pd.read_csv(filename, header=0, encoding='utf-8')
        except pd.errors.EmptyDataError:
            # Zero-byte file: there is nothing to preserve, start from an empty table
            df_existing = pd.DataFrame(columns=REQUIRED_COLUMNS)
        except Exception as e:
            # Any other read failure means the existing rows are unknown.
            # Carrying on would overwrite the file with only the new rows,
            # so stop here and leave the file as it is.
            print(f" ERROR: could not read existing file '{filename}': {e}. File left untouched; new data not saved.")
            return
        print(f" Found existing file '{filename}' with {len(df_existing)} entries.")
    else:
        df_existing = pd.DataFrame(columns=REQUIRED_COLUMNS)
        print(f" Creating new file '{filename}'.")

    # Ensure consistent columns before concatenating
    df_existing = df_existing.reindex(columns=REQUIRED_COLUMNS, fill_value='')
    df_new = df_new.reindex(columns=REQUIRED_COLUMNS, fill_value='')

    # Append new data
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)

    # Save the combined data back to the original file
    df_combined.to_csv(filename, index=False, encoding='utf-8')

    print("\n" + "="*80)
    print(f" Scraping complete! Criminal Law data successfully appended to '{filename}'.")
    print(f"Total entries now: {len(df_combined)}")
    print("="*80)

# MAIN EXECUTION
# Scrapes every URL in CRIMINAL_LAW_URLS and appends the resulting rows to the CSV.
if __name__ == "__main__":
    # Gather new data from the fixed URL list
    scraped_data = []

    print(f"\n Starting scrape of {len(CRIMINAL_LAW_URLS)} pages...")
    for i, url in enumerate(CRIMINAL_LAW_URLS):
        # i is zero-based, so add 1 for the human-readable counter
        print(f" Scraping source {i + 1}/{len(CRIMINAL_LAW_URLS)}")
        details = scrape_page_details(url)
        scraped_data.append(details)
        time.sleep(0.5) # Pause between requests; the URLs point at three different sites

    # Append to the existing CSV file
    append_to_csv(scraped_data, CSV_PATH)


 Starting scrape of 3 pages...
 Scraping source 1/3
 Scraping source 2/3
 Scraping source 3/3
 Found existing file '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv' with 680 entries.

 craping complete! Criminal Law data successfully appended to '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv'.
Total entries now: 683


In [26]:
# Scrapes 20 unit pages from Khan Academy (11 AP Biology, 9 HS Biology) by deriving the course unit title
# from the URL slug and the summary snippet from the page content, appending the results to the course resources CSV.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import re
from typing import Dict, Any, List


# Configuration

# Path is used below but is missing from this cell's import list; importing it
# here keeps the cell runnable on a fresh kernel instead of relying on a name
# leaked from an earlier cell.
from pathlib import Path

# Folder containing this scraper: the script's own directory when run as a file,
# otherwise the current working directory (__file__ is not defined in a notebook).
scrapers_dir = Path(__file__).resolve().parent if "__file__" in locals() else Path(os.getcwd())
project_root = scrapers_dir.parent

# Build output directory path inside data/Raw_data/
output_dir = project_root / "data" / "Raw_data"
output_dir.mkdir(parents=True, exist_ok=True)

# Define full CSV save path
CSV_FILENAME = "course_resources.csv"
CSV_PATH = output_dir / CSV_FILENAME

# Browser-like User-Agent sent with every HTTP request
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# The required columns for the existing CSV
REQUIRED_COLUMNS = ['CourseName', 'TopicName', 'ResourceURL', 'ContentSnippet', 'ResourceType']

# Khan Academy unit pages to scrape: 20 URLs in total, grouped by the course
# each unit is filed under. Every URL is <root>/<prefix><slug>.
_KHAN_SCIENCE_ROOT = "https://www.khanacademy.org/science"
_UNIT_SLUGS_BY_COURSE = {
    # Biological Principles I - 11 URLs (AP Biology)
    "Biological Principles I": ("ap-biology/", [
        "chemistry-of-life",
        "cell-structure-and-function",
        "cellular-energetics",
        "cell-communication-and-cell-cycle",
        "heredity",
        "gene-expression-and-regulation",
        "natural-selection",
        "ecology-ap",
        "x16acb03e699817e9:simulations",
        "worked-examples-ap-biology",
        "ap-biology-standards-mappings",
    ]),
    # Ecology w/ Lab - 9 URLs (HS Biology; every slug shares the same id prefix)
    "Ecology w/ Lab": ("hs-bio/x230b3ff252126bb6:", [
        "ecology-and-natural-systems",
        "from-cells-to-organisms",
        "the-cell-cycle-and-differentiation",
        "energy-and-matter-in-biological-systems",
        "gene-expression-and-regulation",
        "inheritance-and-variation-of-traits",
        "mechanisms-of-evolution-hs",
        "common-ancestry-and-phylogeny",
        "biodiversity-and-human-impacts",
    ]),
}

# One {"CourseName", "URL"} task per unit, in the order listed above
FINAL_UNIT_TASKS = [
    {"CourseName": course, "URL": f"{_KHAN_SCIENCE_ROOT}/{prefix}{slug}"}
    for course, (prefix, slugs) in _UNIT_SLUGS_BY_COURSE.items()
    for slug in slugs
]

def extract_topic_from_url(url: str) -> str:
    """
    Turns the final path segment of a Khan Academy URL into a Title Case topic name.

    Query strings, fragments and trailing slashes are ignored. An identifier
    prefix such as "x16acb03e699817e9:" is dropped, and when the remaining slug
    is itself an identifier, the preceding path segment is used instead.

    Args:
        url: The unit page URL.

    Returns:
        The slug with hyphens/underscores replaced by spaces, in Title Case.
    """
    def _descriptive_part(segment: str) -> str:
        # "x16acb03e699817e9:simulations" -> "simulations"; unchanged without ':'
        return segment.rpartition(':')[2]

    # Drop everything from the first '?' or '#', then any trailing slashes
    bare_url = url.partition('?')[0].partition('#')[0].rstrip('/')
    path_parts = bare_url.split('/')

    slug = _descriptive_part(path_parts[-1])

    # A slug that still starts with an "x" + 16 hex digit id carries no topic
    # name, so fall back to the segment before it when there is one.
    is_identifier = re.match(r'^x[0-9a-f]{16}', slug, re.IGNORECASE) is not None
    if is_identifier and len(path_parts) >= 2:
        slug = _descriptive_part(path_parts[-2])

    # Hyphen/underscore runs become single spaces, then each word is capitalised
    spaced = re.sub(r'[-_]+', ' ', slug)
    return spaced.title()


def _normalized_text(element) -> str:
    """Return an element's text with every run of whitespace collapsed to one space."""
    # get_text(strip=True) strips each text node and joins them with "", which
    # glues words across inline tags ("Learn <b>about</b> cells" -> "Learnaboutcells").
    # Taking the raw text and collapsing whitespace keeps the page's own spacing.
    return " ".join(element.get_text().split())


def _first_sentence(text: str) -> str:
    """Return the first sentence of `text`, or all of `text` if no sentence end is found."""
    # A terminator only ends the sentence when followed by whitespace or the end
    # of the text (optionally after a closing quote/bracket), so a decimal such
    # as "3.5" does not cut the snippet short.
    match = re.match(r'.+?[.!?]["\')\]]*(?=\s|$)', text, re.DOTALL)
    return match.group(0) if match else text


def scrape_unit_page(task: Dict[str, Any]) -> Dict[str, str]:
    """
    Fetches the unit page and extracts the summary, using the URL for the title.

    Args:
        task: Dict with the keys 'URL' (page to fetch) and 'CourseName'.

    Returns:
        A dict with the keys CourseName, UnitURL, UnitTitle and
        UnitSummarySnippet. The function never raises: when the request or the
        parsing fails, UnitSummarySnippet holds an error message, and when no
        summary paragraph is found it stays "Summary Not Found".
    """
    url = task['URL']
    course_name = task['CourseName']

    data = {
        'CourseName': course_name,
        'UnitURL': url,
        # The title comes from the URL slug, not from the page content
        'UnitTitle': extract_topic_from_url(url),
        'UnitSummarySnippet': "Summary Not Found",
    }

    # Only scrape HTML for the summary
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Target the main content area: first element whose class matches one of these names
        main_content = soup.find(class_=re.compile(r'main-content|unit-description|content-container'))
        if main_content:
            # Use the first paragraph that actually has text; stopping at the very
            # first <p> loses the summary whenever that paragraph is empty.
            for paragraph in main_content.find_all('p'):
                paragraph_text = _normalized_text(paragraph)
                if paragraph_text:
                    data['UnitSummarySnippet'] = _first_sentence(paragraph_text)[:500]
                    break

    except requests.exceptions.RequestException:
        data['UnitSummarySnippet'] = "Request Error: Failed to fetch content for summary."
    except Exception:
        data['UnitSummarySnippet'] = "Scraping Error: Failed to parse summary."

    return data

def append_to_csv(new_data: List[Dict[str, str]], filename: str):
    """
    Writes the scraped unit rows to the course resources CSV.

    Rows are renamed to the CSV's column names, tagged with ResourceType 'UNIT'
    and appended without a header when the file already has content; otherwise
    the file is created with a header row. Does nothing when `new_data` is empty.
    Write errors are printed, not raised.
    """
    if not new_data:
        return

    # Unit-specific keys -> column names used in the CSV
    column_map = {
        'UnitTitle': 'TopicName',
        'UnitURL': 'ResourceURL',
        'UnitSummarySnippet': 'ContentSnippet',
    }
    unit_rows = pd.DataFrame(new_data).rename(columns=column_map)
    unit_rows['ResourceType'] = 'UNIT'
    unit_rows = unit_rows.reindex(columns=REQUIRED_COLUMNS, fill_value='')

    try:
        # A missing or zero-byte file needs the header row written first
        needs_header = not (os.path.exists(filename) and os.path.getsize(filename) > 0)
        if needs_header:
            unit_rows.to_csv(filename, mode='w', header=True, index=False, encoding='utf-8')
            print(f" Created and wrote {len(unit_rows)} unit entries to new file '{filename}'.")
        else:
            unit_rows.to_csv(filename, mode='a', header=False, index=False, encoding='utf-8')
            print(f" Appended {len(unit_rows)} unit entries to existing file '{filename}'.")

    except Exception as e:
        print(f"CRITICAL ERROR writing CSV: {e}. Data not saved.")


# MAIN EXECUTION
# Scrapes each task in FINAL_UNIT_TASKS once and appends the rows to the CSV.
if __name__ == "__main__":
    scraped_data = []
    # URLs already handled in this run, so a URL listed twice is scraped only once
    processed_urls = set()
    total_units = len(FINAL_UNIT_TASKS)

    print(f"1. Starting extraction and scrape of {total_units} Unit Pages...")

    for task in FINAL_UNIT_TASKS:
        url = task['URL']
        # Skip duplicates within the task list
        if url in processed_urls:
            continue

        details = scrape_unit_page(task)
        scraped_data.append(details)
        processed_urls.add(url)
        # Short pause between consecutive requests
        time.sleep(0.1)

    # Append the scraped data to the existing CSV file
    append_to_csv(scraped_data, CSV_PATH)

    print("\n Process Complete ")
    print("All unit entries, with TopicName extracted from the URL, have been processed.")

1. Starting extraction and scrape of 20 Unit Pages...
 Appended 20 unit entries to existing file '/Users/sarahkayembe/Documents/Database_cos557/Project phase 2/data/Raw_data/course_resources.csv'.

 Process Complete 
All unit entries, with TopicName extracted from the URL, have been processed.


In [None]:
# Automated system to fetch and log external learning resources (YouTube videos)
# for a list of courses using the YouTube API, storing results in a CSV.

import pandas as pd
from typing import List, Dict, Union
from pathlib import Path
from googleapiclient.discovery import build

# Configuration
import os  # not in this cell's import list; needed to read the key from the environment

# Keep the real key out of the notebook: set the YOUTUBE_API_KEY environment
# variable before running. Without it the key is empty and API calls will fail.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

# Start from the current working directory
current_path = Path.cwd()

# Go up one level to the project root
project_root = current_path.parent

# Constructing the full output path: project_root / 'data' / 'Raw_data'
output_dir = project_root / 'data' / 'Raw_data'

# Create the directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)
print("Output directory set.")

# Defining full save path
CSV_FILENAME = "course_resources.csv"
CSV_PATH = output_dir / CSV_FILENAME

# Courses to fetch a video for: maps each course name (written to the
# CourseName column) to the search query sent to the YouTube API.
COURSE_LIST = {
    # Non-Computer-Science courses
    'Analytical Chemistry with Lab': 'Analytical Chemistry lab techniques',
    'Organic Chemistry I': 'Organic Chemistry I bonding and structure tutorial',
    'Business and Professional Communication': 'Professional communication skills training',
    'Environmental Economics': 'Introduction to Environmental Economics concepts',
    'Teaching Through the Arts': 'Integrating Arts into Classroom Teaching methods',
    'Studies in Irish Literature and Culture': 'Introduction to Irish Literature and Culture summary',
    'Calculus A': 'Calculus A limits and derivatives tutorial',
    'Linear Algebra': 'Introduction to Linear Algebra and matrices lecture',
    
    # Computer Science courses
    'Structured Problem Solving Java': 'Structured problem solving with Java tutorial',
    'Algorithms in Programming': 'Introduction to Algorithms and Data Structures',
    'Structured Programming Laboratory': 'Structured programming lab exercises',
    'Python Programming': 'Python programming complete course for beginners',
    'Programming Topics': 'Advanced programming topics and best practices',
    'Computer Organization': 'Computer Organization architecture and design',
    'Computer Organization Laboratory': 'Computer Organization lab exercises and assembly',
    'Discrete Mathematics II': 'Discrete Mathematics II advanced topics',
    'Data Structures': 'Data Structures fundamental concepts and examples',
    'Systems Programming': 'Systems Programming concepts and C language',
    'Programming Languages': 'Programming Languages paradigms and concepts',
    'Graphical User Interface Design': 'GUI Design principles and tools tutorial',
    'Numerical Analysis': 'Numerical Analysis techniques and methods',
    'Web Applications Development': 'Web Applications Development full stack tutorial',
    'Programming Autonomous Robots': 'Programming Autonomous Robots and ROS tutorial',
    'Professional Ethics and Social Impact of Computing': 'Ethics and Social Impact of Computing lecture',
    'Object-Oriented Design': 'UML and Object-Oriented Design patterns explanation',
    'Computing for Data Science': 'Introduction to Computing for Data Science',
    'Mobile Development': 'Introduction to Mobile Development frameworks',
    'Computational Text Analytics': 'Computational Text Analytics methods and tools',
    'Software Engineering': 'Software Engineering principles and SDLC',
    'Deep Learning': 'Deep Learning Neural Networks explained',
    'Software Project Management': 'Software Project Management methodologies',
    'Operating Systems': 'Operating Systems concepts and kernel design',
    'Computer Graphics': 'Computer Graphics fundamentals and rendering',
    'Database Systems': 'Database Systems SQL and NoSQL tutorial',
    'Computer Networks': 'Computer Networks protocols and TCP/IP stack',
    'Distributed Systems': 'Introduction to Distributed Systems concepts',
    'Compiler Construction': 'Compiler Construction phases and Lexical Analysis',
    'Topics in Computer Science': 'Advanced Topics in Computer Science lectures',
    'Artificial Intelligence and Data Mining': 'AI and Data Mining concepts and applications',
    'Machine Learning': 'Machine Learning fundamental concepts and algorithms',
    'Design and Analysis of Computing Algorithms': 'Design and Analysis of Algorithms detailed course',
    'Independent Study in Computer Science': 'Computer Science independent research ideas',
    'Computer Science Internship': 'Computer Science Internship preparation and advice',
}

# API FUNCTION
def fetch_youtube_video(course_name: str, query: str) -> Union[Dict[str, str], None]:
    """
    Uses the YouTube Data API to search for the most relevant video
    and formats the result as a resource entry.

    Args:
        course_name: Course the video is filed under (stored as CourseName).
        query: Search text sent to the API.

    Returns:
        A dict with the keys CourseName, TopicName, ContentSnippet, ResourceURL
        and ResourceType, or None when the search returns no items or the API
        call fails (the error is printed, not raised).
    """
    import html  # function-scope import: this cell's import list does not include it

    print(f"\n-> Searching API for course: {course_name} with query: '{query}'")

    try:
        # Initialize the API client
        youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)

        # Execute the search.list request
        search_response = youtube.search().list(
            q=query,
            part='snippet',
            maxResults=1,  # Get only the top result
            type='video'
        ).execute()

        # Check if a video was found
        if not search_response.get('items'):
            print(f" No results found for {course_name}.")
            return None

        item = search_response['items'][0]
        video_id = item['id']['videoId']
        snippet = item['snippet']

        # Titles come back HTML-escaped (e.g. "I&#39;ll"); decode them so the
        # CSV holds plain text. The description is decoded the same way.
        title = html.unescape(snippet['title'])
        description = html.unescape(snippet.get('description') or '')

        # Mark the snippet with an ellipsis only when text was actually cut off
        if len(description) > 250:
            description = description[:250] + "..."

        # Extract and format the data
        topic_data = {
            'CourseName': course_name,
            'TopicName': title,
            'ContentSnippet': description,
            'ResourceURL': f'https://www.youtube.com/watch?v={video_id}',
            'ResourceType': 'YOUTUBE_VIDEO'
        }
        print(f" Successfully fetched video: {topic_data['TopicName']}")
        return topic_data

    except Exception as e:
        # Best effort: one failed course must not stop the remaining courses
        print(f" ERROR using YouTube API for {course_name}: {e}")
        print(" Please check your API key, quotas, and network connection.")
        return None

# --- APPEND FUNCTION ---
def append_to_csv(new_data: List[Dict[str, str]], filepath: Path):
    """
    Reads existing data, appends new data, and saves back to the file using
    the Path object for reliable path handling.

    The existing file's columns and their order are preserved; new rows are
    aligned to them, and any column that only the new rows have is added at
    the end.

    Args:
        new_data: Rows to append, one dict per row.
        filepath: CSV file to update (created if missing or empty).

    Returns:
        None. The file is left untouched when `new_data` is empty.
    """
    # An empty DataFrame has no columns; aligning the existing data to it would
    # rewrite the file with nothing in it, so bail out first.
    if not new_data:
        print(" No new data to append. File not updated.")
        return

    df_new = pd.DataFrame(new_data)

    # Read existing data, or start from an empty table if the file is missing
    # or zero bytes (read_csv raises on a zero-byte file)
    if filepath.exists() and filepath.stat().st_size > 0:
        df_existing = pd.read_csv(filepath, encoding='utf-8')
        print(f" Found existing file '{filepath.name}' with {len(df_existing)} entries.")
        # The file's header is the source of truth for column order
        extra_columns = [col for col in df_new.columns if col not in df_existing.columns]
        columns = list(df_existing.columns) + extra_columns
    else:
        df_existing = pd.DataFrame(columns=df_new.columns)
        print(f" Creating new file '{filepath.name}'.")
        columns = list(df_new.columns)

    # Ensure consistent columns before concatenating
    df_existing = df_existing.reindex(columns=columns, fill_value='')
    df_new = df_new.reindex(columns=columns, fill_value='')

    # Append new data (skip the existing frame when it holds no rows)
    frames = [frame for frame in (df_existing, df_new) if not frame.empty]
    df_combined = pd.concat(frames, ignore_index=True)

    # Save the combined data back to the original file
    df_combined.to_csv(filepath, index=False, encoding='utf-8')

    print("\n" + "="*70)
    print(" Data successfully appended.")
    print(f"Total entries now: {len(df_combined)}")
    print("="*70)


# MAIN EXECUTION (LOOPING)
# Fetches one video entry per course in COURSE_LIST and writes all of them
# to the CSV in a single append.
if __name__ == "__main__":
    
    # Initialize a list to hold all entries from all courses
    all_new_entries = []
    print(f"Starting API fetch for {len(COURSE_LIST)} courses...")

    # Loop through the courses: key is the course name, value is its search query
    for course_name, search_query in COURSE_LIST.items():
        
        # Fetch YouTube video data via API for the current course
        youtube_entry = fetch_youtube_video(course_name, search_query)
        
        # fetch_youtube_video returns None on no result or error; skip those
        if youtube_entry:
            all_new_entries.append(youtube_entry)

    # Append all collected entries to the CSV file; skip the write entirely
    # when every fetch failed
    if all_new_entries:
        print(f"\nFinished fetching. Appending {len(all_new_entries)} total entries to CSV.")
        append_to_csv(all_new_entries, CSV_PATH)
    else:
        print("\n No new entries were successfully generated to append to the CSV. Check API key and quota.")

Output directory setted.
Starting API fetch for 43 courses...

-> Searching API for course: Analytical Chemistry with Lab with query: 'Analytical Chemistry lab techniques'
 Successfully fetched video: Top 5 Lab Techniques Every Chemistry Researcher Must Know

-> Searching API for course: Organic Chemistry I with query: 'Organic Chemistry I bonding and structure tutorial'
 Successfully fetched video: Organic Chemistry Drawing Structures - Bond Line, Skeletal, and Condensed Structural Formulas

-> Searching API for course: Business and Professional Communication with query: 'Professional communication skills training'
 Successfully fetched video: Give me 8 minutes, and I&#39;ll improve your communication skills by 88%...

-> Searching API for course: Environmental Economics with query: 'Introduction to Environmental Economics concepts'
 Successfully fetched video: Environmental Econ: Crash Course Economics #22

-> Searching API for course: Teaching Through the Arts with query: 'Integrati