In [2]:
"""
USM Quiz Scraper (VS Code Version - Structured for SQL Import)
--------------------------------------------------------------
Searches the web (DuckDuckGo + Bing fallback) for quiz resources related
to courses listed in '../data/clean/usm_courses_clean.csv', then converts
them into a structure compatible with the Quiz, Question, and Answer
tables defined in the Quizzes & Flashcards schema.

Output (one CSV per table, written to ../data/):
    quiz_table.csv
    question_table.csv
    answer_table.csv
"""

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time, random, os

# Directory layout, resolved from the working directory (expected: .../scrapers)
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "..", "data")  # ../data
clean_dir = os.path.join(data_dir, "clean")         # ../data/clean

# Input and output file locations
input_path = os.path.join(clean_dir, "usm_courses_clean.csv")
output_path = os.path.join(data_dir, "quiz_data.csv")

# Make sure the output folder exists before anything is written to it
os.makedirs(data_dir, exist_ok=True)

# Read the cleaned course list, discard rows without a course name and
# exact duplicate rows, then keep only the first 30 rows (limit for testing)
print("Loading courses")
df_courses = (
    pd.read_csv(input_path)
    .dropna(subset=["course_name"])
    .drop_duplicates()
    .head(30)
)
print(f"Loaded {len(df_courses)} courses.\n")

# DuckDuckGo / Bing scraping helpers
def duckduckgo_search(query, max_results=5):
    """Return up to ``max_results`` hits from DuckDuckGo's HTML results page.

    Args:
        query: Search string sent as the ``q`` parameter.
        max_results: Maximum number of hits to return.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, one per result
        anchor that carries an ``href``. An empty list is returned (and the
        error is printed) when the HTTP request fails.
    """
    url = "https://duckduckgo.com/html/"
    params = {"q": query}
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, params=params, headers=headers, timeout=15)
        r.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failure and let the caller fall back.
        print(f"DuckDuckGo error: {e}")
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    # Skip anchors without an href (same guard as bing_search) instead of
    # raising KeyError, and cap only after filtering so unusable anchors
    # do not consume result slots.
    results = [
        {"title": link.text.strip(), "url": link["href"]}
        for link in soup.select(".result__a")
        if link.get("href")
    ]
    return results[:max_results]

def bing_search(query, max_results=5):
    """Return up to ``max_results`` hits from a Bing results page.

    Args:
        query: Search string sent as the ``q`` parameter.
        max_results: Maximum number of hits to return.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts built from the first
        anchor of each ``li.b_algo`` element that has an ``href``. An empty
        list is returned (and the error is printed) when the HTTP request
        fails.
    """
    url = "https://www.bing.com/search"
    params = {"q": query}
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, params=params, headers=headers, timeout=15)
        r.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failure and return no results.
        print(f"Bing error: {e}")
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    results = []
    for li in soup.select("li.b_algo"):
        a = li.find("a")
        if a and a.get("href"):
            results.append({"title": a.text.strip(), "url": a["href"]})
    # Cap after filtering so entries without a usable link do not reduce
    # the number of hits returned.
    return results[:max_results]

def get_search_results(query, max_results=5):
    """Search DuckDuckGo first; query Bing only if DuckDuckGo returned nothing."""
    return duckduckgo_search(query, max_results) or bing_search(query, max_results)

# Main scraper
def gather_quiz_data(df, search=None, sleep=time.sleep):
    """Build Quiz, Question and Answer tables from web search results.

    For every value in ``df["course_name"]`` a web search is run and one
    quiz record is created per search hit. Each quiz receives three
    placeholder questions of a randomly chosen type; multiple-choice
    questions additionally receive four placeholder options (A-D) with
    option A flagged as correct. IDs in all three tables start at 1 and
    increase sequentially.

    Args:
        df: DataFrame with a ``course_name`` column.
        search: Callable taking the query string and returning a list of
            dicts that each have a ``"title"`` key. Defaults to
            ``get_search_results``.
        sleep: Callable invoked with a delay in seconds after each course
            to throttle requests. Defaults to ``time.sleep``.

    Returns:
        A ``(quiz_df, question_df, answer_df)`` tuple of DataFrames. The
        column layout is fixed, so the frames (and CSVs written from them)
        keep their headers even when no records were produced.
    """
    quiz_columns = ["quiz_id", "title", "description", "course_name", "creator_id"]
    question_columns = ["question_id", "quiz_id", "question_text", "question_type", "points"]
    answer_columns = ["answer_id", "question_id", "answer_text", "is_correct"]

    if search is None:
        # Resolved at call time so the module-level helper is used by default.
        search = get_search_results

    quiz_records = []
    question_records = []
    answer_records = []

    quiz_id = 1
    question_id = 1
    answer_id = 1

    total = len(df)
    # Count positions with enumerate: the DataFrame index is not guaranteed
    # to be 0..n-1 (rows may have been dropped upstream) or even numeric.
    for position, course_name in enumerate(df["course_name"], start=1):
        print(f"[{position}/{total}] Searching quiz content for: {course_name}")

        quiz_query = f"{course_name} quiz questions PDF OR multiple choice OR test bank"
        results = search(quiz_query)

        for r in results:
            quiz_title = r["title"][:255]  # truncate titles to 255 characters
            quiz_desc = f"Auto-scraped quiz reference for {course_name}"
            quiz_records.append({
                "quiz_id": quiz_id,
                "title": quiz_title,
                "description": quiz_desc,
                "course_name": course_name,
                "creator_id": 1
            })

            # Placeholder simulated questions for structure
            for q_num in range(1, 4):
                q_text = f"Sample Question {q_num} for {quiz_title}"
                q_type = random.choice(["multiple_choice", "true_false", "short_answer"])
                question_records.append({
                    "question_id": question_id,
                    "quiz_id": quiz_id,
                    "question_text": q_text,
                    "question_type": q_type,
                    "points": 1
                })

                # Only multiple-choice questions get answer rows.
                if q_type == "multiple_choice":
                    for opt in ["A", "B", "C", "D"]:
                        answer_records.append({
                            "answer_id": answer_id,
                            "question_id": question_id,
                            "answer_text": f"Option {opt} for {q_text}",
                            "is_correct": 1 if opt == "A" else 0
                        })
                        answer_id += 1
                question_id += 1
            quiz_id += 1

        print(f"  Added {len(results)} quizzes for {course_name}")
        # Randomised 2-3 second pause between courses.
        sleep(2 + random.random())

    quiz_df = pd.DataFrame(quiz_records, columns=quiz_columns)
    question_df = pd.DataFrame(question_records, columns=question_columns)
    answer_df = pd.DataFrame(answer_records, columns=answer_columns)
    return quiz_df, question_df, answer_df

def main():
    """Scrape quiz data for the loaded courses and write one CSV per table."""
    print("Starting quiz scraping...")
    quiz_df, question_df, answer_df = gather_quiz_data(df_courses)

    # One CSV per table, written into the data folder for import
    exports = (
        ("quiz_table.csv", quiz_df),
        ("question_table.csv", question_df),
        ("answer_table.csv", answer_df),
    )
    for filename, table in exports:
        table.to_csv(os.path.join(data_dir, filename), index=False, encoding="utf-8")

    print(f"Saved {len(quiz_df)} quizzes, {len(question_df)} questions, {len(answer_df)} answers to the data folder.")


if __name__ == "__main__":
    main()


Loading courses
Loaded 30 courses.

Starting quiz scraping...
[1/30] Searching quiz content for: Analysis of Archaeological Materials
DuckDuckGo error: HTTPSConnectionPool(host='duckduckgo.com', port=443): Max retries exceeded with url: /html/?q=Analysis+of+Archaeological+Materials+quiz+questions+PDF+OR+multiple+choice+OR+test+bank (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x115bebf20>, 'Connection to duckduckgo.com timed out. (connect timeout=15)'))
  Added 5 quizzes for Analysis of Archaeological Materials
[2/30] Searching quiz content for: Environmental Archaeology
DuckDuckGo error: HTTPSConnectionPool(host='duckduckgo.com', port=443): Max retries exceeded with url: /html/?q=Environmental+Archaeology+quiz+questions+PDF+OR+multiple+choice+OR+test+bank (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x115beb680>, 'Connection to duckduckgo.com timed out. (connect timeout=15)'))
  Added 5 quizzes for Environmental Archaeolo