In [None]:
"""
USM Course Resource Finder (VS Code Version)
--------------------------------------------
Searches DuckDuckGo (with Bing fallback) for educational resources based on
the courses listed in '../data/clean/usm_courses_clean.csv'. All results are
saved to '../data/course_resources.csv'.
"""

import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
import random
import os

# Resolve data locations relative to the working directory.
# The script is expected to be launched from the /scrapers folder.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "..", "data")
clean_dir = os.path.join(data_dir, "clean")
os.makedirs(data_dir, exist_ok=True)  # make sure the output folder exists

input_path = os.path.join(clean_dir, "usm_courses_clean.csv")
output_path = os.path.join(data_dir, "course_resources.csv")

# Read the cleaned course list, dropping rows without a course name and
# rows that are exact duplicates of an earlier row.
print(" Loading courses")
df_courses = pd.read_csv(input_path)
df_courses = df_courses.dropna(subset=["course_name"])
df_courses = df_courses.drop_duplicates()

# Cap the number of courses processed (handy while testing).
df_courses = df_courses.head(50)
print(f" Loaded {len(df_courses)} courses to search for resources.\n")

# Search engine helper functions
def duckduckgo_search(query, max_results=5):
    """Fetch search results from DuckDuckGo's HTML results page.

    Args:
        query: Search string, sent as the ``q`` request parameter.
        max_results: Maximum number of result anchors to inspect.

    Returns:
        A list of dicts with the keys ``query``, ``title`` and ``url``.
        The list is empty when the request raises or the server answers
        with an HTTP error status. Anchors without an ``href`` are skipped,
        so the list may hold fewer than ``max_results`` entries.
    """
    url = "https://duckduckgo.com/html/"
    params = {"q": query}
    headers = {"User-Agent": "Mozilla/5.0"}

    print(f"   [DuckDuckGo] Searching for: {query[:60]}...")
    try:
        r = requests.get(url, params=params, headers=headers, timeout=15)
        print(f"   Response: {r.status_code}, {len(r.text)} bytes")
        r.raise_for_status()
    except Exception as e:
        # Deliberate best effort: any failure is reported and treated as
        # "no results" so the caller can decide what to do next.
        print(f"   DuckDuckGo failed: {e}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    results = []
    for link in soup.select(".result__a")[:max_results]:
        href = link.get("href")
        if not href:
            # An entry without a URL is useless downstream and would be
            # stored as an empty/None value; skip it, as bing_search does.
            continue
        results.append({
            "query": query,
            "title": link.text.strip(),
            "url": href
        })

    print(f"    Found {len(results)} results from DuckDuckGo.")
    return results


def bing_search(query, max_results=5):
    """Query Bing's web search page and return the parsed result links.

    Args:
        query: Search string, sent as the ``q`` request parameter.
        max_results: Maximum number of ``li.b_algo`` entries to inspect.

    Returns:
        A list of dicts with the keys ``query``, ``title`` and ``url``;
        empty when the request raises or returns an HTTP error status.
    """
    print(f"   [Bing] Searching for: {query[:60]}...")
    try:
        response = requests.get(
            "https://www.bing.com/search",
            params={"q": query},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=15,
        )
        print(f"   Response: {response.status_code}, {len(response.text)} bytes")
        response.raise_for_status()
    except Exception as err:
        print(f"    Bing failed: {err}")
        return []

    page = BeautifulSoup(response.text, "html.parser")
    # First anchor of each result entry; entries lacking a usable link
    # are filtered out below.
    anchors = [entry.find("a") for entry in page.select("li.b_algo")[:max_results]]
    hits = [
        {"query": query, "title": anchor.text.strip(), "url": anchor.get("href")}
        for anchor in anchors
        if anchor and anchor.get("href")
    ]

    print(f"    Found {len(hits)} results from Bing.")
    return hits

# Combining search function with fallback
def get_resources_for_course(course_name, max_results=5):
    """Look up resources for one course, preferring DuckDuckGo over Bing.

    The same query is sent to DuckDuckGo first; Bing is consulted only when
    DuckDuckGo yields an empty result list.

    Args:
        course_name: Name of the course to build the search query from.
        max_results: Passed through to the search helpers.

    Returns:
        The list of result dicts produced by whichever engine answered.
    """
    query = f"{course_name} course resources PDF OR lecture notes OR open courseware"

    primary_hits = duckduckgo_search(query, max_results=max_results)
    if primary_hits:
        return primary_hits

    print(f"    Falling back to Bing for '{course_name}'...")
    return bing_search(query, max_results=max_results)

# Gathering resources for all courses
def gather_resources(df):
    """Collect search results for every course in ``df``.

    Args:
        df: DataFrame with a ``course_name`` column.

    Returns:
        A flat list of the result dicts returned by
        ``get_resources_for_course`` for all rows, in row order. Courses
        whose lookup raises are reported and skipped.
    """
    all_data = []
    total = len(df)
    # Count rows with enumerate instead of using the index label: labels
    # are not guaranteed to be 0..n-1 (rows may have been filtered out, or
    # the index may not be numeric), which would garble the progress output.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        course_name = row["course_name"]
        print(f"\n [{position}/{total}] Searching resources for: {course_name}")
        try:
            resources = get_resources_for_course(course_name, max_results=5)
            all_data.extend(resources)
        except Exception as e:
            # Keep going: one failing course must not abort the whole run.
            print(f"  Unexpected error for '{course_name}': {e}")
        time.sleep(2 + random.random())  # polite delay
    return all_data

# Running the scraper
def main():
    """Run the full search and write the de-duplicated results to CSV.

    Gathers resources for every course in ``df_courses``. When nothing was
    found, a hint is printed and no file is written; otherwise rows sharing
    a ``url`` are collapsed and the table is saved to ``output_path``.
    """
    print(" Starting course resource search...")
    collected = gather_resources(df_courses)

    if collected:
        unique_resources = pd.DataFrame(collected).drop_duplicates(subset="url")
        unique_resources.to_csv(output_path, index=False, encoding="utf-8")
        print(f"\n Done! Saved {len(unique_resources)} unique resources to: {output_path}")
    else:
        print("\n No resources found. Check your internet connection or try again later.")


# Only start the scraper when executed directly, not on import.
if __name__ == "__main__":
    main()


 Loading courses
 Loaded 50 courses to search for resources.

 Starting course resource search...

 [1/50] Searching resources for: Analysis of Archaeological Materials
   [DuckDuckGo] Searching for: Analysis of Archaeological Materials course resources PDF OR...
   DuckDuckGo failed: HTTPSConnectionPool(host='duckduckgo.com', port=443): Max retries exceeded with url: /html/?q=Analysis+of+Archaeological+Materials+course+resources+PDF+OR+lecture+notes+OR+open+courseware (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x10e7a5850>, 'Connection to duckduckgo.com timed out. (connect timeout=15)'))
    Falling back to Bing for 'Analysis of Archaeological Materials'...
   [Bing] Searching for: Analysis of Archaeological Materials course resources PDF OR...
   Response: 200, 109423 bytes
    Found 5 results from Bing.

 [2/50] Searching resources for: Environmental Archaeology
   [DuckDuckGo] Searching for: Environmental Archaeology course resources PDF OR lecture