In [None]:
# COLLECTING COLLEGE NAMES AND STATES

import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
from io import StringIO
import warnings

# Silence every UserWarning for the rest of the run; the filter is
# category-wide rather than tied to one library or one message.
warnings.filterwarnings(action="ignore", category=UserWarning)

# Site root, and the hub page that links out to the per-state lists.
BASE_URL = "https://en.wikipedia.org"
HUB_PAGE_URL = BASE_URL + "/wiki/Lists_of_American_universities_and_colleges"

# Headers sent with every request. A browser-style User-Agent is supplied
# because, per the original author's note, requests were otherwise met
# with a 403 response.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# function to Get All State Links
def get_all_state_links():
    """Fetch the hub page and collect the per-state list URLs.

    Returns:
        dict: Maps the visible text of each matching link to its absolute
        URL. Empty when the request fails or the page has no element with
        id ``bodyContent``.
    """
    state_urls = {}
    print(f"Fetching link hub: {HUB_PAGE_URL}")

    try:
        # Without a timeout an unresponsive server would hang the crawl
        # forever. A timeout raises requests.exceptions.Timeout, which is a
        # RequestException and is therefore reported by the handler below.
        response = requests.get(HUB_PAGE_URL, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching hub page: {e}")
        return state_urls

    soup = BeautifulSoup(response.content, 'html.parser')
    body_content = soup.find(id="bodyContent")
    if not body_content:
        return state_urls

    # Only anchors whose href starts with the per-state list prefix qualify.
    state_links = body_content.find_all(
        'a',
        href=lambda href: href and href.startswith('/wiki/List_of_colleges_and_universities_in_')
    )
    for link in state_links:
        state_name = link.get_text(strip=True)
        if not state_name:
            # An anchor with no visible text would otherwise be stored
            # under an empty-string key.
            continue
        state_urls[state_name] = urljoin(BASE_URL, link['href'])

    return state_urls

# function to Scrape Colleges from a Single State Page (CLEANED)
def scrape_colleges_from_state_page(url, state_name):
    """Return the college names found in the largest wikitable on a page.

    The page is fetched, the ``wikitable`` with the most ``<tr>`` rows is
    parsed with pandas, and the first candidate column present in that
    table supplies the names.

    Args:
        url: URL of the state's list page.
        state_name: Label for the state; used only in error messages.

    Returns:
        list[str]: Non-empty names with everything from the first ``[``
        onward removed (bracketed citations such as ``[1]``). Empty when the
        page cannot be fetched, has no wikitable, has no candidate column,
        or cannot be parsed.
    """
    # Candidate column labels, tried in order; the first one present wins.
    # The trailing 0 only matches a frame that has a column labelled 0.
    possible_college_columns = ['School', 'Institution', 'University', 'College', 'Name', 0]

    try:
        # A timeout keeps one unresponsive page from stalling the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        wiki_tables = soup.find_all('table', {'class': 'wikitable'})
        if not wiki_tables:
            return []

        # Pick the table with the most rows; list.index() returns the first
        # occurrence, so ties go to the earliest table on the page.
        row_counts = [len(table.find_all('tr')) for table in wiki_tables]
        best_size = max(row_counts)
        if best_size == 0:
            return []
        best_table = wiki_tables[row_counts.index(best_size)]

        # Parse only the chosen table's HTML, using its first row as header.
        data_frames = pd.read_html(StringIO(str(best_table)), header=0)
        if not data_frames:
            return []
        df = data_frames[0]

        for col_name in possible_college_columns:
            if col_name in df.columns:
                raw_names = df[col_name].dropna().astype(str)
                cleaned = (name.split('[')[0].strip() for name in raw_names)
                # Drop cells that held nothing but a citation marker.
                return [name for name in cleaned if name]

    except requests.exceptions.RequestException as e:
        print(f"    -> Request failed for {state_name}: {e}")
    except Exception as e:
        # Deliberately best-effort: one malformed page must not abort the
        # crawl, but the failure is reported instead of silently dropped.
        print(f"    -> Could not parse table for {state_name}: {e}")

    return []

# final Main Execution Loop (FULL CRAWL)
def run_scraper():
    """Scrape every state page and write the results to ``us_colleges.csv``.

    Progress and a final summary are printed. The CSV has one row per
    institution with the columns ``State`` and ``College Name``; the header
    row is written even when no institutions were collected.

    Returns:
        None. Returns early, without writing a file, when no state links
        could be obtained.
    """
    state_links = get_all_state_links()
    if not state_links:
        print("Scraper aborted: Could not get state links.")
        return

    total_pages = len(state_links)
    print(f"Found {total_pages} state/territory links.")
    print(f"\nStarting full scraping process for all {total_pages} pages...")

    all_colleges_flat = []   # one {'State', 'College Name'} row per institution
    states_scraped = 0       # pages that yielded at least one name

    for i, (state, url) in enumerate(state_links.items(), start=1):
        print(f"  [{i}/{total_pages}] Scraping colleges in {state}...")

        colleges = scrape_colleges_from_state_page(url, state)

        if colleges:
            states_scraped += 1
            all_colleges_flat.extend(
                {'State': state, 'College Name': college_name}
                for college_name in colleges
            )
            print(f"    -> Extracted {len(colleges)} institutions.")
        else:
            print(f"    -> No college data extracted for {state}. (Structure may be non-standard)")

    print("\n==============================================")
    print("SCRAPING COMPLETE")
    print("==============================================")
    print(f"Total states/territories successfully scraped: {states_scraped} out of {total_pages}")
    print(f"Total institutions collected: {len(all_colleges_flat):,}")
    print("----------------------------------------------")

    # Naming the columns explicitly keeps the CSV header present even when
    # the row list is empty (an empty list alone yields a column-less frame).
    df_final = pd.DataFrame(all_colleges_flat, columns=['State', 'College Name'])

    output_filename = 'us_colleges.csv'
    print(f"\nSaving data to '{output_filename}'...")
    df_final.to_csv(output_filename, index=False, encoding='utf-8')
    print("Finished. Data saved successfully.")

# Start the full crawl. The call sits at the top level of this cell/file,
# so it runs as soon as the code is executed.
run_scraper()