In [None]:
# COLLECTING COLLEGES NAMES AND STATES
"""
U.S. College Name Scraper
-------------------------
This script scrapes lists of American colleges and universities from Wikipedia.
It starts from the main "Lists of American universities and colleges" hub page,
follows each state-specific link, and extracts institution names from tables.

Key Features:
- Dynamically fetches and parses all U.S. state pages from Wikipedia.
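- Cleans names by stripping bracketed citations; duplicates are removed by the
  separate cleaning step that follows the scraper.
- Produces two CSV files (paths relative to the project root):
    - data/Raw_data/us_colleges.csv      → Raw scraped data (written by the scraper)
    - data/clean/us_colleges_clean.csv   → Cleaned, deduplicated data (written by the cleaning step)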
- Includes retry logic and polite delays to avoid IP throttling.
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
from io import StringIO
import warnings
import os
import time
import random
from pathlib import Path

# Silence every UserWarning raised for the rest of the run. The filter is not
# restricted to a particular library or message.
warnings.filterwarnings(action="ignore", category=UserWarning)

# --- configuration -----------------------------------------------------------
# Site root, used to turn relative /wiki/... hrefs into absolute URLs.
BASE_URL = "https://en.wikipedia.org"
# Hub page that links to the per-state lists.
HUB_PAGE_URL = BASE_URL + "/wiki/Lists_of_American_universities_and_colleges"

# Request headers sent with every HTTP call. A browser-like User-Agent is used
# because requests without one were answered with HTTP 403.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# function to get all state links
def get_all_state_links():
    """Fetch the hub page and collect the per-state list URLs.

    The hub page is requested up to three times with a 15 second timeout.
    A short randomized pause is taken between attempts, both when the
    previous attempt raised a request error and when it returned a non-200
    status. No pause is taken after the final attempt.

    Returns:
        dict: Maps the visible text of every link inside the element with
        id "bodyContent" whose href starts with
        "/wiki/List_of_colleges_and_universities_in_" to its absolute URL.
        Empty when the hub page could not be fetched or the page has no
        "bodyContent" element. If several links share the same text, the
        last one wins.
    """
    max_attempts = 3
    state_urls = {}
    print(f"Fetching link hub: {HUB_PAGE_URL}")

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            candidate = requests.get(HUB_PAGE_URL, headers=headers, timeout=15)
        except requests.exceptions.RequestException as e:
            print(f"Hub attempt {attempt} failed: {e}")
        else:
            if candidate.status_code == 200:
                response = candidate
                break
            print(f"Hub attempt {attempt}: status {candidate.status_code}")

        # Back off before retrying, whatever the reason for the failure;
        # sleeping after the last attempt would only delay the error report.
        if attempt < max_attempts:
            time.sleep(2 + random.random())

    if response is None:
        print("Failed to fetch hub page after 3 attempts.")
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')
    body_content = soup.find(id="bodyContent")
    if not body_content:
        return state_urls

    state_links = body_content.find_all(
        'a',
        href=lambda href: href and href.startswith('/wiki/List_of_colleges_and_universities_in_')
    )
    for link in state_links:
        state_name = link.get_text(strip=True)
        state_urls[state_name] = urljoin(BASE_URL, link['href'])

    return state_urls


# function to Scrape Colleges from a Single State Page (CLEANED)
def scrape_colleges_from_state_page(url, state_name):
    """Fetch one state page and extract institution names from its largest wikitable.

    The page is requested up to three times (15 second timeout) with a short
    randomized pause between attempts. Among all tables with class
    "wikitable", the one with the most <tr> rows is parsed with
    pandas.read_html using its first row as the header.

    The name column is the first of 'School', 'Institution', 'University',
    'College', 'Name' that matches a header once any bracketed citation marker
    is stripped from it (so "School[1]" matches 'School'). If none matches,
    the table's first column is used.

    Args:
        url: Absolute URL of the state page.
        state_name: Label used only in progress/error messages.

    Returns:
        list[str]: Institution names with bracketed citations removed and
        blank entries dropped. Empty when the page could not be fetched, has
        no usable wikitable, or the table could not be parsed.
    """
    max_attempts = 3
    colleges = []

    # header labels tried in priority order; 'School' is the primary match
    possible_college_columns = ['School', 'Institution', 'University', 'College', 'Name']

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            candidate = requests.get(url, headers=headers, timeout=15)
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt}: Error fetching {state_name} ({e})")
        else:
            if candidate.status_code == 200:
                response = candidate
                break
            print(f"Attempt {attempt}: {state_name} returned status {candidate.status_code}")

        # wait a bit before retry, for HTTP errors as well as exceptions
        if attempt < max_attempts:
            time.sleep(1 + 1 + random.random())

    if response is None:
        print(f"Skipping {state_name} after 3 failed attempts.")
        return colleges

    # Polite delay after every successful fetch, so pages that yield no names
    # are throttled the same way as pages that do.
    time.sleep(1 + random.random())

    soup = BeautifulSoup(response.content, 'html.parser')

    # using BeautifulSoup to find all tables with the specific Wikipedia class
    wiki_tables = soup.find_all('table', {'class': 'wikitable'})
    if not wiki_tables:
        return colleges

    # prioritizing the table with the most rows (first one wins on ties)
    best_table = max(wiki_tables, key=lambda table: len(table.find_all('tr')))
    if not best_table.find_all('tr'):
        return colleges

    # using pandas.read_html on the SINGLE table's HTML string; it raises
    # ValueError instead of returning an empty list when nothing parses
    try:
        data_frames = pd.read_html(StringIO(str(best_table)), header=0)
    except ValueError as e:
        print(f"Could not parse the table for {state_name} ({e})")
        return colleges

    if not data_frames:
        return colleges

    df = data_frames[0]
    if df.shape[1] == 0:
        return colleges

    # Map each header, with any citation marker removed, to its position.
    # Positions are used so the lookup does not depend on label dtype.
    header_positions = {}
    for position, label in enumerate(df.columns):
        header_positions.setdefault(str(label).split('[')[0].strip(), position)

    # search the headers for the college-name column; fall back to column 0
    column_position = next(
        (header_positions[candidate_name]
         for candidate_name in possible_college_columns
         if candidate_name in header_positions),
        0,
    )

    names = df.iloc[:, column_position].dropna().astype(str)
    # cleaning by removing bracketed citations like [1], [a], etc.
    cleaned_names = (name.split('[')[0].strip() for name in names)
    colleges.extend(name for name in cleaned_names if name)

    return colleges

# final main execution loop (FULL CRAWL)
def run_scraper():
    """Scrape every state page and save the combined result as a CSV file.

    Steps:
      1. Collect the state links; abort with a message if none are found.
      2. Scrape each state page in turn, printing progress.
      3. Print a summary of how many pages and institutions were collected.
      4. Write one row per institution (columns 'State', 'College Name') to
         data/Raw_data/us_colleges.csv under the project root. The project
         root is the parent of the working directory when that directory is
         named 'scrapers', otherwise the working directory itself.

    The CSV always contains the header row, even when no institutions were
    collected, so that it can still be loaded afterwards.

    Returns:
        None
    """
    # get all state links
    state_links = get_all_state_links()
    if not state_links:
        print("Scraper aborted: Could not get state links.")
        return

    total_pages = len(state_links)
    print(f"Found {total_pages} state/territory links.")
    print(f"\nStarting full scraping process for all {total_pages} pages...")

    # iterating through each state and scraping its page
    all_college_data = []
    for i, (state, url) in enumerate(state_links.items(), start=1):
        print(f"  [{i}/{total_pages}] Scraping colleges in {state}...")

        colleges = scrape_colleges_from_state_page(url, state)
        if not colleges:
            print(f"    -> No college data extracted for {state}. (Structure may be non-standard)")
            continue

        all_college_data.append({
            'State': state,
            'Colleges': colleges,
            'Count': len(colleges)
        })
        print(f"    -> Extracted {len(colleges)} institutions.")

    # final Aggregation and Output
    total_institutions = sum(item['Count'] for item in all_college_data)

    print("\n==============================================")
    print("SCRAPING COMPLETE")
    print("==============================================")
    print(f"Total states/territories successfully scraped: {len(all_college_data)} out of {total_pages}")
    print(f"Total institutions collected: {total_institutions:,}")
    print("----------------------------------------------")

    # converting to a flat table: one row per (state, institution)
    all_colleges_flat = [
        {'State': item['State'], 'College Name': college_name}
        for item in all_college_data
        for college_name in item['Colleges']
    ]

    # Columns are named explicitly so an empty result still produces a CSV
    # with a header row instead of a file that cannot be parsed.
    df_final = pd.DataFrame(all_colleges_flat, columns=['State', 'College Name'])

    output_filename = "us_colleges.csv"
    current_dir = Path.cwd()
    # If currently inside 'scrapers', move up one level to project root
    project_root = current_dir.parent if current_dir.name == "scrapers" else current_dir

    # Define path to data/Raw_data
    data_dir = project_root / "data" / "Raw_data"
    data_dir.mkdir(parents=True, exist_ok=True)

    # Final file path
    output_path = data_dir / output_filename

    print(f"\nSaving data to: {output_path}")
    df_final.to_csv(output_path, index=False, encoding="utf-8")
    print("Finished. Data saved successfully.")

# Entry point: start the full crawl only when this code is executed directly,
# not when it is imported from another module.
if __name__ == "__main__":
    run_scraper()

In [None]:
# Cleaning the U.S. Colleges CSV
"""
This script loads the raw 'us_colleges.csv' from data/Raw_data/,
cleans and deduplicates it, and saves the final file into data/clean/.
"""

import os
import pandas as pd
from pathlib import Path

# Path setup (works in both scrapers/ and project root)

current_dir = Path.cwd()

# If inside 'scrapers', go up to project root
project_root = current_dir.parent if current_dir.name == "scrapers" else current_dir

# Define input and output folders
raw_data_dir = project_root / "data" / "Raw_data"
clean_data_dir = project_root / "data" / "clean"
clean_data_dir.mkdir(parents=True, exist_ok=True)

# File paths
input_path = raw_data_dir / "us_colleges.csv"
output_path = clean_data_dir / "us_colleges_clean.csv"

# Load the CSV file
print(f"Loading file from: {input_path}")
df = pd.read_csv(input_path)
rows_loaded = len(df)

# Drop rows without a usable college name BEFORE casting to str: a blank cell
# is read back as NaN, and astype(str) would turn it into the literal text
# "nan", which would then survive into the clean file as a fake institution.
has_name = df['College Name'].notna() & (df['College Name'].astype(str).str.strip() != '')
df = df.loc[has_name].copy()
blank_rows = rows_loaded - len(df)

# Normalize column names and values
df['College Name'] = df['College Name'].astype(str).str.strip()
df['State'] = df['State'].astype(str).str.strip()

# Detect and show exact duplicates (same name within the same state)
duplicates_mask = df.duplicated(subset=['College Name', 'State'], keep='first')
duplicates_df = df[duplicates_mask]

if duplicates_df.empty:
    print("\nNo exact duplicates found to remove.")
else:
    print("\nExact duplicates to be removed:")
    for college_name, state in zip(duplicates_df['College Name'], duplicates_df['State']):
        print(f" - {college_name} ({state})")

# Drop duplicates and sort
df_clean = df.drop_duplicates(subset=['College Name', 'State'], keep='first')
df_clean = df_clean.sort_values(by=['State', 'College Name']).reset_index(drop=True)

# Save cleaned data
df_clean.to_csv(output_path, index=False, encoding='utf-8')

# Summary
print("\n--------------------------------")
print(f"Before: {rows_loaded} rows")
if blank_rows:
    print(f"Dropped: {blank_rows} rows with a missing or blank college name")
print(f"After:  {len(df_clean)} rows")
print(f"Removed: {len(df) - len(df_clean)} exact duplicates")
print(f"Clean file saved to: {output_path}")
