## Import Necessary Libraries and Functions
We use Selenium (in particular `webdriver`, `By`, `Options`, and `NoSuchElementException`), together with `time`, `os`, and `pandas`.

In [5]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
import os
import pandas as pd


In [None]:
def wait_for_downloads(variables, year, directory, timeout = 200):
    """
    Poll a directory until every expected district-level file is present.

    Parameters:
    variables (list): Data set codes that were requested (ex. "PROF", "REF")
    year (int): Year of the data; years before 2021 are expected as .dat
        files, later years as .csv files
    directory (str): Directory the browser downloads into
    timeout (int): Maximum number of seconds to keep polling

    Returns:
    bool: True once all expected files are present, False on timeout
    """
    def expected_name(code):
        # The reference data set is named DREF; everything else is DIST<code>.
        suffix = ".dat" if year < 2021 else ".csv"
        stem = "DREF" if code == "REF" else f"DIST{code}"
        return stem + suffix

    wanted = [expected_name(code) for code in variables]
    began = time.time()

    while time.time() - began < timeout:
        present = set(os.listdir(directory))

        # Expected names always end in .dat/.csv, so a match can never be a
        # partial ".crdownload" file.
        if all(name in present for name in wanted):
            print(f"All downloads for {year} completed successfully.")
            print("")
            return True

        print("Waiting for all files to download...")
        time.sleep(5)

    return False

## Function for TAPR Scraping
This function takes two list inputs:
* years is a list of YYYY integers (ex. 2018)
* variables is a list of strings where each string is a dataset available on TAPR (ex. PROF)
The function saves all .csv or .dat files to your local directory in separate directories for each year. 

In [21]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def tapr_scraper(years, variables):
    """
    Download district-level TAPR data sets for each requested year.

    For every year a "raw_data{year}" directory is created under the current
    working directory, a Chrome session is opened on that year's TAPR
    download page, and each requested data set is downloaded into it.

    Parameters:
    years (list): List of YYYY integers (ex. 2018)
    variables (list): List of TAPR data set codes (ex. "PROF")
    """
    base_dir = os.getcwd()  # files are saved beneath the current working directory
    print(base_dir)
    for year in years:
        # One directory per year holds that year's raw data.
        dir_name = f"raw_data{year}"
        download_dir = os.path.join(base_dir, dir_name)
        os.makedirs(download_dir, exist_ok=True)

        # Point Chrome's downloads at the year directory instead of "Downloads".
        chromeOptions = webdriver.ChromeOptions()
        prefs = {"download.default_directory": download_dir}
        chromeOptions.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(options=chromeOptions)
        try:
            driver.get(f"https://rptsvr1.tea.texas.gov/perfreport/tapr/{year}/download/DownloadData.html")

            # Select district-level data.
            district_select = driver.find_element(By.XPATH, "//input[@type='radio' and @name='sumlev' and @value='D']")
            district_select.click()

            # Data sets that were skipped (already on disk or not offered for this year).
            cant_download = []
            print(f"Downloading Data for {year}")
            for var in variables:
                print(f"Checking for DIST{var} data...")
                file_patterns = [f"DIST{var}.csv", f"DIST{var}.dat", f"D{var}.dat", f"D{var}.csv"]
                if any(os.path.isfile(os.path.join(download_dir, file)) for file in file_patterns):
                    print(f"{var} already exists")
                    cant_download.append(var)
                    continue
                try:
                    select_data = driver.find_element(By.XPATH, f"//input[@type='radio' and @name='setpick' and @value='{var}']")
                    select_data.click()
                    download = driver.find_element(By.XPATH, "//input[@type='submit' and @value='Continue']")
                    download.click()
                    print(f"{var} downloaded for {year}")
                except NoSuchElementException:
                    # Some data sets do not exist for every year (e.g. PERF1 in earlier data).
                    print(f"{var} not found for {year}")
                    cant_download.append(var)
                    continue

            # Only wait when at least one download was actually started.
            if len(cant_download) != len(variables):
                time.sleep(60)  # give the downloads time to finish before quitting the driver
        finally:
            # Always release the browser, even if a lookup or click failed.
            driver.quit()

## Manav: Altered TAPR Scraper Function 
Updates: 
* Specify if you want campus, district, region, or state level data
* Did not have to manually download each file to the path
* Created the folders properly
* Added additional Chrome preferences 
* Gets the master reference file (column name keys) and attaches it to the yearly documents

In [None]:
def _rename_download_with_year(full_dir_path, file_prefix, var, year, max_wait=30):
    """
    Wait for a freshly downloaded file for `var` and add the year to its name.

    Looks for "{file_prefix}{var}" or "{var}" with a .csv or .dat extension
    and renames the first match to "{file_prefix}{var}_{year}{ext}".

    Returns:
    str or None: Path of the renamed file, or None if nothing appeared
        within `max_wait` seconds
    """
    candidates = [
        (f"{stem}{ext}", ext)
        for ext in ('.csv', '.dat')
        for stem in (f"{file_prefix}{var}", f"{var}")  # some files might not include the prefix
    ]

    start_time = time.time()
    while time.time() - start_time < max_wait:
        for old_pattern, ext in candidates:
            old_name = os.path.join(full_dir_path, old_pattern)
            if os.path.exists(old_name):
                new_name = os.path.join(full_dir_path, f"{file_prefix}{var}_{year}{ext}")
                os.rename(old_name, new_name)
                return new_name
        time.sleep(1)

    return None


def tapr_scraper2(years, variables, directory_path_name, level):
    """
    Scrape TAPR data for specified years, variables, and level of data.
    
    Parameters:
    years (list): List of years to scrape data for
    variables (list): List of variable codes to download
    directory_path_name (str): Base directory to save files
    level (str): Administrative level to scrape. Options:
        'C' for Campus
        'D' for District
        'R' for Region
        'S' for State

    Raises:
    ValueError: If level is not one of 'C', 'D', 'R', 'S'
    """
    # Validation for level parameter
    valid_levels = {
        'C': 'Campus',
        'D': 'District',
        'R': 'Region',
        'S': 'State'
    }
    
    if level not in valid_levels:
        raise ValueError(f"Invalid level. Must be one of: {', '.join(valid_levels.keys())}")
    
    # Create prefix for filenames based on level
    file_prefix = {
        'C': 'CAMP',
        'D': 'DIST',
        'R': 'REGN',
        'S': 'STATE'
    }[level]
    
    # Ensure the base directory exists
    os.makedirs(directory_path_name, exist_ok=True)
    
    for year in years:
        # Create full path for year directory
        dir_name = f"raw_data{year}"
        full_dir_path = os.path.join(directory_path_name, dir_name)
        os.makedirs(full_dir_path, exist_ok=True)
        
        # Configure Chrome options
        chrome_options = webdriver.ChromeOptions()
        absolute_download_path = os.path.abspath(full_dir_path)
        
        # Add additional Chrome preferences to prevent download prompts
        prefs = {
            "download.default_directory": absolute_download_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True,
            "profile.default_content_settings.popups": 0
        }
        chrome_options.add_experimental_option("prefs", prefs)
        
        # Add additional Chrome arguments
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(f"https://rptsvr1.tea.texas.gov/perfreport/tapr/{year}/download/DownloadData.html")
            
            # Select appropriate level
            level_select = driver.find_element(By.XPATH, f"//input[@type='radio' and @name='sumlev' and @value='{level}']")
            level_select.click()
            
            already_saved_or_not_available = 0
            print(f"Downloading {valid_levels[level]} Level Data for {year}")
            
            for var in variables:
                print(f"Checking for {file_prefix}{var} data...")
                # File patterns include level prefix and year
                file_patterns = [
                    f"{file_prefix}{var}_{year}.csv",
                    f"{file_prefix}{var}_{year}.dat",
                    f"{var}_{year}.dat",  # Some files might not include the prefix
                    f"{var}_{year}.csv"
                ]
                
                if any(os.path.isfile(os.path.join(full_dir_path, file)) for file in file_patterns):
                    print(f"{var}_{year} already exists")
                    already_saved_or_not_available += 1
                    continue
                
                try:
                    select_data = driver.find_element(By.XPATH, f"//input[@type='radio' and @name='setpick' and @value='{var}']")
                    select_data.click()
                    
                    # Small delay after clicking the radio button
                    time.sleep(1)
                    
                    download = driver.find_element(By.XPATH, "//input[@type='submit' and @value='Continue']")
                    download.click()
                except NoSuchElementException:
                    print(f"{var} not found for {year}")
                    already_saved_or_not_available += 1
                    continue
                
                print(f"Downloaded {var} for {year}")
                
                # Delay to ensure the download starts
                time.sleep(3)
                
                # Rename the downloaded file to include the year. The wait is
                # tracked per variable so an earlier variable's renamed file
                # cannot end this one's wait early.
                renamed = _rename_download_with_year(full_dir_path, file_prefix, var, year)
                if renamed is None:
                    print(f"Warning: downloaded file for {var} ({year}) was not found to rename")
            
            if already_saved_or_not_available != len(variables):
                # Wait until no partial (.crdownload) files remain in the directory
                max_wait = 120  # Maximum wait time in seconds
                start_time = time.time()
                while time.time() - start_time < max_wait:
                    time.sleep(5)
                    if not any(f.endswith('.crdownload') for f in os.listdir(full_dir_path)):
                        break
        finally:
            # Always release the browser, even if a lookup or click failed
            driver.quit()

/Users/biancaschutz/HERC_Sp25
Downloading Data for 2018
Checking for DISTPROF data...
PROF downloaded for 2018
Checking for DISTPERF1 data...
PERF1 not found for 2018
Checking for DISTGRAD data...
GRAD downloaded for 2018
Checking for DISTSTAAR1 data...
STAAR1 downloaded for 2018
Checking for DISTREF data...
REF downloaded for 2018
Checking for DISTPERF data...
PERF downloaded for 2018
All downloads for 2018 completed successfully.
Downloading Data for 2019
Checking for DISTPROF data...
PROF downloaded for 2019
Checking for DISTPERF1 data...
PERF1 not found for 2019
Checking for DISTGRAD data...
GRAD downloaded for 2019
Checking for DISTSTAAR1 data...
STAAR1 downloaded for 2019
Checking for DISTREF data...
REF downloaded for 2019
Checking for DISTPERF data...
PERF downloaded for 2019
All downloads for 2019 completed successfully.
Downloading Data for 2020
Checking for DISTPROF data...
PROF downloaded for 2020
Checking for DISTPERF1 data...
PERF1 not found for 2020
Checking for DISTGRAD

In [None]:
# Base directory for the downloads. In a raw string a single backslash is
# already literal, so the separators must not be doubled.
file_path = r"C:\Users\mmath\OneDrive\Desktop\Capstone\raw_data"

# Download campus-level PROF and GRAD data for 2022 into file_path.
tapr_scraper2(years = [2022],
              variables = ['PROF', 'GRAD'],
              directory_path_name = file_path,
              level = 'C')

Downloading Campus Level Data for 2022
Checking for CAMPPROF data...
Downloaded PROF for 2022
Checking for CAMPGRAD data...
GRAD_2022 already exists


## WIP CODE: 
Joining the reference files to all documents.
This code is not functional yet.

In [None]:
def get_column_references(year, directory_path_name, level='D'):
    """
    Downloads the master reference file and extracts column names for each dataset.
    
    Parameters:
    year (str): Year to get references for
    directory_path_name (str): Directory to save and process files
    level (str): Administrative level ('C' for Campus, 'D' for District, 
                 'R' for Region, 'S' for State)
    
    Returns:
    dict: Dictionary with sheet names as keys and lists of
          [field name, description] pairs as values

    Raises:
    ValueError: If level is not one of 'C', 'D', 'R', 'S'
    FileNotFoundError: If no matching reference file appears in the
        download directory within the wait period
    """
    # Validation for level parameter and mapping to full names
    valid_levels = {
        'C': 'campus',
        'D': 'district',
        'R': 'region',
        'S': 'state'
    }
    
    if level not in valid_levels:
        raise ValueError(f"Invalid level. Must be one of: {', '.join(valid_levels.keys())}")
    
    # Setup Chrome options
    chrome_options = webdriver.ChromeOptions()
    absolute_download_path = os.path.abspath(directory_path_name)
    
    prefs = {
        "download.default_directory": absolute_download_path,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        "profile.default_content_settings.popups": 0
    }
    chrome_options.add_experimental_option("prefs", prefs)
    chrome_options.add_argument("--headless")
    
    driver = webdriver.Chrome(options=chrome_options)
    
    try:
        # Navigate to the download page
        driver.get(f"https://rptsvr1.tea.texas.gov/perfreport/tapr/{year}/download/DownloadData.html")
        
        # First find and click the link whose text contains "Master Reference"
        main_ref_link = driver.find_element(By.XPATH, "//a[contains(text(), 'Master Reference')]")
        main_ref_link.click()
        
        # Wait for the level options to be available
        time.sleep(2)
        
        # Now click the specific level link (case-insensitive match on the link text)
        level_name = valid_levels[level]
        level_link = driver.find_element(By.XPATH, f"//a[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{level_name}')]")
        level_link.click()
        
        # Give the download time to start
        time.sleep(5)
        
        # Poll the download directory for an .xlsx file named after the level
        ref_file = None
        max_wait = 30
        start_time = time.time()
        
        while time.time() - start_time < max_wait:
            for file in os.listdir(absolute_download_path):
                if file.endswith('.xlsx') and level_name in file.lower():
                    ref_file = os.path.join(absolute_download_path, file)
                    break
            if ref_file:
                break
            time.sleep(1)
        
        if not ref_file:
            raise FileNotFoundError(f"Reference file for {level_name} not found after {max_wait} seconds")
        
        print(f"Found reference file: {os.path.basename(ref_file)}")
        
        # Read the Excel file and extract column names from each sheet
        column_refs = {}
        level_prefix = valid_levels[level]
        
        # The workbook must be closed before the file is removed below;
        # an open handle makes os.remove fail on Windows.
        with pd.ExcelFile(ref_file) as excel_file:
            for sheet_name in excel_file.sheet_names:
                # Only process sheets that match the selected level
                if not sheet_name.lower().startswith(level_prefix):
                    continue
                
                try:
                    df = pd.read_excel(excel_file, sheet_name=sheet_name)
                except Exception as e:
                    # Best effort: one unreadable sheet should not abort the rest
                    print(f"Warning: Could not process sheet {sheet_name}: {e}")
                    continue
                
                # Locate the field-name and description columns by header text;
                # headers are converted with str() in case they are not strings
                field_col = next((col for col in df.columns if 'field' in str(col).lower()), None)
                desc_col = next((col for col in df.columns if 'description' in str(col).lower()), None)
                
                if field_col is None or desc_col is None:
                    print(f"Warning: Could not process sheet {sheet_name}: field/description columns not found")
                    continue
                
                column_refs[sheet_name] = df[[field_col, desc_col]].values.tolist()
                print(f"Processed reference sheet: {sheet_name}")
        
        # Remove the reference file after processing
        os.remove(ref_file)
        
        return column_refs
        
    finally:
        driver.quit()