## Import Necessary Libraries and Functions
We use Selenium (in particular `webdriver`, `By`, and `NoSuchElementException`) together with the standard-library modules `time` and `os`.

In [5]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import os

In [None]:
def wait_for_downloads(variables, year, directory, timeout = 200, poll_interval = 5):
    """Wait until every expected TAPR file for ``year`` has finished downloading.

    The expected file name for each variable is ``DIST<var>`` (or ``DREF`` for
    the ``"REF"`` variable) with a ``.dat`` extension for years before 2021 and
    a ``.csv`` extension otherwise.

    Parameters
    ----------
    variables : iterable of str
        Dataset codes whose files are expected (ex. "PROF").
    year : int
        Year the files belong to; selects the expected extension.
    directory : str
        Directory that is polled for the downloaded files.
    timeout : float, optional
        Maximum number of seconds to wait. The directory is always checked at
        least once, even when ``timeout`` is 0.
    poll_interval : float, optional
        Seconds to sleep between two checks of ``directory``.

    Returns
    -------
    bool
        True once all expected files are present and no ``.crdownload`` file
        is left in ``directory``; False if that did not happen in time.
    """
    extension = "dat" if year < 2021 else "csv"
    expected_files = [
        f"DREF.{extension}" if var == "REF" else f"DIST{var}.{extension}"
        for var in variables
    ]

    # monotonic() is immune to wall-clock adjustments while we wait
    deadline = time.monotonic() + timeout

    while True:
        downloaded_files = set(os.listdir(directory))

        # Any ".crdownload" entry is treated as a download that is still being
        # written. NOTE: a stale partial file left over from an aborted run
        # will therefore hold the wait until the timeout expires.
        still_downloading = any(name.endswith(".crdownload") for name in downloaded_files)
        all_present = all(name in downloaded_files for name in expected_files)

        if all_present and not still_downloading:
            print(f"All downloads for {year} completed successfully.")
            print("")
            return True

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False

        print("Waiting for all files to download...")
        # never sleep past the deadline, so the last check happens on time
        time.sleep(min(poll_interval, remaining))

## Function for TAPR Scraping
This function takes two list inputs:
* years is a list of YYYY integers (ex. 2018)
* variables is a list of strings where each string is a dataset available on TAPR (ex. PROF)

The function saves the downloaded .csv or .dat files into a separate `raw_data<year>` directory for each year, created inside the current working directory.

In [71]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def tapr_scraper3(years, variables):
    """Download district-level TAPR data files for each requested year.

    For every year a ``raw_data<year>`` directory is created inside the current
    working directory, a Chrome session is opened with that directory as its
    download target, the district-level option is selected on the TAPR download
    page, and each requested dataset is submitted for download. Datasets whose
    file already exists in the year directory, or whose radio button is not on
    the page, are skipped. The browser is always closed before moving on to
    the next year.

    Parameters
    ----------
    years : list of int
        YYYY years to scrape (ex. 2018).
    variables : list of str
        Dataset codes available on TAPR (ex. "PROF").
    """
    base_dir = os.getcwd()  # files are saved below the current working directory
    print(base_dir)

    for year in years:
        dir_name = f"raw_data{year}"  # one directory of raw data per year
        download_dir = os.path.join(base_dir, dir_name)
        os.makedirs(download_dir, exist_ok=True)  # create the directory unless it already exists

        # make Chrome save into the year directory instead of the "Downloads" folder
        chromeOptions = webdriver.ChromeOptions()
        prefs = {"download.default_directory": download_dir}
        chromeOptions.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(options=chromeOptions)

        # try/finally guarantees the browser is closed for this year, whether the
        # downloads finish, time out, or an unexpected error is raised
        try:
            driver.get(f"https://rptsvr1.tea.texas.gov/perfreport/tapr/{year}/download/DownloadData.html")

            # select district level data
            district_select = driver.find_element(By.XPATH, "//input[@type='radio' and @name='sumlev' and @value='D']")
            district_select.click()

            skipped = []  # datasets we do not need to wait for
            print(f"Downloading Data for {year}")

            for var in variables:
                print(f"Checking for DIST{var} data...")
                file_patterns = [f"DIST{var}.csv", f"DIST{var}.dat", f"D{var}.dat", f"D{var}.csv"]
                if any(os.path.isfile(os.path.join(download_dir, file)) for file in file_patterns):
                    print(f"{var} already exists")
                    skipped.append(var)
                    continue
                try:
                    select_data = driver.find_element(By.XPATH, f"//input[@type='radio' and @name='setpick' and @value='{var}']")
                    select_data.click()  # pick the dataset
                    download = driver.find_element(By.XPATH, "//input[@type='submit' and @value='Continue']")
                    download.click()  # start the download
                    print(f"{var} downloaded for {year}")
                except NoSuchElementException:
                    # the dataset is not offered for this year, such as PERF1 not existing in earlier data
                    print(f"{var} not found for {year}")
                    skipped.append(var)

            available = set(variables) - set(skipped)
            if not wait_for_downloads(available, year, download_dir):
                print(f"Timed out waiting for downloads for {year}")
        finally:
            driver.quit()  # quit driver for that year


In [72]:
# Scrape every TAPR dataset for each year from 2018 through 2023
tapr_2018_2023 = [*range(2018, 2024)]  # the upper bound is exclusive, so 2023 is the last year

# dataset codes handed to the scraper, one download per code and year
data_acronyms = ['PROF', 'PERF1', 'GRAD', 'STAAR1', 'REF', 'PERF']

tapr_scraper3(tapr_2018_2023, data_acronyms)

/Users/biancaschutz/HERC_Sp25
Downloading Data for 2018
Checking for DISTPROF data...
PROF downloaded for 2018
Checking for DISTPERF1 data...
PERF1 not found for 2018
Checking for DISTGRAD data...
GRAD downloaded for 2018
Checking for DISTSTAAR1 data...
STAAR1 downloaded for 2018
Checking for DISTREF data...
REF downloaded for 2018
Checking for DISTPERF data...
PERF downloaded for 2018
All downloads for 2018 completed successfully.
Downloading Data for 2019
Checking for DISTPROF data...
PROF downloaded for 2019
Checking for DISTPERF1 data...
PERF1 not found for 2019
Checking for DISTGRAD data...
GRAD downloaded for 2019
Checking for DISTSTAAR1 data...
STAAR1 downloaded for 2019
Checking for DISTREF data...
REF downloaded for 2019
Checking for DISTPERF data...
PERF downloaded for 2019
All downloads for 2019 completed successfully.
Downloading Data for 2020
Checking for DISTPROF data...
PROF downloaded for 2020
Checking for DISTPERF1 data...
PERF1 not found for 2020
Checking for DISTGRAD