## Import Necessary Libraries and Functions
We use Selenium (specifically `webdriver`, `By`, and `NoSuchElementException`) together with the standard-library modules `time` and `os`.

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import os

## Function for TAPR Scraping
This function takes two list inputs and a string:
* years is a list of YYYY integers (ex. 2018)
* variables is a list of strings where each string is a dataset available on TAPR (ex. PROF)
* directory_path_name is a string giving the path of the directory where you would like to save the data

The function saves every downloaded .csv or .dat file under `directory_path_name`, in a separate `raw_data<year>` subdirectory for each year.

In [None]:
def tapr_scraper(years, variables, directory_path_name, download_wait_seconds=60):
    """Download district-level TAPR datasets for each requested year.

    For every year a fresh Chrome session is opened on that year's TAPR
    download page, the district summary level is selected, and each requested
    dataset that is not already on disk is downloaded into
    ``<directory_path_name>/raw_data<year>``.

    Args:
        years: Iterable of YYYY integers (e.g. ``2018``).
        variables: Iterable of dataset acronyms as they appear in the
            ``setpick`` radio buttons on the download page (e.g. ``"PROF"``).
        directory_path_name: Directory under which the per-year
            ``raw_data<year>`` folders are created.
        download_wait_seconds: Seconds to wait before closing the browser
            when at least one download was started, so that in-flight
            downloads can finish. Defaults to 60.

    Returns:
        None. Files are written to disk and progress is printed.
    """
    for year in years:
        # One directory per year. It is created at the same location that is
        # handed to Chrome and checked for existing files; normalising to an
        # absolute path keeps those three uses in agreement.
        dir_name = f"raw_data{year}"
        download_dir = os.path.abspath(os.path.join(directory_path_name, dir_name))
        os.makedirs(download_dir, exist_ok=True)

        # Redirect Chrome's downloads from the default "Downloads" folder to
        # this year's directory.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option(
            "prefs", {"download.default_directory": download_dir}
        )
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(
                f"https://rptsvr1.tea.texas.gov/perfreport/tapr/{year}/download/DownloadData.html"
            )
            # Select district-level data.
            driver.find_element(
                By.XPATH, "//input[@type='radio' and @name='sumlev' and @value='D']"
            ).click()

            download_started = False
            print(f"Downloading Data for {year}")
            for var in variables:
                print(f"Checking for DIST{var} data...")
                # File names under which this dataset may already be saved.
                file_patterns = [
                    f"DIST{var}.csv",
                    f"DIST{var}.dat",
                    f"D{var}.dat",
                    f"D{var}.csv",
                ]
                if any(
                    os.path.isfile(os.path.join(download_dir, file))
                    for file in file_patterns
                ):
                    print(f"{var} already exists")
                    continue
                try:
                    driver.find_element(
                        By.XPATH,
                        f"//input[@type='radio' and @name='setpick' and @value='{var}']",
                    ).click()
                    driver.find_element(
                        By.XPATH, "//input[@type='submit' and @value='Continue']"
                    ).click()
                except NoSuchElementException:
                    # Some datasets are missing for some years, such as PERF1
                    # in earlier data.
                    print(f"{var} not found for {year}")
                else:
                    download_started = True
                    print(f"Downloaded {var} for {year}")

            if download_started:
                # Give pending downloads time to finish before the browser
                # is closed.
                time.sleep(download_wait_seconds)
        finally:
            # Always release the browser, even if navigation or a click fails.
            driver.quit()


In [None]:
# Scrape every TAPR dataset for each year that is currently published.
tapr_2018_2023 = [*range(2018, 2024)]  # 2018 through 2023 inclusive

# Acronyms of the measures offered on the TAPR website.
data_acronyms = [
    'PROF',
    'PERF1',
    'GRAD',
    'STAAR1',
    'REF',
    'PERF',
]

tapr_scraper(
    years=tapr_2018_2023,
    variables=data_acronyms,
    directory_path_name="/Users/biancaschutz/HERC",
)