## Comprehensive District Data Scraper
This scraper lets users choose the level (campus, district, region, or state) and the datasets to download from the TAPR Advanced Data Download page on the TEA website. When the level is "D" (District), the district type data is downloaded alongside the TAPR data unless the user opts out by setting `dist_type = False`.

If the files already exist, the scraper will not download new files. 


The scraper creates a separate `raw_data<year>` folder for each year of data and appends the year to each downloaded file's name.

### Import Libraries

In [104]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

### District Type Scraper

In [105]:
def district_type_scraper(year):
    """
    Fetch the TEA district type workbook for one school year as a DataFrame.

    Parameters:
    year (int): Ending year of the school year (2023 requests the 2022-23 page)

    Returns:
    pandas.DataFrame or None: The sheet at position 2 of the first .xlsx file
        linked from the page, or None when the page links to no .xlsx file.
    """
    # Imported locally so this cell does not depend on the notebook's import cell
    from urllib.parse import urljoin

    # The page slug ends in the school year, e.g. "2022-23" for year 2023
    school_year = str(year-1)+"-"+str(year-2000)
    url = f'https://tea.texas.gov/reports-and-data/school-data/district-type-data-search/district-type-{school_year}'
    grab = requests.get(url)
    soup = BeautifulSoup(grab.text, 'html.parser')

    for link in soup.find_all("a"):
        href = link.get('href')
        # Anchors without an href give None. A literal suffix test is used
        # because a regex "." would match any character before "xlsx".
        if href and href.endswith(".xlsx"):
            # urljoin resolves site-relative links against the page URL and
            # leaves already-absolute links untouched
            return pd.read_excel(urljoin(url, href), sheet_name=2)

    # No workbook link on the page
    return None

In [106]:
def wait_for_downloads(variables, year, directory, timeout = 200, prefix = "DIST", level = "D", poll_interval = 5):
    """
    Poll a download folder until every expected TAPR file has arrived.

    Expected names are "{prefix}{var}" for ordinary datasets and "{level}REF"
    for the reference file, with a .dat extension for years before 2021 and
    .csv from 2021 on.

    Parameters:
    variables (iterable): Variable codes whose files should appear
    year (int): Data year; decides the expected file extension
    directory (str): Folder the browser downloads into
    timeout (float): Maximum number of seconds to keep polling
    prefix (str): Filename prefix for non-REF datasets (default "DIST")
    level (str): Level code that prefixes the REF file (default "D")
    poll_interval (float): Seconds to sleep between folder checks

    Returns:
    bool: True once every expected file is present with no matching
        .crdownload partial beside it, False if the timeout elapses first.
    """
    extension = "dat" if year < 2021 else "csv"
    expected_files = [
        f"{level}REF.{extension}" if var == "REF" else f"{prefix}{var}.{extension}"
        for var in variables
    ]

    # monotonic() is unaffected by system clock adjustments during the wait
    deadline = time.monotonic() + timeout
    announced = False

    while True:
        downloaded_files = set(os.listdir(directory))

        # A file only counts when its in-progress "<name>.crdownload" sibling is gone
        if all(
            file in downloaded_files and f"{file}.crdownload" not in downloaded_files
            for file in expected_files
        ):
            print(f"All downloads for {year} completed successfully.")
            print("")
            return True

        # Test the deadline after checking the folder, so the folder is always
        # inspected at least once and once more after the final sleep
        if time.monotonic() >= deadline:
            return False

        if not announced:
            print("Waiting for all files to download...")
            announced = True
        time.sleep(poll_interval)

In [107]:
def file_renamer(directory, year, prefix, var, level):
    """
    Append the data year to a freshly downloaded TAPR file's name.

    For each of the .csv and .dat extensions, the first existing file among
    "{prefix}{var}" and "{level}{var}" is renamed. REF files are renamed to
    "{level}{var}_{year}"; every other dataset becomes "{prefix}{var}_{year}".

    Parameters:
    directory (str): Folder holding the downloaded files
    year (int): Year to append to the filename
    prefix (str): Long level prefix such as "DIST"
    var (str): Variable code such as "GRAD"
    level (str): Single-letter level code such as "D"
    """
    # REF keeps the short level code; all other datasets use the long prefix
    stem = level if var == "REF" else prefix

    for ext in ('.csv', '.dat'):
        # Prefixed name is tried first, then the bare level-code name
        candidates = (
            os.path.join(directory, f"{prefix}{var}{ext}"),
            os.path.join(directory, f"{level}{var}{ext}"),
        )
        source = next((path for path in candidates if os.path.exists(path)), None)
        if source is None:
            continue

        destination = os.path.join(directory, f"{stem}{var}_{year}{ext}")
        os.rename(source, destination)

### Scraper

In [110]:
def tea_scraper(years, variables, level, dist_type = True):
    """
    Scrape all HERC data for specified years, variables, and level of data.

    Each year's files are saved to a raw_data{year} folder under the current
    working directory. Files that already exist there are not downloaded
    again, and the browser is only started for a year that still has files
    to fetch.

    Parameters:
    years (list): List of years to scrape data for (formatted YYYY)
    variables (list): List of variable codes to download (such as "GRAD")
    level (str): Administrative level to scrape. Options:
        'C' for Campus
        'D' for District
        'R' for Region
        'S' for State
    dist_type (bool): When True and level is 'D', also save the district type
        data for each year as district_type{year}.csv

    Raises:
    ValueError: If level is not one of the supported codes
    """
    directory_path_name = os.getcwd()
    # Validation for level parameter
    valid_levels = {
        'C': 'Campus',
        'D': 'District',
        'R': 'Region',
        'S': 'State'
    }

    if level not in valid_levels:
        raise ValueError(f"Invalid level. Must be one of: {', '.join(valid_levels.keys())}")

    # Create prefix for filenames based on level
    file_prefix = {
        'C': 'CAMP',
        'D': 'DIST',
        'R': 'REGN',
        'S': 'STATE'
    }[level]

    for year in years:
        ### TAPR DATA DOWNLOAD ###
        # Create full path for year directory
        dir_name = f"raw_data{year}"
        full_dir_path = os.path.join(directory_path_name, dir_name)
        os.makedirs(full_dir_path, exist_ok=True)

        skipped = []   # variables already on disk or not offered for this year
        driver = None  # started lazily, only if something must be downloaded
        print(f"Downloading {valid_levels[level]} Level TAPR Data for {year}...")

        try:
            for var in variables:
                print(f"Checking for {file_prefix}{var} data...")
                # Renamed files carry the year; some use the bare level code
                file_patterns = [
                    f"{file_prefix}{var}_{year}.csv",
                    f"{file_prefix}{var}_{year}.dat",
                    f"{level}{var}_{year}.dat",
                    f"{level}{var}_{year}.csv"
                ]

                if any(os.path.isfile(os.path.join(full_dir_path, file)) for file in file_patterns):
                    print(f"{var}_{year} already exists")
                    skipped.append(var)
                    continue

                if driver is None:
                    driver = _open_tapr_download_page(year, level, full_dir_path)

                try:
                    select_data = driver.find_element(By.XPATH, f"//input[@type='radio' and @name='setpick' and @value='{var}']")
                    select_data.click()

                    # Add a small delay after clicking the radio button
                    time.sleep(1)

                    download = driver.find_element(By.XPATH, "//input[@type='submit' and @value='Continue']")
                    download.click()
                    print(f"Downloaded {level if var == 'REF' else file_prefix}{var} for {year}")

                except NoSuchElementException:
                    print(f"{var} not found for {year}")
                    skipped.append(var)
                    continue

            available_vars = set(variables) - set(skipped)
            # do not shut down driver until time-out occurs or all available files have finished downloading
            # NOTE(review): wait_for_downloads receives no level information
            # here; confirm it expects the right filenames for non-district levels.
            if wait_for_downloads(variables = available_vars, year = year, directory = full_dir_path):
                for a_var in available_vars:
                    file_renamer(directory = full_dir_path, year = year, prefix = file_prefix, var = a_var, level = level)
            else:
                print(f"Timed out waiting for {year} downloads; files were not renamed.")
                print("")
        finally:
            # Always release the browser, even when a step above raised
            if driver is not None:
                driver.quit()

        ### DISTRICT TYPE DATA DOWNLOAD ###
        if level == "D" and dist_type:
            print(f"Downloading District Type Data for {year}...")

            # Same absolute path is used for the existence check and the save
            district_type_path = os.path.join(full_dir_path, f"district_type{year}.csv")

            if os.path.isfile(district_type_path):
                print(f"District Type Data for {year} already exists") # don't download if it already exists
                print("")
                continue

            df = district_type_scraper(year) # get the dataframe with the sheet data
            if df is None:
                # The scraper found no workbook link; skip instead of crashing
                print(f"District Type Data not found for {year}")
                print("")
                continue

            df.to_csv(district_type_path) # save it to the raw_data{year} folder

            print(f"Downloaded District Type Data for {year}")
            print("")
    print("All Data Downloaded!")


def _open_tapr_download_page(year, level, download_dir):
    """Start headless Chrome saving into download_dir, open the year's TAPR download page, and select the level."""
    chrome_options = webdriver.ChromeOptions()

    # Chrome preferences that route downloads to the folder without prompting
    prefs = {
        "download.default_directory": os.path.abspath(download_dir),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        "profile.default_content_settings.popups": 0
    }
    chrome_options.add_experimental_option("prefs", prefs)

    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(f"https://rptsvr1.tea.texas.gov/perfreport/tapr/{year}/download/DownloadData.html")

        # Select appropriate level
        level_select = driver.find_element(By.XPATH, f"//input[@type='radio' and @name='sumlev' and @value='{level}']")
        level_select.click()
    except Exception:
        # Do not leak the browser if the page or the level control is missing
        driver.quit()
        raise
    return driver
        

In [109]:
# Download every TAPR dataset at the district level for all currently available years
tapr_2018_2023 = [*range(2018, 2024)]  # 2018 through 2023 inclusive

# Codes for all the measures located on the TAPR website
data_acronyms = [
    'PROF',
    'PERF1',
    'GRAD',
    'STAAR1',
    'REF',
    'PERF',
]

tea_scraper(tapr_2018_2023, data_acronyms, "D")

Downloading District Level TAPR Data for 2018...
Checking for DISTPROF data...
PROF_2018 already exists
Checking for DISTPERF1 data...
PERF1 not found for 2018
Checking for DISTGRAD data...
GRAD_2018 already exists
Checking for DISTSTAAR1 data...
STAAR1_2018 already exists
Checking for DISTREF data...
REF_2018 already exists
Checking for DISTPERF data...
PERF_2018 already exists
All downloads for 2018 completed successfully.

Downloading District Type Data for 2018...
District Type Data for 2018 already exists

Downloading District Level TAPR Data for 2019...
Checking for DISTPROF data...
PROF_2019 already exists
Checking for DISTPERF1 data...
PERF1 not found for 2019
Checking for DISTGRAD data...
GRAD_2019 already exists
Checking for DISTSTAAR1 data...
STAAR1_2019 already exists
Checking for DISTREF data...
REF_2019 already exists
Checking for DISTPERF data...
PERF_2019 already exists
All downloads for 2019 completed successfully.

Downloading District Type Data for 2019...
District T