Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [RIDE](http://www.eride.ri.gov/reports/reports.asp).
<hr>

# Rhode Island School Enrollment Metrics

This notebook uses Selenium to automate collecting all of the enrollment, dropout and graduation tables at the school, district and state level. The district- and state-level data is needed to look at district-wide and state-wide historical changes, because closed schools are removed completely from the school-level data, even for years in which the school was open.

Example: Warwick Vets High School does not appear in the data despite being open from 1956 to 2016.
<hr>

In [1]:
import random
import time
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

In [2]:
# Maps each report name, as shown in the site's report dropdown, to the
# folder name save_file() uses for that report under ./data/raw/<level>/.
report_type = {
    'Dropouts': 'dropout',
    'October Enrollment': 'enrollment',
    'Graduates - Diploma': 'graduation'
}

In [3]:
def save_file(report, level, df, year):
    """Write one scraped table to ``./data/raw/<level>/<report folder>/<year>.csv``.

    Args:
        report: Report name as shown in the dropdown; must be a key of
            ``report_type``, which supplies the folder name.
        level: Sub-folder under ``./data/raw/`` (the callers pass
            ``'school_level'``, ``'district_level'`` or ``'state_level'``).
        df: DataFrame holding the rows to store.
        year: Year label, used as the CSV file name.

    The first write for a given file includes the header row; if the file
    already exists the rows are appended without a header, so several
    tables can accumulate in a single per-year CSV.

    Raises:
        KeyError: If ``report`` is not a key of ``report_type``.
    """
    folder = report_type[report]

    # make sure the destination folder exists
    out_dir = f'./data/raw/{level}/{folder}/'
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    target = f'./data/raw/{level}/{folder}/{year}.csv'

    # existing file: append the rows only, the header is already there
    if Path(target).is_file():
        df.to_csv(target, mode='a', header=False, index=False)
        return

    # new file: write header and rows
    df.to_csv(target, index=False)

In [4]:
# Open the RIDE reports page in a Chrome session driven by the local
# chromedriver binary; every later cell works on this `browser` object.
# NOTE(review): the driver path is machine-specific, and passing it
# positionally is presumably tied to the Selenium version in use -- confirm
# before running elsewhere.
url = 'http://www.eride.ri.gov/reports/reports.asp'
browser = webdriver.Chrome('C:/Users/Alex/bin/chromedriver.exe')
browser.get(url)

In [5]:
# Get all districts from the first dropdown box.
# The generic find_element(By...) API is used because the find_element_by_*
# helpers are deprecated in Selenium 4 and removed from 4.3 onwards; the
# generic form also works on Selenium 3.
district_box = browser.find_element(By.NAME, 'selDistrict')
options = district_box.find_elements(By.TAG_NAME, 'option')

# The first entry is the "whole state" choice, not a district; drop it.
districts = [option.text for option in options][1:]

In [6]:
# List of year options offered by the report form.
# find_element(By...) replaces the find_element_by_* helpers, which are
# deprecated in Selenium 4 and removed from 4.3 onwards.
year_box = browser.find_element(By.NAME, 'selYear')
options = year_box.find_elements(By.TAG_NAME, 'option')

# drop the leading "Select a school year" placeholder
years = [option.text for option in options][1:]

# Years already scraped: save_file() writes one <year>.csv per year, so the
# file names in the school-level enrollment folder (minus the 4-character
# ".csv" extension) are the years that can be skipped.
current = [
    csv_file.name[:-4]
    for csv_file in Path('./data/raw/school_level/enrollment/').glob('*.csv')
]

# keep only the years not previously scraped
years = [x for x in years if x not in current]

# Wait time (seconds) between grabbing reports.  It is drawn once here, so
# every pause within a run has the same length.
wait_time = round(random.uniform(2.0, 5.0), 1)

In [7]:
# list of wanted reports; each name is selected by its visible text in the
# report dropdown and is also a key of report_type
reports = [
    'Dropouts', 
    'October Enrollment',
    'Graduates - Diploma'
]

# column headers assigned to every scraped table once its first three rows
# have been dropped (21 names, so each table must have exactly 21 columns)
# NOTE(review): the _m / _f suffixes are presumably male / female, and
# iep / frl / lep presumably IEP, free/reduced lunch and limited English
# proficiency -- confirm against the RIDE report layout
headers = [
    'year',
    'grade',
    'native_american_m',
    'native_american_f',
    'asian_pacific_m',
    'asian_pacific_f',
    'black_m',
    'black_f',
    'white_m',
    'white_f',
    'hispanic_m',
    'hispanic_f',
    'multi_race_m',
    'multi_race_f',
    'iep_m',
    'iep_f',
    'frl_m',
    'frl_f',
    'lep_m',
    'lep_f',
    'total'
]

In [8]:
## Scraping school-level data
#
# For every district -> school -> report -> year combination: request the
# report, parse the result table and store it through save_file() under
# 'school_level'.
#
# Elements are looked up again before every interaction rather than cached.
# Locators use the generic find_element(By...) API because the
# find_element_by_* helpers are deprecated in Selenium 4 and removed from
# 4.3 onwards.

# loop through all districts
for district in districts:
    # select district
    select = Select(browser.find_element(By.NAME, 'selDistrict'))
    select.select_by_visible_text(district)

    # get all schools within the selected district, dropping the leading
    # "all schools" entry
    school_box = browser.find_element(By.NAME, 'selSchool')
    options = school_box.find_elements(By.TAG_NAME, 'option')
    schools = [option.text for option in options][1:]

    # loop through all schools in district
    for school in schools:
        select = Select(browser.find_element(By.NAME, 'selSchool'))
        select.select_by_visible_text(school)

        # loop through the reports
        for report in reports:
            select = Select(browser.find_element(By.NAME, 'selReport'))
            select.select_by_visible_text(report)

            # loop through each year of reports
            for year in years:
                select = Select(browser.find_element(By.NAME, 'selYear'))
                select.select_by_visible_text(year)

                # click create report and give the page a moment to respond
                browser.find_element(By.NAME, 'B1').click()
                time.sleep(1)

                # grab the table and check if it is empty
                table = browser.find_element(By.XPATH, '/html/body/table[2]')

                if table.text.endswith('Report is not available'):
                    time.sleep(wait_time)
                    continue

                # Parse the table markup.  It is wrapped in StringIO because
                # passing a literal HTML string to read_html is deprecated
                # since pandas 2.1.
                table_html = table.get_attribute('innerHTML')
                dfs = pd.read_html(StringIO(table_html))
                df = pd.concat(dfs)

                # Fixing column headers: drop the first three rows and apply
                # our own names.  .copy() detaches the slice so the column
                # assignments below do not raise SettingWithCopyWarning.
                df = df.iloc[3:].copy()
                df.columns = headers

                # adding school and district
                df['district'] = district
                df['school'] = school

                # save file and wait
                save_file(report, 'school_level', df, year)
                time.sleep(wait_time)

In [9]:
## scraping district-level data
#
# For every district -> report -> year combination: request the report,
# parse the result table and store it through save_file() under
# 'district_level'.
#
# Locators use the generic find_element(By...) API because the
# find_element_by_* helpers are deprecated in Selenium 4 and removed from
# 4.3 onwards.

# select district
for district in districts:
    select = Select(browser.find_element(By.NAME, 'selDistrict'))
    select.select_by_visible_text(district)

    # select reports
    for report in reports:
        select = Select(browser.find_element(By.NAME, 'selReport'))
        select.select_by_visible_text(report)

        # select years
        for year in years:
            select = Select(browser.find_element(By.NAME, 'selYear'))
            select.select_by_visible_text(year)

            # click create report and give the page a moment to respond
            browser.find_element(By.NAME, 'B1').click()
            time.sleep(1)

            # grab the table and check if it is empty
            table = browser.find_element(By.XPATH, '/html/body/table[2]')

            if table.text.endswith('Report is not available'):
                time.sleep(wait_time)
                continue

            # Parse the table markup.  It is wrapped in StringIO because
            # passing a literal HTML string to read_html is deprecated since
            # pandas 2.1.
            table_html = table.get_attribute('innerHTML')
            dfs = pd.read_html(StringIO(table_html))
            df = pd.concat(dfs)

            # Fixing column headers: drop the first three rows and apply our
            # own names.  .copy() detaches the slice so the column
            # assignment below does not raise SettingWithCopyWarning.
            df = df.iloc[3:].copy()
            df.columns = headers

            # adding district
            df['district'] = district

            # save file and wait
            save_file(report, 'district_level', df, year)
            time.sleep(wait_time)

In [10]:
## scraping whole-state level data
#
# For every report -> year combination with "Whole state" selected: request
# the report, parse the result table and store it through save_file() under
# 'state_level'.
#
# Locators use the generic find_element(By...) API because the
# find_element_by_* helpers are deprecated in Selenium 4 and removed from
# 4.3 onwards.

# select whole state
select = Select(browser.find_element(By.NAME, 'selDistrict'))
select.select_by_visible_text("Whole state")

# select reports
for report in reports:
    select = Select(browser.find_element(By.NAME, 'selReport'))
    select.select_by_visible_text(report)

    # select years
    for year in years:
        select = Select(browser.find_element(By.NAME, 'selYear'))
        select.select_by_visible_text(year)

        # click create report and give the page a moment to respond
        browser.find_element(By.NAME, 'B1').click()
        time.sleep(1)

        # grab the table and check if it is empty
        table = browser.find_element(By.XPATH, '/html/body/table[2]')

        if table.text.endswith('Report is not available'):
            time.sleep(wait_time)
            continue

        # Parse the table markup.  It is wrapped in StringIO because passing
        # a literal HTML string to read_html is deprecated since pandas 2.1.
        table_html = table.get_attribute('innerHTML')
        dfs = pd.read_html(StringIO(table_html))
        df = pd.concat(dfs)

        # fixing column headers: drop the first three rows and apply our
        # own names
        df = df.iloc[3:]
        df.columns = headers

        # save file and wait
        save_file(report, 'state_level', df, year)
        time.sleep(wait_time)