Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [RI DOH](http://www.health.ri.gov/data/schools/index.php).
<hr>

# Rhode Island Schools Immunization Rates

The RI Dept. of Health does a decent job of making its data public and accessible, though some determination and navigation of its website is needed. Some of its datasets are stored in Google Sheets and linked on their corresponding pages, notably the [Fatal Drug Overdoses](https://docs.google.com/spreadsheets/d/1Q7HjIxi4VuWeAgPwhLZx15G1j54WgicRcHdu4Bs4S74/edit#gid=942637027), [Schools Lead Drinking Water](https://docs.google.com/spreadsheets/d/1SyUoMZwh9ZlZpulCRUzSAw8Boq9PuQR7BiGASFMChy4/edit#gid=736080368) and [Schools Oral Health Care](https://docs.google.com/spreadsheets/d/1alrBYEp-GvpC2QemOu_4DcbEWMvdu_inDilNNIuI4oA/edit#gid=366050811) sheets. However, the [Schools Immunization Rates](https://docs.google.com/spreadsheets/d/1wcvZM_0_Cx8iSAn9HfCRu5OD50Ct7H0JJriAbBjE8EA/edit?usp=sharing) sheet only has a summary view aggregated at the city/town level.

This notebook uses selenium and pandas to scrape the tables from the RI Dept. of Health website, then cleans and exports them.
<hr>

In [1]:
import os
import time
from io import StringIO

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

In [2]:
# Short names for the lower-cased 'School Type' labels; labels without an
# entry here are left unchanged by clean_data.
rename_dict = {
    'independent private': 'private',
    'catholic school': 'catholic',
    'charter school': 'charter',
    'public school': 'public',
}

# Identifier columns that melt_cols keeps fixed while unpivoting the rest.
keep_cols = [
    'School Type',
    'city/town',
    'year',
]

def get_table(school, year, idx):
    """Read one 'sortable' table from the page currently open in ``browser``.

    Args:
        school: Value written to the 'city/town' column of the result.
        year: Value written to the 'year' column of the result.
        idx: Position of the wanted table among the page elements that
            carry the CSS class 'sortable'.

    Returns:
        A DataFrame parsed from the table's HTML, with 'city/town' and
        'year' columns added.

    Raises:
        IndexError: If the page has fewer than ``idx + 1`` sortable tables.
    """
    # find_elements(By..., ...) replaces the find_elements_by_* helpers,
    # which newer Selenium releases no longer provide.
    tables = browser.find_elements(By.CLASS_NAME, 'sortable')
    table_html = tables[idx].get_attribute('outerHTML')

    # read_html expects a file-like object; passing the raw HTML string
    # directly is deprecated in newer pandas releases.
    summary_rates = pd.read_html(StringIO(table_html))[0]
    summary_rates['city/town'] = school
    summary_rates['year'] = year

    return summary_rates

def clean_data(df):
    """Remove undisclosed rows and statewide aggregates from a scraped table.

    Cells equal to 'ND' are converted to NaN and every row containing a NaN
    is dropped. When a 'School Type' column is present, the
    'Statewide (All Schools)' rows are removed and the remaining labels are
    lower-cased and shortened through ``rename_dict`` (labels without an
    entry keep their lower-cased text). The
    '% Students Fully Immunized Statewide*' column is dropped when present.

    Args:
        df: DataFrame built from the scraped tables.

    Returns:
        A new, cleaned DataFrame; ``df`` itself is not modified.
    """
    df = df.replace('ND', np.nan).dropna()

    if 'School Type' in df.columns:
        # Take an explicit copy of the filtered rows so the column assignment
        # below writes to an independent frame instead of a slice (this is
        # what triggered pandas' SettingWithCopyWarning).
        df = df[df['School Type'] != 'Statewide (All Schools)'].copy()
        school_type = df['School Type'].str.lower()
        df['School Type'] = school_type.map(rename_dict).fillna(school_type)

    if '% Students Fully Immunized Statewide*' in df.columns:
        df = df.drop(columns='% Students Fully Immunized Statewide*')

    return df

def melt_cols(df):
    """Unpivot a wide table into long form.

    The columns named in ``keep_cols`` are kept as identifiers; all other
    columns are melted, and the resulting 'variable' column is renamed to
    'vaccine'.
    """
    return df.melt(id_vars=keep_cols).rename(columns={'variable': 'vaccine'})

In [3]:
# load browser and request webpage
# Starts a Chrome session through the chromedriver binary at this hard-coded
# local Windows path; adjust the path when running on another machine.
# NOTE(review): passing the driver path positionally looks like the Selenium 3
# call style -- confirm it matches the installed Selenium version.
browser = webdriver.Chrome('C:/Users/Alex/bin/chromedriver.exe')
# Page with the school / school-year form that the later cells fill in.
url = 'http://www.health.ri.gov/data/schools/immunization/index.php'
browser.get(url)

In [4]:
# get list of school districts
# find_elements(By..., ...) is used instead of the find_elements_by_* helpers,
# which newer Selenium releases no longer provide.
school_select = '/html/body/div[4]/div[1]/div[2]/div[1]/div[2]/div/form/fieldset/select[1]'
schools = browser.find_elements(By.XPATH, school_select)
# the text of the <select> element holds one option per line
schools = [x.text.split('\n') for x in schools]
# skip the first line of the dropdown text
# (presumably a placeholder entry rather than a district -- TODO confirm)
schools = schools[0][1:]

# get list of years
year_select = browser.find_elements(By.ID, 'schoolyear')
years = [x.text.split('\n') for x in year_select]
# strip surrounding whitespace so the labels can be re-selected by visible text
years = [x.strip() for x in years[0]]

In [5]:
# one accumulator list per table found on a results page
immune_k = []
immune_7 = []
immune_8 = []
immune_9 = []
immune_12 = []
school_table = []
summary_table = []

# accumulators ordered by the index of their 'sortable' table on the page:
# 0 summary, 1 grade K, 2 grade 7, 3 grade 8, 4 grade 9, 5 grade 12,
# 6 school-level
page_tables = [
    summary_table,
    immune_k,
    immune_7,
    immune_8,
    immune_9,
    immune_12,
    school_table,
]

# text looked for in the page source to detect a district/year without data
error = 'No Immunization information found.'

# the error log is appended to ./data/errors.txt, so make sure the folder
# exists before scraping starts instead of failing part-way through
os.makedirs('./data', exist_ok=True)

for school in schools:
    # select school
    # (find_element(By..., ...) replaces the find_element_by_* helpers, which
    # newer Selenium releases no longer provide)
    select = Select(browser.find_element(By.XPATH, school_select))
    select.select_by_visible_text(school)

    for year in years:
        # select year
        select = Select(browser.find_element(By.ID, 'schoolyear'))
        select.select_by_visible_text(year)

        # request page
        browser.find_element(By.ID, 'submit').click()

        # check if no info and log errors
        if error in browser.page_source:
            with open('./data/errors.txt', 'a') as f:
                f.write(f'{school}, {year}\n')
            browser.back()
            continue

        # read every table on the results page into its accumulator
        for idx, rows in enumerate(page_tables):
            rows.append(get_table(school, year, idx))

        # return to prior page
        browser.back()

    # pause for a bit
    time.sleep(6)
        
# cleans tables removing ND (Non disclosure) and statewide average
table_k = clean_data(pd.concat(immune_k, sort=False))
table_7 = clean_data(pd.concat(immune_7, sort=False))
table_8 = clean_data(pd.concat(immune_8, sort=False))
table_9 = clean_data(pd.concat(immune_9, sort=False))
table_12 = clean_data(pd.concat(immune_12, sort=False))
table_schools = clean_data(pd.concat(school_table, sort=False))
table_summary = clean_data(pd.concat(summary_table, sort=False))

# pivot vaccine columns to long form and tag each table with its grade
table_k = melt_cols(table_k).assign(Grade='K')
table_7 = melt_cols(table_7).assign(Grade='7')
table_8 = melt_cols(table_8).assign(Grade='8')
table_9 = melt_cols(table_9).assign(Grade='9')
table_12 = melt_cols(table_12).assign(Grade='12')

# merge vaccine - grade tables
frames = [table_k, table_7, table_8, table_9, table_12]
vaccine = pd.concat(frames)

# get population table columns
cols = ['Grade', 'School Type', 'city/town', 'year', 'Total Students at Grade Level Assessed']
pop = table_schools[cols].copy()

# convert to int and get sum by district, grade, type
pop['Total Students at Grade Level Assessed'] = pop['Total Students at Grade Level Assessed'].astype(int)
pop = pop.groupby(cols[:-1]).sum().reset_index()

# merge datasets
# NOTE(review): the join relies on 'Grade' holding the same string labels in
# both frames ('K', '7', ...) -- confirm the scraped school table matches.
vaccine = vaccine.merge(pop, how='left', on=cols[:-1])

# get (best guess) rounded number of students with specific vaccine
vaccine['value'] = vaccine['value'].str.rstrip('%').astype('float') / 100
vaccine['student_with_vaccine'] = (
    vaccine['Total Students at Grade Level Assessed'] * vaccine['value']
).round()

# remove nulls (columns with vaccine data but due to small number policy no student count)
vaccine = vaccine.dropna()

# save files in data folder
# to_csv does not create missing folders, so create the target first rather
# than failing here after the whole scrape has finished
os.makedirs('./data/clean', exist_ok=True)
vaccine.to_csv('./data/clean/vaccine_immunization.csv', index=False)
table_schools.to_csv('./data/clean/clean_schools_immunization.csv', index=False)
table_summary.to_csv('./data/clean/summary_immunization_clean.csv', index=False)