Created by [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [RIDE](https://www.eride.ri.gov/eride2K5/AggregateAttendance/AttendanceReports.aspx).

## Collecting and Cleaning School Attendance Data

Using Selenium to collect school attendance data from the RIDE web app, then cleaning it with pandas.
<hr>

## Data Collection

In [1]:
import time
from datetime import date
from selenium import webdriver

In [2]:
# Launch Chrome with a fixed download directory and walk the attendance
# calendar backwards one month at a time, downloading every spreadsheet
# export offered on each month's page.
options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : 'D:/Downloads/RI Attendance'}
options.add_experimental_option('prefs', prefs)

# NOTE(review): the driver path is an empty string here; presumably the
# chromedriver location is meant to go in its place -- confirm.
browser = webdriver.Chrome('', options=options)
browser.get('https://www.eride.ri.gov/eride2K5/AggregateAttendance/AttendanceReports.aspx')

# get length in months between start & end
# (start_date must be defined: it is read in the len_month formula below)
start_date = date(2008, 9, 3)
end_date = date.today()

# inclusive month count, e.g. Sep -> Nov of the same year gives 3
len_month = abs((start_date.year - end_date.year) * 12 + start_date.month - end_date.month) + 1

# CSS selector of the calendar link that navigates to the previous month
prev_btn = '#calDateSelection > tbody > tr:nth-child(1) > td > table > tbody > tr > td:nth-child(1) > a'

# one pass per month: download everything shown, then step back a month
for _ in range(len_month):

    # locate the previous-month link on the currently displayed page
    prev_month = browser.find_element_by_css_selector(prev_btn)

    # loop for each file to download
    for ele in browser.find_elements_by_xpath("//input[contains(@id,'_imgBtnXls')]"):

        # click button to download file
        ele.click()

        # wait 2 seconds
        time.sleep(2)

    # after the downloads, move to the previous month and wait 5 seconds
    prev_month.click()
    time.sleep(5)

## Data Cleaning

In [3]:
import pandas as pd
import numpy as np
import functions
import glob
import csv

In [4]:
# helper from the local `functions` module; applied to each attendance date
# in the cleaning step to produce the school_year column
bin_date = functions.bin_date

# creating region dict: each CSV row supplies one key/value pair
# (dict() requires every row to have exactly two fields).
# The file is opened in a `with` block so the handle is closed once the
# mapping is built; newline="" is what the csv module asks for on open().
with open("./files/region_map.csv", newline="") as region_file:
    region_map = dict(csv.reader(region_file))

In [5]:
# every per-date csv that was downloaded into the raw data folder
files = glob.glob('./data/raw/*.csv')

# load each file and stack them into a single dataframe
df = pd.concat([pd.read_csv(path) for path in files])

# "NR" (not reported) entries become nulls
df = df.replace('NR', np.nan)

# numeric columns are cast to float
for column in ('Enrollment', 'Absent', 'PercentageAbsent'):
    df[column] = df[column].astype(float)

# parse the attendance date strings into datetimes
df['AttendanceDate'] = pd.to_datetime(df['AttendanceDate'])

# derive the school year of each row from its attendance date
df['school_year'] = df['AttendanceDate'].apply(bin_date)

# divide the absence percentage by 100 to store it as a fraction
df['PercentageAbsent'] = df['PercentageAbsent'] / 100

# look up the region of each row from its LEA name
df['region'] = df['LEA_NAME'].map(region_map)

# write the cleaned data out without the index column
df.to_csv('./data/clean/attendance_clean.csv', index=False)