Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [RIDE](http://www.eride.ri.gov/reports/reports.asp).
<hr>

# Rhode Island School Enrollment Metrics

This notebook uses pandas to clean the data collected from the scraper. The cleaning step removes the total rows, decodes the gender variables, adds the year and maps each district to its region. The "district region" is mapped using the same coding as the UCOA data. Some data is suppressed due to RIDE's small-number policy; however, this has only been the case as of 2019.
<hr>

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import csv

In [2]:
# creating region dict and mapping region
# Each CSV row must hold exactly two fields; the first becomes the key and the
# second the value of the lookup that clean_data applies to the 'district'
# column. The file is opened in a `with` block so the handle is closed as soon
# as the dict is built, and with newline="" as the csv module requires.
with open("./data/files/region_map.csv", newline="") as region_file:
    region_map = dict(csv.reader(region_file))

In [3]:
def clean_data(id_vars, df, regions=None):
    """Reshape one raw enrollment table into a long, labelled format.

    Rows whose ``grade`` is ``'TO'`` are dropped, every column not listed in
    ``id_vars`` is unpivoted into ``variable``/``value`` pairs, a ``gender``
    column is derived from the last letter of each variable name, nulls are
    replaced with ``'n/a'`` and a ``region`` column is looked up from
    ``district``.

    Parameters
    ----------
    id_vars : list of str
        Identifier columns preserved by the melt. ``district`` has to be one
        of them because it is read after the melt for the region lookup.
    df : pandas.DataFrame
        Raw table; must contain a ``grade`` column.
    regions : mapping, optional
        Lookup applied to the ``district`` column. When omitted, the
        module-level ``region_map`` is used.

    Returns
    -------
    pandas.DataFrame
        The melted frame with the extra ``gender`` and ``region`` columns.
    """
    if regions is None:
        regions = region_map

    # removing Total rows
    df = df[df['grade'] != 'TO']

    # unpivoting
    df = df.melt(id_vars)

    # creating gender column
    # Both masks are computed from the untouched variable names. Evaluating
    # the 'f' test after the 'm' suffix has already been stripped would
    # truncate a second time any male column whose base name ends in 'f'.
    is_male = df['variable'].str.endswith('m')
    is_female = df['variable'].str.endswith('f')
    df.loc[is_male, 'gender'] = 'male'
    df.loc[is_female, 'gender'] = 'female'

    # drop the trailing two characters (gender suffix) exactly once per row
    # NOTE(review): presumably the suffixes are '_m' / '_f' - confirm against
    # the raw column names.
    has_suffix = is_male | is_female
    df.loc[has_suffix, 'variable'] = df.loc[has_suffix, 'variable'].str[:-2]

    # fill nulls
    df = df.fillna('n/a')

    # map region (done after the fill, so unmapped districts stay null)
    df['region'] = df['district'].map(regions)

    return df

def merge_files(files, id_vars):
    """Load, clean and stack a collection of raw CSV files.

    Each path in ``files`` is read with ``pd.read_csv`` and passed through
    ``clean_data`` together with ``id_vars``. The cleaned tables are then
    concatenated and the last three characters of every ``year`` value are
    removed.

    Returns the combined DataFrame.
    """
    cleaned = [clean_data(id_vars, pd.read_csv(path)) for path in files]
    combined = pd.concat(cleaned)

    # fix years: keep everything except the trailing three characters
    combined['year'] = combined['year'].str.slice(stop=-3)

    return combined

In [4]:
# school-level data
# sorted() fixes the order in which the raw files are merged, so the exported
# CSV has the same row order on every run instead of following whatever order
# the directory listing happens to return.
files = sorted(Path('./data/raw/school_level/enrollment/').glob('*.csv'))
id_vars = ['year', 'grade', 'district', 'school']

enrollments = merge_files(files, id_vars)
enrollments.to_csv('./data/clean/school_level/enrollments.csv', index=False)

In [5]:
# district-level data
# sorted() fixes the order in which the raw files are merged, so the exported
# CSV has the same row order on every run instead of following whatever order
# the directory listing happens to return.
files = sorted(Path('./data/raw/district_level/enrollment/').glob('*.csv'))
id_vars = ['year', 'grade', 'district']

enrollments = merge_files(files, id_vars)
enrollments.to_csv('./data/clean/district_level/enrollments.csv', index=False)