In [38]:
import pandas as pd
import numpy as np
import glob
import os

In [39]:
# NECAP cleaning
#
# Reads every CSV under ./raw/NECAP, unpivots the non-identifier columns into
# long form (one row per District / School / subject / year), converts the
# percentage strings to fractions, ranks schools within each subject/year and
# writes the combined result to ./clean/necap_clean.csv.

fp = './raw/NECAP'

# Sorted so the row order of the output does not depend on filesystem order.
allFiles = sorted(glob.glob(fp + "/*.csv"))
if not allFiles:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in ' + fp)

data = []
for csv_path in allFiles:
    # The first 3 lines of the file are skipped; the next line is the header.
    df = pd.read_csv(csv_path, skiprows=3)
    df['filename'] = os.path.basename(csv_path)
    df['category'] = 'NECAP'
    # Drop the first row under the header.
    # NOTE(review): presumably a summary or sub-header row -- confirm against the raw files.
    df = df.iloc[1:]

    # Unpivot: every column that is not an identifier becomes a (year, value) row.
    x = ['District', 'School', 'category', 'filename']
    df = pd.melt(df, id_vars=x, var_name='year')

    # The year is cut out of the original column header by position.
    # NOTE(review): assumes a fixed header layout (13 leading / 3 trailing characters) -- confirm.
    df['year'] = df['year'].str[13:-3]

    # The subject is cut out of the file name the same way (7 leading / 12 trailing characters).
    df = df.rename(columns={'filename': 'subject'})
    df['subject'] = df['subject'].str[7:-12]

    # Placeholder strings become NaN; a trailing '%' is stripped and the number divided by 100.
    df = df.replace(['no data', 'too few data'], np.nan)
    df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0

    by_subject_year = df.groupby(['subject', 'year'])['value']
    # Dense rank, 1 = highest value; rows without a value are ranked after all others.
    df['rank'] = by_subject_year.rank(na_option='bottom', method='dense', ascending=False)
    # Ascending percentile rank (higher value -> closer to 1.0).
    # NOTE(review): na_option='top' gives rows without a value a low, non-null percentile -- confirm intended.
    df['percentile'] = by_subject_year.rank(na_option='top', pct=True)

    data.append(df)

# ignore_index avoids repeating each file's 0..n index labels in the combined frame.
frame = pd.concat(data, ignore_index=True)

cols = ['District', 'School', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]

necap = frame
# Make sure the output directory exists before writing.
os.makedirs('./clean', exist_ok=True)
necap.to_csv('./clean/necap_clean.csv', index=False)

In [40]:
# Preview the first five rows of the cleaned NECAP frame.
necap.head(n=5)

Unnamed: 0,District,School,category,subject,year,value,rank,percentile
0,Barrington,Barrington High School,NECAP,11th Grade Math,2008,0.71,1.0,1.0
1,Bristol Warren,Mt. Hope High School,NECAP,11th Grade Math,2008,0.4,9.0,0.827586
2,Burrillville,Burrillville High School,NECAP,11th Grade Math,2008,0.3,16.0,0.672414
3,Central Falls,Central Falls High School,NECAP,11th Grade Math,2008,0.04,32.0,0.206897
4,Chariho,Chariho Regional High School,NECAP,11th Grade Math,2008,0.31,15.0,0.724138


In [41]:
# PRACC cleaning
#
# Reads every CSV under ./raw/PRACC, unpivots the non-identifier columns into
# long form (one row per District / School / subject / year), converts the
# percentage strings to fractions, ranks schools within each subject/year and
# writes the combined result to ./clean/pracc_clean.csv.

fp = './raw/PRACC'

# Sorted so the row order of the output does not depend on filesystem order.
allFiles = sorted(glob.glob(fp + "/*.csv"))
if not allFiles:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in ' + fp)

data = []
for csv_path in allFiles:
    # The first 3 lines of the file are skipped; the next line is the header.
    df = pd.read_csv(csv_path, skiprows=3)
    df['filename'] = os.path.basename(csv_path)
    df['category'] = 'PRACC'
    # Drop the first row under the header.
    # NOTE(review): presumably a summary or sub-header row -- confirm against the raw files.
    df = df.iloc[1:]

    # Unpivot: every column that is not an identifier becomes a (year, value) row.
    x = ['District', 'School', 'category', 'filename']
    df = pd.melt(df, id_vars=x, var_name='year')

    # The year is cut out of the original column header by position.
    # NOTE(review): assumes a fixed header layout (29 leading / 3 trailing characters) -- confirm.
    df['year'] = df['year'].str[29:-3]

    # The subject is cut out of the file name the same way (7 leading / 12 trailing characters).
    df = df.rename(columns={'filename': 'subject'})
    df['subject'] = df['subject'].str[7:-12]

    # Placeholder strings become NaN; a trailing '%' is stripped and the number divided by 100.
    df = df.replace(['no data', 'too few data'], np.nan)
    df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0

    by_subject_year = df.groupby(['subject', 'year'])['value']
    # Dense rank, 1 = highest value; rows without a value are ranked after all others.
    df['rank'] = by_subject_year.rank(na_option='bottom', method='dense', ascending=False)
    # Ascending percentile rank (higher value -> closer to 1.0).
    # NOTE(review): na_option='top' gives rows without a value a low, non-null percentile -- confirm intended.
    df['percentile'] = by_subject_year.rank(na_option='top', pct=True)

    data.append(df)

# ignore_index avoids repeating each file's 0..n index labels in the combined frame.
frame = pd.concat(data, ignore_index=True)

cols = ['District', 'School', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]

pracc = frame
# Make sure the output directory exists before writing.
os.makedirs('./clean', exist_ok=True)
pracc.to_csv('./clean/pracc_clean.csv', index=False)

In [42]:
# Preview the first five rows of the cleaned PRACC frame.
pracc.head(n=5)

Unnamed: 0,District,School,category,subject,year,value,rank,percentile
0,Barrington,Barrington High School,PRACC,10th Grade ELA_Literacy,2014,0.751,2.0,0.983871
1,Bristol Warren,Mt. Hope High School,PRACC,10th Grade ELA_Literacy,2014,0.2418,31.0,0.516129
2,Burrillville,Burrillville High School,PRACC,10th Grade ELA_Literacy,2014,0.2636,29.0,0.548387
3,Central Falls,Central Falls High School,PRACC,10th Grade ELA_Literacy,2014,,52.0,0.096774
4,Chariho,Chariho Regional High School,PRACC,10th Grade ELA_Literacy,2014,0.422,15.0,0.774194


In [43]:
# SAT cleaning
#
# Reads every CSV under ./raw/SAT, adds a 'Total Average' column (sum of the
# three section averages), unpivots the non-identifier columns into long form
# (one row per District / School / subject / year), ranks schools within each
# subject/year and writes the combined result to ./clean/sat_clean.csv.

fp = './raw/SAT'

# Sorted so the row order of the output does not depend on filesystem order.
allFiles = sorted(glob.glob(fp + "/*.csv"))
if not allFiles:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in ' + fp)

data = []
for csv_path in allFiles:
    # The first 3 lines of the file are skipped; the next line is the header.
    df = pd.read_csv(csv_path, skiprows=3)
    df['filename'] = os.path.basename(csv_path)
    df['category'] = 'SAT'
    # Drop the first row under the header.
    # NOTE(review): presumably a summary or sub-header row -- confirm against the raw files.
    df = df.iloc[1:]

    # Placeholder strings become NaN before any arithmetic.
    df = df.replace(['no data', 'too few data'], np.nan)

    col_list = ['Mathematics Average', 'Reading Average', 'Writing Average']
    # min_count requires all three sections to be present. Without it, sum()
    # skips NaN, so a school with no scores would get a total of 0.0 and a
    # school missing one section would get an understated total.
    df['Total Average'] = df[col_list].astype(float).sum(axis=1, min_count=len(col_list))

    # Unpivot: every column that is not an identifier becomes a (subject, value) row.
    x = ['District', 'School', 'category', 'filename']
    df = pd.melt(df, id_vars=x, var_name='subject')

    # Drop the last 7 characters of the column name ("Average" for the columns
    # listed in col_list, leaving a trailing space) and rebuild the label as
    # "SAT <section> Avg".
    df['subject'] = df['subject'].str[:-7]
    df['subject'] = 'SAT ' + df['subject'].astype(str) + 'Avg'

    # The year is the file name minus its last 9 characters.
    # NOTE(review): assumes a fixed file-name layout -- confirm.
    df = df.rename(columns={'filename': 'year'})
    df['year'] = df['year'].str[:-9]

    df['value'] = df['value'].astype('float')

    by_subject_year = df.groupby(['subject', 'year'])['value']
    # Dense rank, 1 = highest value; rows without a value are ranked after all others.
    df['rank'] = by_subject_year.rank(na_option='bottom', method='dense', ascending=False)
    # Ascending percentile rank (higher value -> closer to 1.0).
    # NOTE(review): na_option='top' gives rows without a value a low, non-null percentile -- confirm intended.
    df['percentile'] = by_subject_year.rank(na_option='top', pct=True)

    data.append(df)

# ignore_index avoids repeating each file's 0..n index labels in the combined frame.
frame = pd.concat(data, ignore_index=True)

cols = ['District', 'School', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]

sat = frame
# Make sure the output directory exists before writing.
os.makedirs('./clean', exist_ok=True)
sat.to_csv('./clean/sat_clean.csv', index=False)

In [44]:
# Preview the first five rows of the cleaned SAT frame.
sat.head(n=5)

Unnamed: 0,District,School,category,subject,year,value,rank,percentile
0,Barrington,Barrington High School,SAT,SAT Mathematics Avg,2010,576.0,2.0,0.982456
1,Bristol Warren,Mt. Hope High School,SAT,SAT Mathematics Avg,2010,484.0,19.0,0.622807
2,Burrillville,Burrillville High School,SAT,SAT Mathematics Avg,2010,488.0,18.0,0.649123
3,Central Falls,Central Falls High School,SAT,SAT Mathematics Avg,2010,374.0,39.0,0.263158
4,Chariho,Chariho Regional High School,SAT,SAT Mathematics Avg,2010,520.0,9.0,0.842105


In [45]:
# Graduation rate cleaning
#
# Reads every CSV under ./raw/Graduation Rate, keeps the first three columns
# of each file, reshapes them into long form labelled 'Pct. Graduated 4 Yrs',
# converts the percentage strings to fractions, ranks schools within each
# year and writes the combined result to ./clean/grad_clean.csv.

fp = './raw/Graduation Rate'

# Sorted so the row order of the output does not depend on filesystem order.
allFiles = sorted(glob.glob(fp + "/*.csv"))
if not allFiles:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in ' + fp)

data = []
for csv_path in allFiles:
    # The first 4 lines of the file are skipped; the next line is the header.
    df = pd.read_csv(csv_path, skiprows=4)
    # Drop the first row under the header and keep only the first three source
    # columns. Slicing before the helper columns are appended keeps the
    # selection independent of how many columns the file has.
    # NOTE(review): the dropped row is presumably a summary or sub-header row -- confirm.
    df = df.iloc[1:, [0, 1, 2]].copy()
    df['filename'] = os.path.basename(csv_path)
    df['category'] = 'GRAD'

    # Unpivot: the one remaining non-identifier column becomes the value column.
    x = ['District', 'School', 'category', 'filename']
    df = pd.melt(df, id_vars=x, var_name='subject')

    # The original column header is replaced by a fixed label.
    df['subject'] = 'Pct. Graduated 4 Yrs'

    # The year is the first 4 characters of the file name.
    df = df.rename(columns={'filename': 'year'})
    df['year'] = df['year'].str[0:4]

    # Placeholder strings become NaN; a trailing '%' is stripped and the number divided by 100.
    df = df.replace(['no data', 'too few data'], np.nan)
    df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0

    by_subject_year = df.groupby(['subject', 'year'])['value']
    # Dense rank, 1 = highest value; rows without a value are ranked after all others.
    df['rank'] = by_subject_year.rank(na_option='bottom', method='dense', ascending=False)
    # Ascending percentile rank (higher value -> closer to 1.0).
    # NOTE(review): na_option='top' gives rows without a value a low, non-null percentile -- confirm intended.
    df['percentile'] = by_subject_year.rank(na_option='top', pct=True)

    data.append(df)

# ignore_index avoids repeating each file's 0..n index labels in the combined frame.
frame = pd.concat(data, ignore_index=True)

cols = ['District', 'School', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]

grad = frame
# Make sure the output directory exists before writing.
os.makedirs('./clean', exist_ok=True)
grad.to_csv('./clean/grad_clean.csv', index=False)

In [46]:
# Preview the first five rows of the cleaned graduation-rate frame.
grad.head(n=5)

Unnamed: 0,District,School,category,subject,year,value,rank,percentile
0,Barrington,Barrington High School,GRAD,Pct. Graduated 4 Yrs,2010,0.966,2.0,0.982456
1,Bristol Warren,Mt. Hope High School,GRAD,Pct. Graduated 4 Yrs,2010,0.881,14.0,0.754386
2,Burrillville,Burrillville High School,GRAD,Pct. Graduated 4 Yrs,2010,0.868,16.0,0.719298
3,Central Falls,Central Falls High School,GRAD,Pct. Graduated 4 Yrs,2010,0.708,37.0,0.315789
4,Chariho,Chariho Regional High School,GRAD,Pct. Graduated 4 Yrs,2010,0.893,10.0,0.824561


In [47]:
# Attendance cleaning
#
# Reads every CSV under ./raw/Attendance, reshapes the non-identifier columns
# into long form labelled 'Attendance Rate', converts the percentage strings
# to fractions, ranks schools within each year and writes the combined result
# to ./clean/attend_clean.csv.

fp = './raw/Attendance'

# Sorted so the row order of the output does not depend on filesystem order.
allFiles = sorted(glob.glob(fp + "/*.csv"))
if not allFiles:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in ' + fp)

data = []
for csv_path in allFiles:
    # The first 4 lines of the file are skipped; the next line is the header.
    df = pd.read_csv(csv_path, skiprows=4)
    df['filename'] = os.path.basename(csv_path)
    df['category'] = 'Attend'
    # Drop the first row under the header.
    # NOTE(review): presumably a summary or sub-header row -- confirm against the raw files.
    df = df.iloc[1:]

    # Unpivot: every column that is not an identifier becomes a value row.
    x = ['District', 'School', 'category', 'filename']
    df = pd.melt(df, id_vars=x, var_name='subject')

    # The original column header is replaced by a fixed label.
    df['subject'] = 'Attendance Rate'

    # The year is the first 4 characters of the file name.
    df = df.rename(columns={'filename': 'year'})
    df['year'] = df['year'].str[0:4]

    # Placeholder strings become NaN; a trailing '%' is stripped and the number divided by 100.
    df = df.replace(['no data', 'too few data'], np.nan)
    df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0

    by_subject_year = df.groupby(['subject', 'year'])['value']
    # Dense rank, 1 = highest value; rows without a value are ranked after all others.
    df['rank'] = by_subject_year.rank(na_option='bottom', method='dense', ascending=False)
    # Ascending percentile rank (higher value -> closer to 1.0).
    # NOTE(review): na_option='top' gives rows without a value a low, non-null percentile -- confirm intended.
    df['percentile'] = by_subject_year.rank(na_option='top', pct=True)

    data.append(df)

# ignore_index avoids repeating each file's 0..n index labels in the combined frame.
frame = pd.concat(data, ignore_index=True)

cols = ['District', 'School', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]

attend = frame
# Make sure the output directory exists before writing.
os.makedirs('./clean', exist_ok=True)
attend.to_csv('./clean/attend_clean.csv', index=False)

In [48]:
# Preview the first five rows of the cleaned attendance frame.
attend.head(n=5)

Unnamed: 0,District,School,category,subject,year,value,rank,percentile
0,Barrington,Hampden Meadows School,Attend,Attendance Rate,2010,0.9659,21.0,0.887179
1,Barrington,Nayatt School,Attend,Attendance Rate,2010,0.9585,50.0,0.712821
2,Barrington,Primrose Hill School,Attend,Attendance Rate,2010,0.962,36.0,0.810256
3,Barrington,Sowams Elementary School,Attend,Attendance Rate,2010,0.9638,26.0,0.861538
4,Bristol Warren,Colt Andrews School,Attend,Attendance Rate,2010,0.9494,92.0,0.425641


In [49]:
# Chronic absence cleaning
#
# Reads every CSV under ./raw/Chronic Absenteeism, reshapes the non-identifier
# columns into long form labelled 'Chronic Absence', converts the percentage
# strings to fractions, ranks schools within each year and writes the combined
# result to ./clean/chronic_clean.csv.

fp = './raw/Chronic Absenteeism'

# Sorted so the row order of the output does not depend on filesystem order.
allFiles = sorted(glob.glob(fp + "/*.csv"))
if not allFiles:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in ' + fp)

data = []
for csv_path in allFiles:
    # The first 4 lines of the file are skipped; the next line is the header.
    df = pd.read_csv(csv_path, skiprows=4)
    df['filename'] = os.path.basename(csv_path)
    df['category'] = 'Chronic'
    # Drop the first row under the header.
    # NOTE(review): presumably a summary or sub-header row -- confirm against the raw files.
    df = df.iloc[1:]

    # Unpivot: every column that is not an identifier becomes a value row.
    x = ['District', 'School', 'category', 'filename']
    df = pd.melt(df, id_vars=x, var_name='subject')

    # The original column header is replaced by a fixed label.
    df['subject'] = 'Chronic Absence'

    # The year is the first 4 characters of the file name.
    df = df.rename(columns={'filename': 'year'})
    df['year'] = df['year'].str[0:4]

    # Placeholder strings become NaN; a trailing '%' is stripped and the number divided by 100.
    df = df.replace(['no data', 'too few data'], np.nan)
    df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0

    by_subject_year = df.groupby(['subject', 'year'])['value']
    # Dense rank, 1 = highest value; rows without a value are ranked after all others.
    # NOTE(review): for this metric rank 1 is therefore the HIGHEST absence rate -- confirm intended.
    df['rank'] = by_subject_year.rank(na_option='bottom', method='dense', ascending=False)
    # Ascending percentile rank (higher value -> closer to 1.0).
    # NOTE(review): na_option='top' gives rows without a value a low, non-null percentile -- confirm intended.
    df['percentile'] = by_subject_year.rank(na_option='top', pct=True)

    data.append(df)

# ignore_index avoids repeating each file's 0..n index labels in the combined frame.
frame = pd.concat(data, ignore_index=True)

cols = ['District', 'School', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]

chronic = frame
# Make sure the output directory exists before writing.
os.makedirs('./clean', exist_ok=True)
chronic.to_csv('./clean/chronic_clean.csv', index=False)

In [50]:
# Preview the first five rows of the cleaned chronic-absence frame.
chronic.head(n=5)

Unnamed: 0,District,School,category,subject,year,value,rank,percentile
0,Barrington,Barrington High School,Chronic,Chronic Absence,2010,0.0553,227.0,0.209302
1,Barrington,Barrington Middle School,Chronic,Chronic Absence,2010,0.0474,238.0,0.166113
2,Barrington,Hampden Meadows School,Chronic,Chronic Absence,2010,0.0358,256.0,0.10299
3,Barrington,Nayatt School,Chronic,Chronic Absence,2010,0.0541,228.0,0.20598
4,Barrington,Primrose Hill School,Chronic,Chronic Absence,2010,0.0433,245.0,0.142857


In [51]:
# District name -> region type, used to label each row by its District value.
# The variable name (including its spelling) is kept as-is because the
# combine cell looks it up under this name.
regoion_map = dict([
    ('Barrington', 'Suburban'),
    ('Bristol Warren', 'Regional'),
    ('Burrillville', 'Suburban'),
    ('Central Falls', 'Urban'),
    ('Chariho', 'Regional'),
    ('Coventry', 'Suburban'),
    ('Cranston', 'Urban Ring'),
    ('Cumberland', 'Suburban'),
    ('East Greenwich', 'Suburban'),
    ('East Providence', 'Urban Ring'),
    ('Exeter-West Greenwich', 'Regional'),
    ('Foster', 'Suburban'),
    ('Foster-Glocester', 'Regional'),
    ('Glocester', 'Suburban'),
    ('Jamestown', 'Suburban'),
    ('Johnston', 'Urban Ring'),
    ('Lincoln', 'Suburban'),
    ('Little Compton', 'Suburban'),
    ('Middletown', 'Suburban'),
    ('Narragansett', 'Suburban'),
    ('New Shoreham', 'Suburban'),
    ('Newport', 'Urban Ring'),
    ('North Kingstown', 'Suburban'),
    ('North Providence', 'Urban Ring'),
    ('North Smithfield', 'Suburban'),
    ('Pawtucket', 'Urban'),
    ('Portsmouth', 'Suburban'),
    ('Providence', 'Urban'),
    ('Scituate', 'Suburban'),
    ('Smithfield', 'Suburban'),
    ('South Kingstown', 'Suburban'),
    ('Tiverton', 'Suburban'),
    ('Warwick', 'Urban Ring'),
    ('West Warwick', 'Urban Ring'),
    ('Westerly', 'Suburban'),
    ('Woonsocket', 'Urban'),
    ('Independent Charter School', 'Charter'),
    ('State Operated School', 'State'),
])

In [52]:
# School name -> region for individual schools; every school listed here maps to 'Charter'.
school_region_map = dict.fromkeys(
    [
        'Academy for Career Exploration',
        "NE Laborers'/Cranston Public Schools Construction Career Academy",
        'Times2 Academy',
    ],
    'Charter',
)

In [72]:
# Combine all
#
# Stacks every cleaned per-category CSV in ./clean/ into one frame, labels
# each row with a region (a school-level entry takes precedence over the
# district-level one), drops the 'Regional Collaborative' district and writes
# ./clean/final_clean.csv.

fp = './clean/'

# The output of this cell lives in the same directory as its inputs, so it is
# excluded here; otherwise re-running the cell would read final_clean.csv back
# in and duplicate every row. Sorted for a deterministic row order.
allFiles = sorted(
    path for path in glob.glob(fp + "/*.csv")
    if os.path.basename(path) != 'final_clean.csv'
)
if not allFiles:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError('No cleaned CSV files found in ' + fp)

data = [pd.read_csv(path) for path in allFiles]

# ignore_index gives the combined frame a unique index, so the index-aligned
# fillna below cannot be confused by labels repeated across files.
frame = pd.concat(data, ignore_index=True)

cols = ['District', 'School', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]

# School-level region where one is defined, otherwise the district-level one.
# Built as a single expression rather than an in-place fillna on a column
# accessor, which does not update the parent frame under pandas Copy-on-Write.
final = frame.assign(
    region=frame['School'].map(school_region_map).fillna(frame['District'].map(regoion_map))
)

final = final[final.District != 'Regional Collaborative']

cols = ['District', 'region', 'School', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
final = final[cols]

final.to_csv('./clean/final_clean.csv', index=False)

In [73]:
# Preview the first five rows of the combined frame.
final.head(n=5)

Unnamed: 0,District,School,category,subject,year,value,rank,percentile,region
0,Barrington,Hampden Meadows School,Attend,Attendance Rate,2010,0.9659,21.0,0.887179,Suburban
1,Barrington,Nayatt School,Attend,Attendance Rate,2010,0.9585,50.0,0.712821,Suburban
2,Barrington,Primrose Hill School,Attend,Attendance Rate,2010,0.962,36.0,0.810256,Suburban
3,Barrington,Sowams Elementary School,Attend,Attendance Rate,2010,0.9638,26.0,0.861538,Suburban
4,Bristol Warren,Colt Andrews School,Attend,Attendance Rate,2010,0.9494,92.0,0.425641,Regional
