In [2]:
import pandas as pd
import numpy as np

# Record the library versions this notebook was run against.
for label, module in (('Pandas version', pd), ('Numpy version', np)):
    print(label, module.__version__)

Pandas version 0.20.3
Numpy version 1.14.0


# Load Data

## Read in 2017-2018 Accountability Report (with 2016-2017 data)

#### Functions to load in data

In [82]:
def drop_first_two_rows(df):
    """
    Return a new dataframe without the first two rows of ``df``, re-indexed from 0.

    The rows are removed by position rather than by index label, so a frame
    whose index contains repeated labels still loses exactly its first two
    rows. The input dataframe is left untouched, which keeps the calling cell
    safe to re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose two leading rows should be discarded.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining rows with a fresh 0..n-1 index.
        If ``df`` has two rows or fewer, the result is empty.
    """
    # Positional slice: never touches rows beyond the first two, even when
    # index labels are duplicated. reset_index returns a new object.
    return df.iloc[2:].reset_index(drop=True)
    

In [83]:
def find_relevant_attributes(file, avg_attend_col, grad_rate_col):
    """
    Build a four-column dataframe of school attributes from a wider dataframe.

    Columns are selected by position: position 0 becomes ``school_id``,
    position 1 becomes ``school_name``, and the two caller-supplied positions
    become ``avg_daily_attend`` and ``grad_rate``.
    """
    # Output column name -> positional column in the incoming frame.
    column_positions = {
        'school_id': 0,
        'school_name': 1,
        'avg_daily_attend': avg_attend_col,
        'grad_rate': grad_rate_col,
    }

    return pd.DataFrame({label: file.iloc[:, position]
                         for label, position in column_positions.items()})
    

In [84]:
def fetch_school_data(filepath, sheetname, avg_attend_col, grad_rate_col):
    """
    Load one sheet of an Excel workbook and reduce it to the school attributes.

    The workbook is read from ``../data/<filepath>``. The sheet is narrowed
    with ``find_relevant_attributes`` and its first two rows are then removed
    with ``drop_first_two_rows``.

    Both column positions must be plain ``int`` values; anything else trips
    an assertion.

    NOTE: ``sheetname=`` is the keyword spelling understood by the pandas
    release this notebook reports (0.20.3); later pandas releases spell it
    ``sheet_name``.
    """
    # Validate in the same order as the arguments appear in the signature.
    for col_position in (avg_attend_col, grad_rate_col):
        assert type(col_position) == int, 'Please enter col position as integer'

    workbook_path = '../data/{}'.format(filepath)
    sheet = pd.read_excel(workbook_path, sheetname=sheetname)

    return drop_first_two_rows(
        find_relevant_attributes(sheet, avg_attend_col, grad_rate_col)
    )

#### Read in the high schools and combo schools, add a type column to each, and concatenate them

In [116]:
# Load the high-school sheet and tag every row with its source group.
# 56 and 41 are the positional columns passed as the attendance and
# graduation-rate columns for this sheet.
high_schools = fetch_school_data(
    filepath='Accountability_SQRPratings_2017-2018_SchoolLevel.xls',
    sheetname='High Schools (grds 9-12 only)',
    avg_attend_col=56,
    grad_rate_col=41,
).assign(type='high_school')

In [118]:
# Load the combo-school sheet and tag every row with its source group.
# 129 and 114 are the positional columns passed as the attendance and
# graduation-rate columns for this sheet.
combo_schools = fetch_school_data(
    filepath='Accountability_SQRPratings_2017-2018_SchoolLevel.xls',
    sheetname='Combo Schools (grds PreK-12)',
    avg_attend_col=129,
    grad_rate_col=114,
).assign(type='combo_school')

In [119]:
# Stack both school groups into one frame with a fresh 0..n-1 index.
schools = pd.concat(
    [high_schools, combo_schools],
    ignore_index=True,
)

In [122]:
# Spot-check a random handful of rows. The sample is seeded so the rows shown
# are the same every time the notebook is re-run.
schools.sample(n=15, random_state=0)

Unnamed: 0,avg_daily_attend,grad_rate,school_id,school_name,type
69,93.6,90.6,609737,VON STEUBEN HS,high_school
44,93.7,95.9,609693,WESTINGHOUSE HS,high_school
66,87.6,73.0,609732,STEINMETZ HS,high_school
126,93.9,77.9,400162,INTRINSIC HS,combo_school
47,89.1,66.9,609698,BOGAN HS,high_school
14,95.0,81.0,400085,ACERO - GARCIA HS,high_school
60,75.5,40.4,609722,MANLEY HS,high_school
27,89.3,64.2,400118,NOBLE - DRW HS,high_school
120,91.5,81.8,400061,PERSPECTIVES - LEADERSHIP HS,combo_school
34,91.7,,400172,ASPIRA - BUSINESS & FINANCE HS,high_school


In [126]:
# Median graduation rate across the combined high and combo schools.
schools['grad_rate'].median()

76.8

## Read in 2016-2017 School Profile Info

In [128]:
# The profile CSV is read as ISO-8859-1 (Latin-1), not UTF-8.
profiles = pd.read_csv(
    '../data/Chicago_Public_Schools_-_School_Profile_Information_SY1617.csv',
    encoding="ISO-8859-1",
)

In [129]:
# Preview the first five profile rows.
profiles.head(n=5)

Unnamed: 0,School_ID,Legacy_Unit_ID,Finance_ID,Short_Name,Long_Name,School_Type,Primary_Category,Is_High_School,Is_Middle_School,Is_Elementary_School,...,Third_Contact_Name,Fourth_Contact_Title,Fourth_Contact_Name,Fifth_Contact_Title,Fifth_Contact_Name,Sixth_Contact_Title,Sixth_Contact_Name,Seventh_Contact_Title,Seventh_Contact_Name,Location
0,610158,5720,29271,SAYRE,Harriet E Sayre Elementary Language Academy,Neighborhood,ES,N,Y,Y,...,,,,,,,,,,"1850 N NEWLAND AVE\rChicago, Illinois 60707\r(..."
1,610282,7040,26301,MCNAIR,Ronald E McNair Elementary School,Neighborhood,ES,N,Y,Y,...,Tasia White,,,,,,,,,"4820 W WALTON ST\rChicago, Illinois 60651\r(41..."
2,609996,4020,23821,HOLDEN,Charles N Holden Elementary School,Neighborhood,ES,N,Y,Y,...,,,,,,,,,,"1104 W 31ST ST\rChicago, Illinois 60608\r(41.8..."
3,400079,3344,66395,ACERO - ZIZUMBO,Acero Charter Schools - SPC Daniel Zizumbo,Charter,ES,N,Y,Y,...,,,,,,,,,,"4248 W 47TH ST\rChicago, Illinois 60632\r(41.8..."
4,610089,5020,24621,MURPHY,John B Murphy Elementary School,Neighborhood,ES,N,Y,Y,...,Laura Aguirre,,,,,,,,,"3539 W GRACE ST\rChicago, Illinois 60618\r(41...."


In [130]:
# Left join: keep every row of `schools` (high and combo schools) and attach
# the matching profile row, pairing `school_id` with the profile `School_ID`.
schools_pr = schools.merge(
    profiles,
    how='left',
    left_on='school_id',
    right_on='School_ID',
)

In [133]:
# Summarise the merged frame: row count, column dtypes and non-null counts.
schools_pr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 0 to 141
Data columns (total 96 columns):
avg_daily_attend                           142 non-null object
grad_rate                                  131 non-null object
school_id                                  142 non-null object
school_name                                142 non-null object
type                                       142 non-null object
School_ID                                  142 non-null int64
Legacy_Unit_ID                             142 non-null int64
Finance_ID                                 142 non-null int64
Short_Name                                 142 non-null object
Long_Name                                  142 non-null object
School_Type                                142 non-null object
Primary_Category                           142 non-null object
Is_High_School                             142 non-null object
Is_Middle_School                           142 non-null object
Is_Elementary_Sc

In [143]:
# Look at the profile URL stored for the row labelled 30.
schools_pr.loc[30, 'CPS_School_Profile']

'http://cps.edu/Schools/Pages/school.aspx?SchoolID=400156'

In [152]:
# Open question: the accountability report's grad_rate may not be the actual
# 2016-2017 figure; it could be a 2018 goal derived from the prior year's
# actuals. Compare its mean with the profile file's graduation-rate columns.
grad_rate_columns = ['grad_rate', 'Graduation_Rate_School', 'Graduation_Rate_Mean']
schools_pr[grad_rate_columns].mean()

grad_rate                 73.249618
Graduation_Rate_School    75.175214
Graduation_Rate_Mean      73.500000
dtype: float64

In [150]:
# Persist the merged schools + profiles frame as a pickle for later notebooks.
pd.to_pickle(schools_pr, '../data/cleaned_data.pkl')

142