# Load in Libraries 

In [5]:
import pandas as pd
import numpy as np

# Log the library versions this run used, one line per library.
for version_label, library in (('Pandas version', pd), ('Numpy version', np)):
    print(version_label, library.__version__)

Pandas version 0.20.3
Numpy version 1.14.0


# Read in Data

In [6]:
# School profile export for school year 2016-17 (per the file name), relative to this notebook.
profiles_csv = '../data/Chicago_Public_Schools_-_School_Profile_Information_SY1617.csv'

# Decode as ISO-8859-1 (presumably the export is not valid UTF-8 -- TODO confirm).
profiles = pd.read_csv(profiles_csv, encoding="ISO-8859-1")

In [7]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
profiles.head()

Unnamed: 0,School_ID,Legacy_Unit_ID,Finance_ID,Short_Name,Long_Name,School_Type,Primary_Category,Is_High_School,Is_Middle_School,Is_Elementary_School,...,Third_Contact_Name,Fourth_Contact_Title,Fourth_Contact_Name,Fifth_Contact_Title,Fifth_Contact_Name,Sixth_Contact_Title,Sixth_Contact_Name,Seventh_Contact_Title,Seventh_Contact_Name,Location
0,610158,5720,29271,SAYRE,Harriet E Sayre Elementary Language Academy,Neighborhood,ES,N,Y,Y,...,,,,,,,,,,"1850 N NEWLAND AVE\rChicago, Illinois 60707\r(..."
1,610282,7040,26301,MCNAIR,Ronald E McNair Elementary School,Neighborhood,ES,N,Y,Y,...,Tasia White,,,,,,,,,"4820 W WALTON ST\rChicago, Illinois 60651\r(41..."
2,609996,4020,23821,HOLDEN,Charles N Holden Elementary School,Neighborhood,ES,N,Y,Y,...,,,,,,,,,,"1104 W 31ST ST\rChicago, Illinois 60608\r(41.8..."
3,400079,3344,66395,ACERO - ZIZUMBO,Acero Charter Schools - SPC Daniel Zizumbo,Charter,ES,N,Y,Y,...,,,,,,,,,,"4248 W 47TH ST\rChicago, Illinois 60632\r(41.8..."
4,610089,5020,24621,MURPHY,John B Murphy Elementary School,Neighborhood,ES,N,Y,Y,...,Laura Aguirre,,,,,,,,,"3539 W GRACE ST\rChicago, Illinois 60618\r(41...."


In [11]:
# List every column name so we can pick the fields needed below.
profiles.columns

Index(['School_ID', 'Legacy_Unit_ID', 'Finance_ID', 'Short_Name', 'Long_Name',
       'School_Type', 'Primary_Category', 'Is_High_School', 'Is_Middle_School',
       'Is_Elementary_School', 'Is_Pre_School', 'Summary',
       'Administrator_Title', 'Administrator', 'Secondary_Contact_Title',
       'Secondary_Contact', 'Address', 'City', 'State', 'Zip', 'Phone', 'Fax',
       'CPS_School_Profile', 'Website', 'Facebook', 'Twitter', 'Youtube',
       'Pinterest', 'Attendance_Boundaries', 'Grades_Offered_All',
       'Grades_Offered', 'Student_Count_Total', 'Student_Count_Low_Income',
       'Student_Count_Special_Ed', 'Student_Count_English_Learners',
       'Student_Count_Black', 'Student_Count_Hispanic', 'Student_Count_White',
       'Student_Count_Asian', 'Student_Count_Native_American',
       'Student_Count_Other_Ethnicity', 'Student_Count_Asian_Pacific_Islander',
       'Student_Count_Multi', 'Student_Count_Hawaiian_Pacific_Islander',
       'Student_Count_Ethnicity_Not_Available', 

# Clean data

## Remove unnecessary rows

The dataset also contains rows for elementary and middle schools, which we don't need, so we keep only the schools flagged as high schools. Additionally, some rows have no graduation rate data, and we **aren't interested in observations that have no associated target**.

In [14]:
# Row masks: flagged as a high school, and has a graduation rate (the target).
is_high_school = profiles['Is_High_School'] == 'Y'
has_graduation_rate = profiles['Graduation_Rate_School'].notnull()

# Keep only rows that satisfy both conditions; the original index labels are preserved.
high_schools = profiles.loc[is_high_school & has_graduation_rate]

In [24]:
# Report how many schools survived the filter, then preview them.
print('Rows:', high_schools.shape[0])
high_schools.head()

Rows: 121


Unnamed: 0,School_ID,Legacy_Unit_ID,Finance_ID,Short_Name,Long_Name,School_Type,Primary_Category,Is_High_School,Is_Middle_School,Is_Elementary_School,...,Third_Contact_Name,Fourth_Contact_Title,Fourth_Contact_Name,Fifth_Contact_Title,Fifth_Contact_Name,Sixth_Contact_Title,Sixth_Contact_Name,Seventh_Contact_Title,Seventh_Contact_Name,Location
10,609764,1890,46421,JUAREZ HS,Benito Juarez Community Academy High School,Neighborhood,HS,Y,N,N,...,Nicholas Aquino,IB Coordinator,Santiago Marquez,,,,,,,"1450 W CERMAK RD\rChicago, Illinois 60608\r(41..."
11,400054,1931,66142,NOBLE - PRITZKER HS,Noble - Pritzker College Prep,Charter,HS,Y,N,N,...,,,,,,,,,,"4131 W CORTLAND ST\rChicago, Illinois 60639\r(..."
12,609726,1500,47051,BROOKS HS,Gwendolyn Brooks College Preparatory Academy HS,Selective enrollment,HS,Y,Y,N,...,Daniel Bauer,,,,,,,,,"250 E 111TH ST\rChicago, Illinois 60628\r(41.6..."
14,400094,8058,63081,EPIC HS,EPIC Academy Charter High School,Charter,HS,Y,N,N,...,Tawanna Patton,Assistant Principal,Josh Miller,Director of Specialized Services,Dan Grady,Director of External Relations and Recruitment,Demetrius Amparan,Alumni Counselor,Bejeray Morrison,"8255 S HOUSTON AVE\rChicago, Illinois 60617\r(..."
21,609755,1810,47101,YOUNG HS,Whitney M Young Magnet High School,Selective enrollment,HS,Y,Y,N,...,Matthew Swanson,Assistant Principal,Melvin Soto,Assistant Principal,Lynn Zalon,,,,,"211 S LAFLIN ST\rChicago, Illinois 60607\r(41...."


## Remove unnecessary columns

We don't need all of these columns, so let's reduce our dataset to the fields we need to engineer our features.

In [25]:
# Fields kept for feature engineering: identifiers, school type and zip,
# social-media links, enrollment counts, schedule and El transit info,
# coordinates, average ACT, and the graduation rate.
feature_source_columns = [
    'School_ID',
    'Short_Name',
    'Long_Name',
    'School_Type',
    'Zip',
    'Facebook',
    'Twitter',
    'Student_Count_Total',
    'Student_Count_Low_Income',
    'School_Hours',
    'Transportation_El',
    'School_Latitude',
    'School_Longitude',
    'Average_ACT_School',
    'Graduation_Rate_School',
]

# Select with [] instead of .loc[:, list]: on the pandas 0.20.x this notebook
# ran with, .loc silently reindexes a missing label into an all-NaN column,
# while [] raises KeyError, so a renamed or misspelled column fails loudly here.
# .copy() gives `schools` its own data, independent of `high_schools`.
schools = high_schools[feature_source_columns].copy()

schools.head()

Unnamed: 0,School_ID,Short_Name,Long_Name,School_Type,Zip,Facebook,Twitter,Student_Count_Total,Student_Count_Low_Income,School_Hours,Transportation_El,School_Latitude,School_Longitude,Average_ACT_School,Graduation_Rate_School
10,609764,JUAREZ HS,Benito Juarez Community Academy High School,Neighborhood,60608,,,1739,1659,7:45 AM - 2:35 PM,"Blue, Pink, Red",41.852667,-87.663732,16.5,80.9
11,400054,NOBLE - PRITZKER HS,Noble - Pritzker College Prep,Charter,60639,https://www.facebook.com/benoblepritzker,http://twitter.com/benoblepritzker,992,936,M - R 8:05AM. - 3:50PM F 8:05- 1:30PM,"Blue, Brown",41.915214,-87.730127,21.9,90.3
12,609726,BROOKS HS,Gwendolyn Brooks College Preparatory Academy HS,Selective enrollment,60628,,https://twitter.com/BrooksPrepCSL,959,692,8:00 AM - 3:30 PM,Orange,41.692763,-87.616353,22.5,93.4
14,400094,EPIC HS,EPIC Academy Charter High School,Charter,60617,https://www.facebook.com/EPICAcademyHighSchool,https://twitter.com/EPIC_Academy,523,491,8:10 AM-3:50 PM,,41.744849,-87.550222,17.5,69.9
21,609755,YOUNG HS,Whitney M Young Magnet High School,Selective enrollment,60607,http://www.facebook.com/wmymhs,http://www.twitter.com/wyhs,2156,891,8:00 AM - 3:15 PM,Blue,41.878603,-87.664233,27.9,96.2


In [27]:
# Show dtypes and non-null counts per column to see which fields have missing values.
schools.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 121 entries, 10 to 655
Data columns (total 15 columns):
School_ID                   121 non-null int64
Short_Name                  121 non-null object
Long_Name                   121 non-null object
School_Type                 121 non-null object
Zip                         121 non-null int64
Facebook                    65 non-null object
Twitter                     70 non-null object
Student_Count_Total         121 non-null int64
Student_Count_Low_Income    121 non-null int64
School_Hours                118 non-null object
Transportation_El           91 non-null object
School_Latitude             121 non-null float64
School_Longitude            121 non-null float64
Average_ACT_School          117 non-null float64
Graduation_Rate_School      121 non-null float64
dtypes: float64(4), int64(4), object(7)
memory usage: 15.1+ KB


In [35]:
# Number of schools with a non-null value in each optional field.
fb_count = schools['Facebook'].count()
tw_count = schools['Twitter'].count()
el_count = schools['Transportation_El'].count()

# Share of schools that provide each field, rounded to two decimals.
total_schools = len(schools)
for rate_label, filled in (('Rate w/ Facebook Link:', fb_count),
                           ('Rate w/ Twitter Link:', tw_count),
                           ('Rate w/ El Info:', el_count)):
    print(rate_label, round(filled / total_schools, 2))

Rate w/ Facebook Link: 0.54
Rate w/ Twitter Link: 0.58
Rate w/ El Info: 0.75


### Pickle Raw School Data

In [36]:
# Persist the reduced high-school frame so it can be reloaded without repeating the steps above.
schools_pickle_path = '../data/school_data.pkl'
schools.to_pickle(schools_pickle_path)