In [1]:
#Module 4 Challenge - Pandas 

#imports
import pandas as pd
from pathlib import Path
import warnings

#Hide warnings after first review
warnings.filterwarnings(action='once')

In [2]:
#set file paths for data sources
schools = Path("Resources/schools_complete.csv")
students = Path("Resources/students_complete.csv")

In [3]:
#open data files into dataframes
schools_df = pd.read_csv(schools)
schools_df.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [4]:
#open data files into dataframes
students_df = pd.read_csv(students)
students_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [79]:
#merge data into one dataframe
district_df = students_df.merge(schools_df, on="school_name", how='left')
district_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [86]:
#District Summary - Determine data and add to dataframe
ds_series = []
ds_columns = []
#Number of unique schools
district_counts = district_df.groupby('type')['school_name'].nunique()
ds_series.append(district_counts)
ds_columns.append("School Count")
#Number of students
total_students = district_df.drop_duplicates(subset="school_name", keep="first").groupby('type')['size'].sum()
ds_series.append(total_students)
ds_columns.append("Students")
#Total budget
total_budget = district_df.drop_duplicates(subset="school_name", keep="first").groupby('type')['budget'].sum()
ds_series.append(total_budget.map('${:,.2f}'.format))
ds_columns.append("Total Budget")
#Per student budget
student_budget = total_budget.divide(total_students)
ds_series.append(student_budget.map('${:,.2f}'.format))
ds_columns.append("Per Student $")
#Average math score
math_avg = district_df.groupby('type')['math_score'].mean()
ds_series.append(math_avg.map('{:,.2f}'.format))
ds_columns.append("Math Avg")
#Average reading score
read_avg = district_df.groupby('type')['reading_score'].mean()
ds_series.append(read_avg.map('{:,.2f}'.format))
ds_columns.append("Reading Avg")
#Passing math %
math_pass = district_df[district_df['math_score'] >= 70].groupby('type')['math_score'].count()
ds_series.append(math_pass.divide(total_students).astype(float).map("{:.2%}".format))
ds_columns.append("Math Pass %")
#Passing reading %
read_pass = district_df[district_df['reading_score'] >= 70].groupby('type')['reading_score'].count()
ds_series.append(read_pass.divide(total_students).astype(float).map("{:.2%}".format))
ds_columns.append("Reading Pass %")
#total passing 
total_pass = district_df[(district_df['reading_score'] >= 70) & (district_df['math_score'] >= 70)].groupby('type')['student_name'].count()
ds_series.append(total_pass.divide(total_students).astype(float).map("{:.2%}".format))
ds_columns.append("Pass All %")

district_summary = pd.concat(ds_series, keys=ds_columns, axis=1)
district_summary.head()

Unnamed: 0_level_0,School Count,Students,Total Budget,Per Student $,Math Avg,Reading Avg,Math Pass %,Reading Pass %,Pass All %
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Charter,8,12194,"$7,301,505.00",$598.78,83.41,83.9,93.70%,96.65%,90.56%
District,7,26976,"$17,347,923.00",$643.09,76.99,80.96,66.52%,80.91%,53.70%


In [91]:
#School summary - Determine data and add to dateframe
ss_series = []
ss_columns = []

#Set base dataframe:
school_names = district_df.groupby('school_name')['type'].unique()
ss_series.append(school_names)
ss_columns.append("School")
#Number of students
total_students = district_df.groupby('school_name')['student_name'].count()
ss_series.append(total_students)
ss_columns.append("Students")
#Total budget
total_budget = district_df.drop_duplicates(subset="school_name", keep="first").groupby('school_name')['budget'].sum()
ss_series.append(total_budget.map('${:,.2f}'.format))
ss_columns.append("Total Budget")
#Per student budget
student_budget = total_budget.divide(total_students)
ss_series.append(student_budget.map('${:,.2f}'.format))
ss_columns.append("Per Student $")
#Average math score
math_avg = district_df.groupby('school_name')['math_score'].mean()
ss_series.append(math_avg.map('{:,.2f}'.format))
ss_columns.append("Math Avg")
#Average reading score
read_avg = district_df.groupby('school_name')['reading_score'].mean()
ss_series.append(read_avg.map('{:,.2f}'.format))
ss_columns.append("Reading Avg")
#Passing math %
math_pass = district_df[district_df['math_score'] >= 70].groupby('school_name')['math_score'].count()
ss_series.append(math_pass.divide(total_students).astype(float).map("{:.2%}".format))
ss_columns.append("Math Pass %")
#Passing reading %
read_pass = district_df[district_df['reading_score'] >= 70].groupby('school_name')['reading_score'].count()
ss_series.append(read_pass.divide(total_students).astype(float).map("{:.2%}".format))
ss_columns.append("Reading Pass %")
#total passing 
total_pass = district_df[(district_df['reading_score'] >= 70) & (district_df['math_score'] >= 70)].groupby('school_name')['student_name'].count()
ss_series.append(total_pass.divide(total_students).astype(float).map("{:.2%}".format))
ss_columns.append("Pass All %")

school_summary = pd.concat(ss_series, keys=ss_columns, axis=1)
school_summary.head()

Unnamed: 0_level_0,School,Students,Total Budget,Per Student $,Math Avg,Reading Avg,Math Pass %,Reading Pass %,Pass All %
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,[District],4976,"$3,124,928.00",$628.00,77.05,81.03,66.68%,81.93%,54.64%
Cabrera High School,[Charter],1858,"$1,081,356.00",$582.00,83.06,83.98,94.13%,97.04%,91.33%
Figueroa High School,[District],2949,"$1,884,411.00",$639.00,76.71,81.16,65.99%,80.74%,53.20%
Ford High School,[District],2739,"$1,763,916.00",$644.00,77.1,80.75,68.31%,79.30%,54.29%
Griffin High School,[Charter],1468,"$917,500.00",$625.00,83.35,83.82,93.39%,97.14%,90.60%


In [92]:
#Top Schools - sort schools by overall passing % descending and show top 5
school_summary = school_summary.sort_values(by=['Pass All %'], ascending=False)
school_summary.head()

Unnamed: 0_level_0,School,Students,Total Budget,Per Student $,Math Avg,Reading Avg,Math Pass %,Reading Pass %,Pass All %
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,[Charter],1858,"$1,081,356.00",$582.00,83.06,83.98,94.13%,97.04%,91.33%
Thomas High School,[Charter],1635,"$1,043,130.00",$638.00,83.42,83.85,93.27%,97.31%,90.95%
Griffin High School,[Charter],1468,"$917,500.00",$625.00,83.35,83.82,93.39%,97.14%,90.60%
Wilson High School,[Charter],2283,"$1,319,574.00",$578.00,83.27,83.99,93.87%,96.54%,90.58%
Pena High School,[Charter],962,"$585,858.00",$609.00,83.84,84.04,94.59%,95.95%,90.54%


In [93]:
#Lowest Schools - sort schools ascending and show top 5
school_summary = school_summary.sort_values(by=['Pass All %'], ascending=True)
school_summary.head()

Unnamed: 0_level_0,School,Students,Total Budget,Per Student $,Math Avg,Reading Avg,Math Pass %,Reading Pass %,Pass All %
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,[District],3999,"$2,547,363.00",$637.00,76.84,80.74,66.37%,80.22%,52.99%
Figueroa High School,[District],2949,"$1,884,411.00",$639.00,76.71,81.16,65.99%,80.74%,53.20%
Huang High School,[District],2917,"$1,910,635.00",$655.00,76.63,81.18,65.68%,81.32%,53.51%
Hernandez High School,[District],4635,"$3,022,020.00",$652.00,77.29,80.93,66.75%,80.86%,53.53%
Johnson High School,[District],4761,"$3,094,650.00",$650.00,77.07,80.97,66.06%,81.22%,53.54%


In [94]:
#Math scores by grade level


In [None]:
#Reading scores by grade level


In [None]:
#Scores by school spending


In [None]:
#Scores by school size


In [None]:
#Scores by school type
