In [2]:
# Dependencies and Setup
import pandas as pd
import sys


# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas Data Frames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)
# 
school_data_complete = pd.merge (student_data, school_data,how="left",on=["school_name", "school_name"])

# Combine the data into a single datasetdata, how="left", on=["school_name", "school_name"])
school_data_complete.head()


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [3]:
# Calculate the total number of schools
Total_schools = len(school_data_complete["school_name"].unique())

# Calculate the total number of students
Total_students = school_data_complete["Student ID"].count()

# Calculate the total budget
Total_budget =school_data_complete["budget"].sum()

# Calculate Average scores and passing rate

# Calculate the average math score 
Average_math_score = school_data_complete.mean()["math_score"]

# Calculate the average reading score
Average_reading_score = school_data_complete["reading_score"].mean()

# Calculate the overall passing rate (overall average score), i.e. (avg. math score + avg. reading score)/2
Percentage_passing_math = (school_data_complete[school_data_complete["math_score"]>=70]["student_name"].count()/Total_students)*100

#Calculate the percentage of students with a passing math score (70 or greater)
Percentage_passing_reading = (school_data_complete[school_data_complete["reading_score"]>=70]["student_name"].count()/Total_students)*100
Passing_math_reading_count = school_data_complete[(school_data_complete["math_score"]>=70)
                                                &(school_data_complete["reading_score"]>=70)].count()["student_name"]
#Calculate the percentage of students with a passing reading score (70 or greater)
Overall_passing_rate = Passing_math_reading_count / float (Total_students)*100


# Create a dataframe to hold the above results
District_summary = pd.DataFrame({"Total schools": [Total_schools], "Total students": [Total_students], "Total budget": [Total_budget], "Average Math score": [Average_math_score], "Average Reading Score": [Average_reading_score], "% of students Passing Math": [Percentage_passing_math], "% of students Passing Reading": [Percentage_passing_reading], "Overall Passing Score %": [Overall_passing_rate]})

# Optional: give the displayed data cleaner formatting
District_summary

Unnamed: 0,Total schools,Total students,Total budget,Average Math score,Average Reading Score,% of students Passing Math,% of students Passing Reading,Overall Passing Score %
0,15,39170,82932329558,78.985371,81.87784,74.980853,85.805463,65.172326


In [4]:
# Calculate by school
By_school = school_data_complete.set_index('school_name').groupby(['school_name'])

# school types
School_types = school_data.set_index('school_name')['type']

# total students by school
Student_per_school = By_school['Student ID'].count()

# school budget
School_budget = school_data.set_index('school_name')['budget']

#per student budget
Student_budget = school_data.set_index('school_name')['budget']/school_data.set_index('school_name')['size']

#avg scores by school
Average_math_score = By_school['math_score'].mean()
Average_reading_score = By_school['reading_score'].mean()

# % passing scores
Passed_math = school_data_complete[school_data_complete['math_score'] >= 70].groupby('school_name')['Student ID'].count()/Student_per_school 
Passed_reading = school_data_complete[school_data_complete['reading_score'] >= 70].groupby('school_name')['Student ID'].count()/Student_per_school
Overall_passing_rate = school_data_complete[(school_data_complete['reading_score'] >= 70) & (school_data_complete['math_score'] >= 70)].groupby('school_name')['Student ID'].count()/Student_per_school 

Schools_summary = pd.DataFrame({
    "School Type": School_types,
    "Total Students": Student_per_school,
    "Per Student Budget": Student_budget,
    "Total School Budget": School_budget,
    "Average Math Score": Average_math_score,
    "Average Reading Score": Average_reading_score,
    '% Passing Math': Passed_math,
    '% Passing Reading': Passed_reading,
    "Overall Passing Rate": Overall_passing_rate
})


#formatting
Schools_summary_final = Schools_summary.style.format({'Total Students': '{:,}', 
                          "Total School Budget": "${:,}", 
                          "Per Student Budget": "${:.0f}",
                          'Average Math Score': "{:.1f}", 
                          'Average Reading Score': "{:.1f}", 
                          "% Passing Math": "{:.1%}", 
                          "% Passing Reading": "{:.1%}", 
                          "Overall Passing Rate": "{:.1%}"})

Schools_summary_final


Unnamed: 0,School Type,Total Students,Per Student Budget,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Bailey High School,District,4976,$628,"$3,124,928",77.0,81.0,66.7%,81.9%,54.6%
Cabrera High School,Charter,1858,$582,"$1,081,356",83.1,84.0,94.1%,97.0%,91.3%
Figueroa High School,District,2949,$639,"$1,884,411",76.7,81.2,66.0%,80.7%,53.2%
Ford High School,District,2739,$644,"$1,763,916",77.1,80.7,68.3%,79.3%,54.3%
Griffin High School,Charter,1468,$625,"$917,500",83.4,83.8,93.4%,97.1%,90.6%
Hernandez High School,District,4635,$652,"$3,022,020",77.3,80.9,66.8%,80.9%,53.5%
Holden High School,Charter,427,$581,"$248,087",83.8,83.8,92.5%,96.3%,89.2%
Huang High School,District,2917,$655,"$1,910,635",76.6,81.2,65.7%,81.3%,53.5%
Johnson High School,District,4761,$650,"$3,094,650",77.1,81.0,66.1%,81.2%,53.5%
Pena High School,Charter,962,$609,"$585,858",83.8,84.0,94.6%,95.9%,90.5%


In [19]:
# Sort and display the top-five performing schools
Top_Five_Schools = Schools_summary.sort_values(by="Overall Passing Rate", ascending=False)
Top_Five_Schools.round(decimals=2)

Unnamed: 0,School Type,Total Students,Per Student Budget,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate,Spending Ranges (Per Student),School Size
Cabrera High School,Charter,1858,582.0,1081356,83.06,83.98,0.94,0.97,0.91,<$585,Medium (1000-2000)
Thomas High School,Charter,1635,638.0,1043130,83.42,83.85,0.93,0.97,0.91,$615-645,Medium (1000-2000)
Griffin High School,Charter,1468,625.0,917500,83.35,83.82,0.93,0.97,0.91,$615-645,Medium (1000-2000)
Wilson High School,Charter,2283,578.0,1319574,83.27,83.99,0.94,0.97,0.91,<$585,Large (2000-5000)
Pena High School,Charter,962,609.0,585858,83.84,84.04,0.95,0.96,0.91,$585-615,Small (<1000)
Wright High School,Charter,1800,583.0,1049400,83.68,83.96,0.93,0.97,0.9,<$585,Medium (1000-2000)
Shelton High School,Charter,1761,600.0,1056600,83.36,83.73,0.94,0.96,0.9,$585-615,Medium (1000-2000)
Holden High School,Charter,427,581.0,248087,83.8,83.81,0.93,0.96,0.89,<$585,Small (<1000)
Bailey High School,District,4976,628.0,3124928,77.05,81.03,0.67,0.82,0.55,$615-645,Large (2000-5000)
Ford High School,District,2739,644.0,1763916,77.1,80.75,0.68,0.79,0.54,$615-645,Large (2000-5000)


In [16]:
# Sort and display the five worst_performing schools
Worst_Five_Schools = Schools_summary.sort_values(by="Overall Passing Rate", ascending=True)
Worst_Five_Schools.round(decimals=2)

Unnamed: 0,School Type,Total Students,Per Student Budget,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate,Spending Ranges (Per Student),School Size
Rodriguez High School,District,3999,637.0,2547363,76.84,80.74,0.66,0.8,0.53,$615-645,Large (2000-5000)
Figueroa High School,District,2949,639.0,1884411,76.71,81.16,0.66,0.81,0.53,$615-645,Large (2000-5000)
Huang High School,District,2917,655.0,1910635,76.63,81.18,0.66,0.81,0.54,$645-675,Large (2000-5000)
Hernandez High School,District,4635,652.0,3022020,77.29,80.93,0.67,0.81,0.54,$645-675,Large (2000-5000)
Johnson High School,District,4761,650.0,3094650,77.07,80.97,0.66,0.81,0.54,$645-675,Large (2000-5000)
Ford High School,District,2739,644.0,1763916,77.1,80.75,0.68,0.79,0.54,$615-645,Large (2000-5000)
Bailey High School,District,4976,628.0,3124928,77.05,81.03,0.67,0.82,0.55,$615-645,Large (2000-5000)
Holden High School,Charter,427,581.0,248087,83.8,83.81,0.93,0.96,0.89,<$585,Small (<1000)
Shelton High School,Charter,1761,600.0,1056600,83.36,83.73,0.94,0.96,0.9,$585-615,Medium (1000-2000)
Wright High School,Charter,1800,583.0,1049400,83.68,83.96,0.93,0.97,0.9,<$585,Medium (1000-2000)


In [20]:
# Create a series for each grade and group by school


ninth_Grade_math_grouped = school_data_complete[school_data_complete["grade"]=="9th"].groupby("school_name")["math_score"].mean()
tenth_Grade_math_grouped = school_data_complete[school_data_complete["grade"]=="10th"].groupby("school_name")["math_score"].mean()
eleventh_Grade_math_grouped = school_data_complete[school_data_complete["grade"]=="11th"].groupby("school_name")["math_score"].mean()
twelvth_Grade_math_grouped = school_data_complete[school_data_complete["grade"]=="12th"].groupby("school_name")["math_score"].mean()

# Create a Dataframe to show results
Math_Scores_By_Grade = pd.DataFrame({"9th Grade": ninth_Grade_math_grouped, "10th Grade": tenth_Grade_math_grouped, "11th Grade": eleventh_Grade_math_grouped, "12th Grade": twelvth_Grade_math_grouped})
Math_Scores_By_Grade.round(decimals=2)


Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.08,77.0,77.52,76.49
Cabrera High School,83.09,83.15,82.77,83.28
Figueroa High School,76.4,76.54,76.88,77.15
Ford High School,77.36,77.67,76.92,76.18
Griffin High School,82.04,84.23,83.84,83.36
Hernandez High School,77.44,77.34,77.14,77.19
Holden High School,83.79,83.43,85.0,82.86
Huang High School,77.03,75.91,76.45,77.23
Johnson High School,77.19,76.69,77.49,76.86
Pena High School,83.63,83.37,84.33,84.12


In [21]:
# Create a series for each grade and group by school


ninth_Grade_reading_grouped = school_data_complete[school_data_complete["grade"]=="9th"].groupby("school_name")["reading_score"].mean()
tenth_Grade_reading_grouped = school_data_complete[school_data_complete["grade"]=="10th"].groupby("school_name")["reading_score"].mean()
eleventh_Grade_reading_grouped = school_data_complete[school_data_complete["grade"]=="11th"].groupby("school_name")["reading_score"].mean()
twelvth_Grade_reading_grouped = school_data_complete[school_data_complete["grade"]=="12th"].groupby("school_name")["reading_score"].mean()

# Create a Dataframe to show results
Reading_Scores_By_Grade = pd.DataFrame({"9th Grade": ninth_Grade_reading_grouped, "10th Grade": tenth_Grade_reading_grouped, "11th Grade": eleventh_Grade_reading_grouped, "12th Grade": twelvth_Grade_reading_grouped})
Reading_Scores_By_Grade.round(decimals=2)


Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.3,80.91,80.95,80.91
Cabrera High School,83.68,84.25,83.79,84.29
Figueroa High School,81.2,81.41,80.64,81.38
Ford High School,80.63,81.26,80.4,80.66
Griffin High School,83.37,83.71,84.29,84.01
Hernandez High School,80.87,80.66,81.4,80.86
Holden High School,83.68,83.32,83.82,84.7
Huang High School,81.29,81.51,81.42,80.31
Johnson High School,81.26,80.77,80.62,81.23
Pena High School,83.81,83.61,84.34,84.59


In [28]:
# Create a table that breaks down school performances based on average Spending Ranges (Per Student)
# Create bins
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585", "$585-615", "$615-645", "$645-675"]

                                           

In [30]:
Scores_spending = Schools_summary

Scores_spending["Spending Ranges (Per Student)"] = pd.cut(Schools_summary["Per Student Budget"], spending_bins, labels=group_names)

Scores_spending = Scores_spending.groupby("Spending Ranges (Per Student)").mean()

Scores_spending.round(decimals=2)

Unnamed: 0_level_0,Total Students,Per Student Budget,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
<$585,1592.0,581.0,924604.25,83.46,83.93,0.93,0.97,0.9
$585-615,1361.5,604.5,821229.0,83.6,83.89,0.94,0.96,0.9
$615-645,2961.0,635.17,1880208.0,79.08,81.89,0.76,0.86,0.66
$645-675,4104.33,652.33,2675768.33,77.0,81.03,0.66,0.81,0.54


In [32]:
# Scores by schools size
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]


In [33]:

Scores_size = Schools_summary

Scores_size["School Size"] = pd.cut(Schools_summary["Total Students"], size_bins, labels=group_names)

Scores_size = Scores_size.groupby("School Size").mean()

Scores_size.round(decimals=2)

Unnamed: 0_level_0,Total Students,Per Student Budget,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Small (<1000),694.5,595.0,416972.5,83.82,83.93,0.94,0.96,0.9
Medium (1000-2000),1704.4,605.6,1029597.2,83.37,83.86,0.94,0.97,0.91
Large (2000-5000),3657.38,635.38,2333437.12,77.75,81.34,0.7,0.83,0.58


In [34]:
# Create a new dataframe by school type
Scores_school_type = Schools_summary.loc[:, ['School Type','Average Math Score',
                                  'Average Reading Score','% Passing Math',
                                  '% Passing Reading','Overall Passing Rate',]]
# Create a group based on school type
Scores_school_type = Scores_school_type.groupby('School Type').mean()
Scores_school_type.round(decimals=2)

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.47,83.9,0.94,0.97,0.9
District,76.96,80.97,0.67,0.81,0.54
