In [1]:
import pandas as pd
from pathlib import Path

In [2]:
#load File
school_data_to_load = Path("schools_complete.csv")
student_data_to_load = Path("students_complete.csv")

In [3]:
#Read File
school_data_df = pd.read_csv(school_data_to_load)
student_data_df = pd.read_csv(student_data_to_load)

In [4]:
#Review read school file
school_data_df.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [5]:
#Review student read file
student_data_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [6]:
# Combine the data into a single dataset  
school_data_complete_df = pd.merge(student_data_df, school_data_df, how="left", on=["school_name", "school_name"])

In [7]:
#School and Student data combined
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [8]:
#Unique School name count
unique_school_name_df = school_data_complete_df["school_name"].nunique()
unique_school_name_df

15

In [9]:
#Convert school count to integer
schools = pd.DataFrame(data=['15'],columns=['Total Schools'])
schools['Total Schools'] = schools['Total Schools'].astype(int)
schools

Unnamed: 0,Total Schools
0,15


In [10]:
#Total students
total_students_df = school_data_complete_df["Student ID"].nunique()
total_students_df

39170

In [11]:
#Convert total student count to integer
students = pd.DataFrame(data=['39170'], index=[0],columns=['Total Students'])
students['Total Students'] = students['Total Students'].astype(int)
students

Unnamed: 0,Total Students
0,39170


In [12]:
#Total Budget
total_budget_df = school_data_df["budget"].sum()
total_budget_df

24649428

In [13]:
#Convert total budget to integer
budget = pd.DataFrame(data=['24649428'], index=[0],columns=['Total Budget'])
budget['Total Budget'] = budget['Total Budget'].astype(int)
budget

Unnamed: 0,Total Budget
0,24649428


In [14]:
#Average math score
average_math_score_df = school_data_complete_df["math_score"].mean()
average_math_score_df

78.98537145774827

In [15]:
#Convert average math score to integer
average_math_score = pd.DataFrame(data=['78.985732'], index=[0],columns=['Average Math Score'])
average_math_score

Unnamed: 0,Average Math Score
0,78.985732


In [16]:
#Average reading score
average_reading_score_df = school_data_complete_df["reading_score"].mean()
average_reading_score_df

81.87784018381414

In [17]:
#Convert average reading score to integer
average_reading_score = pd.DataFrame(data=['81.87784'], index=[0],columns=['Average Reading Score'])
average_reading_score                                                       

Unnamed: 0,Average Reading Score
0,81.87784


In [18]:
#Students passing math  
passing_math_count_df = school_data_complete_df[(school_data_complete_df["math_score"] >=70)].count()["student_name"]
passing_math_count_df

29370

In [19]:
#Students passing math %
passing_math_percent_df = passing_math_count_df / (students) * 100
percent_passing_math = pd.DataFrame(data=['74.980853'], index=[0],columns=['% Passing Math'])
percent_passing_math

Unnamed: 0,% Passing Math
0,74.980853


In [20]:
#Students passing reading 
passing_read_count_df = school_data_complete_df[(school_data_complete_df["reading_score"] >=70)].count()["student_name"]
passing_read_count_df

33610

In [21]:
#Students passing reading %
passing_read_percent_df = passing_read_count_df / (students) * 100
percent_passing_reading = pd.DataFrame(data=['85.805463'], index=[0],columns=['% Passing Reading'])
percent_passing_reading

Unnamed: 0,% Passing Reading
0,85.805463


In [22]:
#% of students that passed math and reading and change to integer
passing_math_reading_count = school_data_complete_df[
    (school_data_complete_df["math_score"] >= 70) & (school_data_complete_df["reading_score"] >= 70)
].count()["student_name"]
overall_passing_rate = passing_math_reading_count / (students) * 100
overall_passing_rate = pd.DataFrame(data=['65.172326'], index=[0],columns=['Overall Passing Rate'])
overall_passing_rate

Unnamed: 0,Overall Passing Rate
0,65.172326


In [23]:
#Create district summary
district_summary_info = pd.concat([schools, students, budget, average_math_score, average_reading_score, percent_passing_math, percent_passing_reading,overall_passing_rate], axis=1)
district_summary_info["Total Students"] = district_summary_info["Total Students"].map("{:,}".format)
district_summary_info["Total Budget"] = district_summary_info["Total Budget"].map("${:,.2f}".format)
district_summary_info

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
0,15,39170,"$24,649,428.00",78.985732,81.87784,74.980853,85.805463,65.172326


In [24]:
#find school types
school_types = school_data_complete_df["type"].unique()
school_types

array(['District', 'Charter'], dtype=object)

In [25]:
#Total students per school
school_enrollees = school_data_complete_df["school_name"]
Info_school_enrollees = pd.DataFrame(school_enrollees)
per_school_enrollees = Info_school_enrollees.groupby("school_name").size().reset_index(name="count")
per_school_enrollees

Unnamed: 0,school_name,count
0,Bailey High School,4976
1,Cabrera High School,1858
2,Figueroa High School,2949
3,Ford High School,2739
4,Griffin High School,1468
5,Hernandez High School,4635
6,Holden High School,427
7,Huang High School,2917
8,Johnson High School,4761
9,Pena High School,962


In [26]:
# Calculate the total school budget and per capita spending per school
school_money_df = school_data_complete_df
budget_break_down = pd.DataFrame(school_money_df)
per_school_budget = budget_break_down[["school_name","budget"]].groupby("school_name")["budget"].sum().reset_index(name="School Budget")
per_school_budget

Unnamed: 0,school_name,School Budget
0,Bailey High School,15549641728
1,Cabrera High School,2009159448
2,Figueroa High School,5557128039
3,Ford High School,4831365924
4,Griffin High School,1346890000
5,Hernandez High School,14007062700
6,Holden High School,105933149
7,Huang High School,5573322295
8,Johnson High School,14733628650
9,Pena High School,563595396


In [27]:
#Per school capita
per_capita = pd.DataFrame(per_school_budget)
per_student_info = pd.DataFrame(per_school_enrollees)
per_capita_per_school = pd.merge(per_capita, per_student_info,on="school_name", how="outer")
per_capita_per_school ["Per Student Budget"] = per_capita_per_school["School Budget"] / per_capita_per_school["count"]
per_capita_per_school["School Budget"] = per_capita_per_school["School Budget"].map("${:,.2f}".format)
per_capita_per_school["Per Student Budget"] = per_capita_per_school["Per Student Budget"].map("${:,.2f}".format)
per_capita_per_school

Unnamed: 0,school_name,School Budget,count,Per Student Budget
0,Bailey High School,"$15,549,641,728.00",4976,"$3,124,928.00"
1,Cabrera High School,"$2,009,159,448.00",1858,"$1,081,356.00"
2,Figueroa High School,"$5,557,128,039.00",2949,"$1,884,411.00"
3,Ford High School,"$4,831,365,924.00",2739,"$1,763,916.00"
4,Griffin High School,"$1,346,890,000.00",1468,"$917,500.00"
5,Hernandez High School,"$14,007,062,700.00",4635,"$3,022,020.00"
6,Holden High School,"$105,933,149.00",427,"$248,087.00"
7,Huang High School,"$5,573,322,295.00",2917,"$1,910,635.00"
8,Johnson High School,"$14,733,628,650.00",4761,"$3,094,650.00"
9,Pena High School,"$563,595,396.00",962,"$585,858.00"


In [28]:
# Calculate the average test scores per school
per_school_info_df = school_data_complete_df
math_and_reading_per_school = pd.DataFrame(per_school_info_df)
per_school_scores = math_and_reading_per_school[["school_name","math_score","reading_score"]].groupby("school_name").agg({"math_score": "mean","reading_score": "mean"}).reset_index()
per_school_scores

Unnamed: 0,school_name,math_score,reading_score
0,Bailey High School,77.048432,81.033963
1,Cabrera High School,83.061895,83.97578
2,Figueroa High School,76.711767,81.15802
3,Ford High School,77.102592,80.746258
4,Griffin High School,83.351499,83.816757
5,Hernandez High School,77.289752,80.934412
6,Holden High School,83.803279,83.814988
7,Huang High School,76.629414,81.182722
8,Johnson High School,77.072464,80.966394
9,Pena High School,83.839917,84.044699


In [29]:
# Calculate the number of students with math scores of 70 or higher
math_students_above_70 = school_data_complete_df[(school_data_complete_df["math_score"] >=70)]
math_students_above_70

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [30]:
# Calculate the number of students per school with math scores of 70 or higher
school_math_students_above_70 = school_data_complete_df[school_data_complete_df["math_score"] >=70].groupby("school_name").size().reset_index(name="Total Passing Math")
school_math_students_above_70

Unnamed: 0,school_name,Total Passing Math
0,Bailey High School,3318
1,Cabrera High School,1749
2,Figueroa High School,1946
3,Ford High School,1871
4,Griffin High School,1371
5,Hernandez High School,3094
6,Holden High School,395
7,Huang High School,1916
8,Johnson High School,3145
9,Pena High School,910


In [31]:
# Calculate the number of student with reading scores of 70 or higher
read_students_above_70 = school_data_complete_df[(school_data_complete_df["reading_score"] >=70)]
read_students_above_70

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [32]:
# Calculate the number of students per school with reading scores of 70 or higher
per_school_read_above_70 = school_data_complete_df[school_data_complete_df["reading_score"] >=70].groupby("school_name").size().reset_index(name="Total Passing Reading")
per_school_read_above_70

Unnamed: 0,school_name,Total Passing Reading
0,Bailey High School,4077
1,Cabrera High School,1803
2,Figueroa High School,2381
3,Ford High School,2172
4,Griffin High School,1426
5,Hernandez High School,3748
6,Holden High School,411
7,Huang High School,2372
8,Johnson High School,3867
9,Pena High School,923


In [42]:
school_pass_both_info = pd.merge(per_school_read_above_70, school_math_students_above_70, on='school_name')
school_pass_both_info['total'] = school_pass_both_info['total passing math'] + school_pass_both_info['reading_score']
school_pass_both = school_pass_both_info.groupby('school_name')['total'].sum().reset_index()
school_pass_both

KeyError: 'total passing math'

In [36]:
# Use the provided code to calculate the number of students per school that passed both math and reading with scores of 70 or higher
students_passing_math_and_reading("Passing Math & Reading") = school_data_complete_df["reading_score"] >= 70) + (school_data_complete_df["math_score"] >= 70)]
students_passing_math_and_reading

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [34]:
# Use the provided code to calculate the passing rates
per_school_passing_math = school_students_passing_math / per_school_counts * 100
per_school_passing_reading = school_students_passing_reading / per_school_counts * 100
overall_passing_rate = school_students_passing_math_and_reading / per_school_counts * 100

NameError: name 'school_students_passing_math' is not defined

In [None]:
# Create a DataFrame called `per_school_summary` with columns for the calculations above.
per_school_summary = pd.concat([per_capita_per_school, per_school_scores,], axis=1).loc[:,~pd.concat([per_capita_per_school, per_school_scores], axis=1).columns.duplicated()]

# Display the DataFrame
per_school_summary

In [None]:
# Sort the schools by `% Overall Passing` in descending order and display the top 5 rows.
overall_passing_rate = passing_math_reading_count / (students) * 100
overall_passing_rate = pd.DataFrame(data=['65.172326'], index=[0],columns=['Overall Passing Rate'])
overall_passing_rate

In [None]:
per_school_passing_math = school_students_passing_math / per_school_counts * 100
per_school_passing_reading = school_students_passing_reading / per_school_counts * 100
overall_passing_rate = school_students_passing_math_and_reading / per_school_counts * 100