In [1]:
# Add the Pandas dependency.
import pandas as pd
import os

# Files to load
school_data_to_load = os.path.join("Resources","schools_complete.csv")
student_data_to_load = os.path.join("Resources","students_complete.csv")

school_data_df=pd.read_csv(school_data_to_load)

student_data_df=pd.read_csv(student_data_to_load)


In [2]:
# Add each prefix and suffix to remove to a list.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

In [3]:
# Iterate through the words in the "prefixes_suffixes" list and replace them with an empty space, "".
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word,"")

In [4]:
# Combine the data into a single dataset.
school_data_complete_df = pd.merge(student_data_df,school_data_df,on = ["school_name","school_name"])

In [5]:
# Calculate the total number of schools
school_uniques = school_data_complete_df["school_name"].unique()
school_count = len(school_uniques)

In [6]:
# Calculate the total budget.
total_budget = school_data_df["budget"].sum()

In [7]:
# Calculate the average math and reading scores.
average_math_score = school_data_complete_df["math_score"].mean()
average_reading_score = school_data_complete_df["reading_score"].mean()

In [8]:
# Determining the passing grade
passing_math = school_data_complete_df["math_score"] >= 70
passing_reading = school_data_complete_df["reading_score"] >= 70

In [9]:
# Get all the students who are passing math in a new DataFrame
passing_math = school_data_complete_df[school_data_complete_df["math_score"]>= 70]
passing_reading = school_data_complete_df[school_data_complete_df["reading_score"]>= 70]
passing_math_count = len(passing_math)
passing_reading_count = len(passing_reading)

In [10]:
# Get the total number of students
student_count = school_data_complete_df["student_name"].count()

In [11]:
# Calculate the percent that passed math and reading
passing_math_percentage = passing_math_count / float(student_count) * 100
passing_reading_percentage = passing_reading_count / float(student_count) * 100

In [12]:
# Calculate the students who passed both math and reading.
passed_math_reading = school_data_complete_df[(school_data_complete_df["math_score"]>= 70) & (school_data_complete_df["reading_score"]>= 70)]

In [13]:
# Calculate the number of students who passed both math and reading.
overall_passing_math_reading_count = passed_math_reading["student_name"].count()

In [14]:
# Calculate the overall passing percentage.
overall_passing_percentage =  (overall_passing_math_reading_count / student_count) * 100

In [15]:
# Adding a list of values with keys to create a new DataFrame.
district_summary_df = pd.DataFrame([
    {"Total Schools": school_count,
     "Total Students": student_count,
     "Total Budget": total_budget,
     "Average Reading Score": average_reading_score,
     "Average Math Score": average_math_score,
     "% Passing Reading": passing_reading_percentage,
     "% Passing Math": passing_math_percentage,
     "% Overall Passing": overall_passing_percentage
    }])

In [16]:
# Define a function that calculates the percentage of students that passed both 
# math and reading and prints the passing percentage to the output when the
# function is called.
def passing_math_percent(pass_math_count, student_count):
    return pass_math_count / float(student_count) * 100

In [17]:
# Format "Total Budget" to have the comma for a thousands separator, a decimal separator, and a "$".
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,}".format)

In [18]:
# Format the "Total Students" to have the comma for a thousands separator.
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)

In [19]:
# Format the columns.
district_summary_df["Average Reading Score"] = district_summary_df["Average Reading Score"].map("{:.1f}".format)

In [20]:
# Format the columns.
district_summary_df["Average Math Score"] = district_summary_df["Average Math Score"].map("{:.1f}".format)

In [21]:
# Format the columns.
district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map("{:.0f}".format)
district_summary_df["% Passing Math"] = district_summary_df["% Passing Math"].map("{:.0f}".format)
district_summary_df["% Overall Passing"] = district_summary_df["% Overall Passing"].map("{:.0f}".format)

In [22]:
# Reorder the columns in the order you want them to appear.
new_column_order = ["Total Schools", "Total Students", "Total Budget","Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading", "% Overall Passing"]

In [23]:
# Assign district summary df the new column order.
district_summary_df = district_summary_df[new_column_order]

In [24]:
# Determine the school type.
per_school_types = school_data_df.set_index(["school_name"])["type"]

In [25]:
# Calculate the total student count.
per_school_counts = school_data_df.set_index(["school_name"])["size"]

In [26]:
# Calculate the total school budget.
per_school_budget = school_data_df.set_index(["school_name"])["budget"]

In [27]:
# Calculate the per capita spending.
per_school_capita = per_school_budget / per_school_counts

In [28]:
# Calculate the average test scores.
per_school_math = school_data_complete_df.groupby(["school_name"]).mean()["math_score"]
per_school_reading = school_data_complete_df.groupby(["school_name"]).mean()["reading_score"]

In [29]:
# Calculate the passing scores by creating a filtered DataFrame.
per_school_passing_math = school_data_complete_df[school_data_complete_df["math_score"]>=70]
per_school_passing_reading = school_data_complete_df[school_data_complete_df["reading_score"]>=70]

In [30]:
# Calculate the number of students passing math and passing reading by school.
per_school_passing_math = per_school_passing_math.groupby(["school_name"]).count()["student_name"]
per_school_passing_reading = per_school_passing_reading.groupby(["school_name"]).count()["student_name"]

In [31]:
# Calculate the percentage of passing math and reading scores per school.
per_school_passing_math = per_school_passing_math / per_school_counts *100
per_school_passing_reading = per_school_passing_reading / per_school_counts *100

In [32]:
# Calculate the students who passed both math and reading.
per_passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"]>=70) & (school_data_complete_df["reading_score"]>=70)]

In [33]:
# Calculate the number of students who passed both math and reading.
per_passing_math_reading = per_passing_math_reading.groupby(["school_name"]).count()["student_name"]

In [34]:
# Calculate the overall passing percentage.
per_overall_passing_percentage = per_passing_math_reading / per_school_counts * 100

In [35]:
# Add all the values to a new DataFrame
per_school_summary_df = pd.DataFrame(
    {"School Type":per_school_types,
    "Total Students":per_school_counts,
    "Total School Budget":per_school_budget,
    "Per Student Budget":per_school_capita,
    "Average Math Score":per_school_passing_math,
    "Average Reading Score":per_school_passing_reading,
    "% Passing Math":per_school_passing_math,
    "% Passing Reading":per_school_passing_reading,
    "% Overall Passing":per_overall_passing_percentage})

In [36]:
# Format the Total School Budget and the Per Student Budget columns.
per_school_summary_df["Total School Budget"] = per_school_summary_df["Total School Budget"].map("${:,.2f}".format)
per_school_summary_df["Per Student Budget"] = per_school_summary_df["Per Student Budget"].map("${:,.2f}".format)

In [37]:
# Sort and show top five schools.
top_schools = per_school_summary_df.sort_values(["% Overall Passing"],ascending = False)

In [38]:
# Sort and show bottom five schools
bottom_schools = per_school_summary_df.sort_values(["% Overall Passing"])

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,66.366592,80.220055,66.366592,80.220055,52.988247
Figueroa High School,District,2949,"$1,884,411.00",$639.00,65.988471,80.739234,65.988471,80.739234,53.204476
Huang High School,District,2917,"$1,910,635.00",$655.00,65.683922,81.316421,65.683922,81.316421,53.513884
Hernandez High School,District,4635,"$3,022,020.00",$652.00,66.752967,80.862999,66.752967,80.862999,53.527508
Johnson High School,District,4761,"$3,094,650.00",$650.00,66.057551,81.222432,66.057551,81.222432,53.539172


In [44]:
# Create a grade level DataFrames.
ninth_graders = school_data_complete_df[(school_data_complete_df["grade"]=="9th")]
tenth_graders = school_data_complete_df[(school_data_complete_df["grade"]=="10th")]
eleventh_graders = school_data_complete_df[(school_data_complete_df["grade"]=="11th")]
twelfth_graders = school_data_complete_df[(school_data_complete_df["grade"]=="12th")]

In [45]:
# Group each school Series by the school name for the average math score.
ninth_graders_math_scores = ninth_graders.groupby("school_name").mean()["math_score"]
tenth_graders_math_scores = tenth_graders.groupby("school_name").mean()["math_score"]
eleventh_graders_math_scores = eleventh_graders.groupby("school_name").mean()["math_score"]
twelfth_graders_math_scores = twelfth_graders.groupby("school_name").mean()["math_score"]

In [48]:
# Group each school Series by the school name for the average reading score.
ninth_graders_reading_scores = ninth_graders.groupby("school_name").mean()["reading_score"]
tenth_graders_reading_scores = tenth_graders.groupby("school_name").mean()["reading_score"]
eleventh_graders_reading_scores = eleventh_graders.groupby("school_name").mean()["reading_score"]
twelfth_graders_reading_scores = twelfth_graders.groupby("school_name").mean()["reading_score"]

In [60]:
# Combine each Series for average math scores by school into single DataFrame.
math_scores_by_grade = pd.DataFrame({
    "9th":ninth_graders_math_scores,
    "10th":tenth_graders_math_scores,
    "11th":eleventh_graders_math_scores,
    "12th":twelfth_graders_math_scores})

In [61]:
# Combine each Series for average reading scores by school into single DataFrame.
reading_scores_by_grade = pd.DataFrame({
    "9th":ninth_graders_reading_scores,
    "10th":tenth_graders_reading_scores,
    "11th":eleventh_graders_reading_scores,
    "12th":twelfth_graders_reading_scores})

In [62]:
# Format the grade-level averages of math-scores to one decimal place
math_scores_by_grade["9th"] = math_scores_by_grade["9th"].map("{:.1f}".format)
math_scores_by_grade["10th"] = math_scores_by_grade["10th"].map("{:.1f}".format)
math_scores_by_grade["11th"] = math_scores_by_grade["11th"].map("{:.1f}".format)
math_scores_by_grade["12th"] = math_scores_by_grade["12th"].map("{:.1f}".format)

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.1,77.0,77.5,76.5
Cabrera High School,83.1,83.2,82.8,83.3
Figueroa High School,76.4,76.5,76.9,77.2
Ford High School,77.4,77.7,76.9,76.2
Griffin High School,82.0,84.2,83.8,83.4
Hernandez High School,77.4,77.3,77.1,77.2
Holden High School,83.8,83.4,85.0,82.9
Huang High School,77.0,75.9,76.4,77.2
Johnson High School,77.2,76.7,77.5,76.9
Pena High School,83.6,83.4,84.3,84.1


In [63]:
# Make sure the columns are in the correct order.
math_scores_by_grade = math_scores_by_grade[["9th","10th","11th","12th"]]

In [64]:
# Remove the index name.
math_scores_by_grade.index.name = None

In [66]:
# Format the grade-level averages of reading-scores to one decimal place
reading_scores_by_grade["9th"] = reading_scores_by_grade["9th"].map("{:.1f}".format)
reading_scores_by_grade["10th"] = reading_scores_by_grade["10th"].map("{:.1f}".format)
reading_scores_by_grade["11th"] = reading_scores_by_grade["11th"].map("{:.1f}".format)
reading_scores_by_grade["12th"] = reading_scores_by_grade["12th"].map("{:.1f}".format)

In [67]:
# Make sure the columns are in the correct order.
reading_scores_by_grade = reading_scores_by_grade[["9th","10th","11th","12th"]]

In [68]:
# Remove the index name.
reading_scores_by_grade.index.name = None

In [69]:
reading_scores_by_grade

Unnamed: 0,9th,10th,11th,12th
Bailey High School,81.3,80.9,80.9,80.9
Cabrera High School,83.7,84.3,83.8,84.3
Figueroa High School,81.2,81.4,80.6,81.4
Ford High School,80.6,81.3,80.4,80.7
Griffin High School,83.4,83.7,84.3,84.0
Hernandez High School,80.9,80.7,81.4,80.9
Holden High School,83.7,83.3,83.8,84.7
Huang High School,81.3,81.5,81.4,80.3
Johnson High School,81.3,80.8,80.6,81.2
Pena High School,83.8,83.6,84.3,84.6
