# PyCitySchools_Original MODIFIED for ease of use
<p><span style="color:blue">See PyCitySchools_Original for full details behind code below</span>.</p>

In [1]:
# Add the Pandas dependency
import pandas as pd

# Locations of the two source CSV files (relative to the notebook).
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read each CSV into its own DataFrame, schools first, then students.
school_data_df, student_data_df = [
    pd.read_csv(csv_path)
    for csv_path in (school_data_to_load, student_data_to_load)
]

# Strip honorific prefixes and professional suffixes from the student names
# (the same cleaning that was worked out in the original notebook).
# Each entry carries its separating space so no stray blank is left behind.
prefixes_suffixes = ["Dr. ", "Mr. ", "Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

# Replace every occurrence of each token with an empty string, "".
# regex=False forces a literal match: when the pattern is treated as a regular
# expression (the default before pandas 2.0) the "." in "Dr. " matches ANY
# character, so e.g. "Mr. " would also strip "Mrs " from a name.
# NOTE: the match is not anchored, so a token appearing in the middle of a
# name is removed as well.
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(
        word, "", regex=False
    )

# Build every district-wide metric from the merged student/school data.

# Merge the two datasets on their shared "school_name" column.
# (The key only needs to be named once; `how` is left at the pandas default.)
school_data_complete_df = pd.merge(student_data_df, school_data_df, on="school_name")

# Count the students using a single column. count() tallies non-null values,
# so this relies on "Student ID" having no missing entries
# (reportedly verified in the original notebook).
student_count = school_data_complete_df["Student ID"].count()

# Count the schools as the number of distinct school names in the merged data.
# Simpler alternative: school_data_df["school_name"].count()
school_count = len(school_data_complete_df["school_name"].unique())

# Total budget comes from the school table (one row per school), not the
# merged table, where each school's budget repeats once per student.
total_budget = school_data_df["budget"].sum()

# Average subject scores across all students.
average_math_score = school_data_complete_df["math_score"].mean()
average_reading_score = school_data_complete_df["reading_score"].mean()

# Passing percentages. A passing score is >= 70.
# 1. Keep only the rows of students who pass each subject.
passing_math = school_data_complete_df[school_data_complete_df["math_score"] >= 70]
passing_reading = school_data_complete_df[school_data_complete_df["reading_score"] >= 70]

# 2. Count the students passing each subject.
passing_math_count = passing_math["student_name"].count()
passing_reading_count = passing_reading["student_name"].count()

# 3. Convert the counts to percentages of all students
#    ("/" is true division in Python 3, so no float() cast is needed).
passing_math_percentage = passing_math_count / student_count * 100
passing_reading_percentage = passing_reading_count / student_count * 100

# 4. Overall passing = passed BOTH subjects; combine the two conditions with "&".
passing_math_reading = school_data_complete_df[
    (school_data_complete_df["math_score"] >= 70)
    & (school_data_complete_df["reading_score"] >= 70)
]

# 5. Count the students who passed both, then express as a percentage.
overall_passing_math_reading_count = passing_math_reading["student_name"].count()
overall_passing_percentage = overall_passing_math_reading_count / student_count * 100

# Collect the district metrics into a single-row summary DataFrame.
# Each dictionary key becomes a column; wrapping the dictionary in a list
# makes it one row.
district_metrics = {
    "Total Schools": school_count,
    "Total Students": student_count,
    "Total Budget": total_budget,
    "Average Math Score": average_math_score,
    "Average Reading Score": average_reading_score,
    "% Passing Math": passing_math_percentage,
    "% Passing Reading": passing_reading_percentage,
    "% Overall Passing": overall_passing_percentage,
}
district_summary_df = pd.DataFrame([district_metrics])

district_summary_df



Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


## Formatting Needed
<p><span style="color:blue">Fewer errors with use of functions</span></p>

In [2]:
# Format the district summary for display.
#
# Kept for reference from the lesson (not executed): a small helper that turns
# a passing count into a percentage.
#   def passing_math_percent(pass_math_count, student_count):
#       return pass_math_count / float(student_count) * 100
#   passing_math_percent(29370, 39170)
#
# map() and str.format can be chained:
#   df["column"] = df["column"].map("{:,}".format)
# NOTE: str.format returns text, so every formatted column holds strings
# afterwards.

# Format spec for each column:
#   "{:,}"      -> thousands separator
#   "${:,.2f}"  -> dollar sign, thousands separator, two decimals
#   "{:.1f}"    -> one decimal place
#   "{:.0f}"    -> rounded to a whole number
column_formats = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
    "% Passing Math": "{:.0f}",
    "% Passing Reading": "{:.0f}",
    "% Overall Passing": "{:.0f}",
}

for column_name, format_spec in column_formats.items():
    district_summary_df[column_name] = district_summary_df[column_name].map(format_spec.format)

district_summary_df

# No need to reorder from current output

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65


## School-level data
<p><span style="color:blue">Modifying and/or reordering the existing columns</span>.</p>

In [3]:
# STEPS 1-3 all read from the school table indexed by school_name, so build
# that indexed view once and pull each column from it.
schools_by_name = school_data_df.set_index("school_name")

# STEP 1: School type per school, also wrapped in a one-column DataFrame.
per_school_types = schools_by_name["type"]
df = pd.DataFrame(per_school_types)

# STEP 2: Total student count per school, taken from the "size" column.
# Alternative: count how often each school_name appears in the merged data:
#   per_school_counts = school_data_complete_df["school_name"].value_counts()
per_school_counts = schools_by_name["size"]

# STEP 3: Budget per student = school budget / number of students at the school.
# Both Series share the school_name index, so the division lines up school by school.
per_school_budget = schools_by_name["budget"]
per_school_capita = per_school_budget / per_school_counts

# STEP 4: Average math and reading score per school.
# We want one value per school rather than per student, so group by
# school_name and take the mean.
# Select the score columns BEFORE aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0 (and wastes work on older versions).
per_school_scores = school_data_complete_df.groupby("school_name")[
    ["math_score", "reading_score"]
].mean()
per_school_math = per_school_scores["math_score"]
per_school_reading = per_school_scores["reading_score"]

# STEP 5: Percentage of students passing each subject, per school.
# Pseudocode:
#   1. A passing score is >= 70.
#   2. Filter the merged data down to the passing rows for each subject
#      (same filters as the district-level calculation).
#   3. Group those rows by school and count the students in each group.
#   4. Divide by each school's total student count and multiply by 100.

# Rows of students who pass each subject.
math_passers = school_data_complete_df[school_data_complete_df["math_score"] >= 70]
reading_passers = school_data_complete_df[school_data_complete_df["reading_score"] >= 70]

# Number of passing students per school.
math_passers_by_school = math_passers.groupby("school_name")["student_name"].count()
reading_passers_by_school = reading_passers.groupby("school_name")["student_name"].count()

# Convert the counts to percentages of each school's enrollment.
per_school_passing_math = math_passers_by_school / per_school_counts * 100
per_school_passing_reading = reading_passers_by_school / per_school_counts * 100

# STEP 6: Overall passing percentage per school (students who pass BOTH subjects).
# Build one boolean condition per subject, then combine them with "&".
meets_math = school_data_complete_df["math_score"] >= 70
meets_reading = school_data_complete_df["reading_score"] >= 70
both_passers = school_data_complete_df[meets_math & meets_reading]

# Number of students per school who passed both math and reading.
per_passing_math_reading = both_passers.groupby("school_name")["student_name"].count()

# Express that count as a percentage of each school's enrollment.
per_overall_passing_percentage = per_passing_math_reading / per_school_counts * 100

# FINAL STEP: Combine the per-school Series into one summary DataFrame.
# Every Series is indexed by school_name, so pandas aligns them into one row
# per school; each dictionary key becomes a column heading.
per_school_columns = {
    "School Type": per_school_types,
    "Total Students": per_school_counts,
    "Total School Budget": per_school_budget,
    "Per Student Budget": per_school_capita,
    "Average Math Score": per_school_math,
    "Average Reading Score": per_school_reading,
    "% Passing Math": per_school_passing_math,
    "% Passing Reading": per_school_passing_reading,
    "% Overall Passing": per_overall_passing_percentage,
}
per_school_summary_df = pd.DataFrame(per_school_columns)

per_school_summary_df.head()

# No need to reorder from current output

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,54.642283
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,54.289887
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,90.599455


### <span style="color:red">This is the end of PyCitySchools_Original.</span>

## Understand Highest and Lowest Performing Schools
<span style="color:blue">To assist with resource allocation</span>

In [4]:
# Rank the schools by the percentage of students passing both subjects.
# sort_values() sorts ascending by default, so the best-first ordering must
# ask for ascending=False explicitly; the worst-first ordering uses the default.
ranking_column = "% Overall Passing"
top_schools = per_school_summary_df.sort_values(by=ranking_column, ascending=False)
bottom_schools = per_school_summary_df.sort_values(by=ranking_column)

top_schools.head()

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
Thomas High School,Charter,1635,1043130,638.0,83.418349,83.84893,93.272171,97.308869,90.948012
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,90.599455
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,90.582567
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,90.540541


In [5]:
# Show the first three rows of bottom_schools.
bottom_schools.head(3)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,2547363,637.0,76.842711,80.744686,66.366592,80.220055,52.988247
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,53.513884
