# In this notebook, we will read csv files containing data about various schools and students thereof, and create dataframes therefrom in order to help the school board and mayor make strategic decisions regarding future school budgets and priorities.

In [3]:
# We will import the pandas package, so that we can read csv files, create dataframes therefrom, and perform data analysis.
# We will also import numpy to assist in mathematical computations.
import pandas as pd
import numpy as np

pd.options.display.float_format = "{:,}".format

In [4]:
# This creates file paths to our school data and student data csv files, so that we can later create dataframes therefrom.
schools_complete_file_path = "Resources/schools_complete.csv"
students_complete_file_path = "Resources/students_complete.csv"

In [5]:
schools_dataframe = pd.read_csv(schools_complete_file_path)
students_dataframe = pd.read_csv(students_complete_file_path)

In [6]:
schools_dataframe.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [7]:
students_dataframe.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [8]:
students_schools_merged_dataframe = pd.merge(students_dataframe, schools_dataframe, how="left", on=["school_name", "school_name"])
students_schools_merged_dataframe.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [9]:
# By using the .nunique() method, we count the number of unique values in the school_name column; this is the total number of schools.
total_schools = students_schools_merged_dataframe["school_name"].nunique()

# By using the .count() method, we count the number of fields in the student_name column; this is the total number of students.
total_students = students_schools_merged_dataframe["student_name"].count()

# By using the .sum() method, we calculate the sum of each school budget.  We must first use the .unique() method, to ensure that we do not add the budget from the same school more than once.
total_budget = students_schools_merged_dataframe["budget"].unique().sum()

# By using the .mean method, we calculate the average math score of every student.
average_math = students_schools_merged_dataframe["math_score"].mean()

# By using the .mean method, we calculate the average math score of every student.
average_math = students_schools_merged_dataframe["math_score"].mean()

# By using the .mean method, we calculate the average reading score of every student.
average_reading = students_schools_merged_dataframe["reading_score"].mean()

# By using the function average from the numpy package, we can calculate the average of the values written therein.
overall_passing_rate = np.mean([average_math, average_reading])

# We ust a pandas feature that allows us to count the number of values in a column based on a condition- in our case, grades greater than or equal to 70- then we use the len() function to get the length of this list.
# We then divide this number by the length of the entire list, and multiply of 100, giving us the percent of passing grades.
percent_passing_math = len(students_schools_merged_dataframe[students_schools_merged_dataframe["math_score"] >= 70]) / students_schools_merged_dataframe["math_score"].count() * 100

# We ust a pandas feature that allows us to count the number of values in a column based on a condition- in our case, grades greater than or equal to 70- then we use the len() function to get the length of this list.
# We then divide this number by the length of the entire list, and multiply of 100, giving us the percent of passing grades.
percent_passing_reading = len(students_schools_merged_dataframe[students_schools_merged_dataframe["reading_score"] >= 70]) / students_schools_merged_dataframe["reading_score"].count() * 100

In [10]:
district_summary_data = [{"Total Schools":total_schools,
                          "Total Students":total_students,
                          "Total Budget":total_budget,
                         "Average Math Score":average_math,
                         "Average Reading Score":average_reading,
                         "Percent Overall Passing Rate":overall_passing_rate,
                         "Percent Passing Math":percent_passing_math,
                         "Percent Passing Reading":percent_passing_reading}]

In [11]:
district_summary = pd.DataFrame(data=district_summary_data, columns=["Total Schools",
                                                                     "Total Students",
                                                                    "Total Budget",
                                                                    "Average Math Score",
                                                                    "Average Reading Score",
                                                                    "Percent Overall Passing Rate",
                                                                    "Percent Passing Math",
                                                                    "Percent Passing Reading"])
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Percent Overall Passing Rate,Percent Passing Math,Percent Passing Reading
0,15,39170,24649428,78.98537145774827,81.87784018381414,80.43160582078121,74.9808526933878,85.80546336482001


## In this section, we create a dataframe that summarizes data per school.

In [12]:
school_names = [name for name in schools_dataframe["school_name"]]
school_types = [type for type in schools_dataframe["type"]]
total_students = [size for size in schools_dataframe["size"]]
budgets = [budget for budget in schools_dataframe["budget"]]

average_math_scores = students_dataframe.groupby("school_name", sort=False)["math_score"].mean()
average_reading_scores = students_dataframe.groupby("school_name", sort=False)["reading_score"].mean()

percent_passing_math_by_school = students_dataframe[(students_dataframe["math_score"] >= 70)].groupby("school_name", sort=False)["math_score"].count() / students_dataframe.groupby("school_name", sort=False)["math_score"].count() * 100
percent_passing_reading_by_school = students_dataframe[(students_dataframe["reading_score"] >= 70)].groupby("school_name", sort=False)["reading_score"].count() / students_dataframe.groupby("school_name", sort=False)["math_score"].count() * 100

In [13]:
percent_passing_math_list = [percent for percent in percent_passing_math_by_school]

In [14]:
type(percent_passing_reading_by_school)

pandas.core.series.Series

In [15]:
average_math_scores = average_math_scores.tolist()
average_reading_scores = average_reading_scores.tolist()

percent_passing_reading_by_school = percent_passing_reading_by_school.tolist()

In [16]:
type(percent_passing_reading_by_school)

list

In [27]:
school_summary = pd.DataFrame()
school_summary["School Name"] = school_names
school_summary["School Type"] = school_types
school_summary["Total Students"] = total_students
school_summary["Total Budget"] = budgets

school_summary["Per Student Budget"] = school_summary["Total Budget"] / school_summary["Total Students"]

school_summary["Average Math Score"] = average_math_scores
school_summary["Average Reading Score"] = average_reading_scores

school_summary["Percent Passing Math"] = percent_passing_math_list
school_summary["Percent Passing Reading"] = percent_passing_reading_by_school

school_summary["Overall Passing Rate"] = school_summary[["Percent Passing Math", "Percent Passing Reading"]].mean(axis=1)

school_summary = school_summary.set_index(["School Name"])

In [28]:
school_summary

Unnamed: 0_level_0,School Type,Total Students,Total Budget,Per Student Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,1910635,655.0,76.62941378128214,81.18272197463148,65.68392183750429,81.31642098045938,73.50017140898183
Figueroa High School,District,2949,1884411,639.0,76.71176670057646,81.15801966768396,65.98847066802306,80.73923363852154,73.36385215327229
Shelton High School,Charter,1761,1056600,600.0,83.3594548551959,83.72572402044293,93.8671209540034,95.85462805224304,94.86087450312324
Hernandez High School,District,4635,3022020,652.0,77.28975188781014,80.9344120819849,66.7529665587918,80.86299892125135,73.80798274002157
Griffin High School,Charter,1468,917500,625.0,83.35149863760218,83.816757493188,93.39237057220708,97.13896457765668,95.26566757493188
Wilson High School,Charter,2283,1319574,578.0,83.2742006132282,83.98948751642575,93.8677179150241,96.53964082347788,95.203679369251
Cabrera High School,Charter,1858,1081356,582.0,83.06189451022605,83.97578040904197,94.1334768568353,97.03982777179765,95.58665231431648
Bailey High School,District,4976,3124928,628.0,77.04843247588424,81.03396302250803,66.68006430868168,81.93327974276528,74.30667202572349
Holden High School,Charter,427,248087,581.0,83.80327868852459,83.81498829039812,92.50585480093676,96.25292740046838,94.37939110070258
Pena High School,Charter,962,585858,609.0,83.83991683991684,84.04469854469855,94.5945945945946,95.94594594594594,95.27027027027026


In [26]:
school_summary.nlargest(5, "Overall Passing Rate", keep='first')

Unnamed: 0_level_0,School Type,Total Students,Total Budget,Per Student Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1081356,582.0,83.06189451022605,83.97578040904197,94.1334768568353,97.03982777179765,95.58665231431648
Thomas High School,Charter,1635,1043130,638.0,83.4183486238532,83.84892966360856,93.27217125382263,97.30886850152906,95.29051987767583
Pena High School,Charter,962,585858,609.0,83.83991683991684,84.04469854469855,94.5945945945946,95.94594594594594,95.27027027027026
Griffin High School,Charter,1468,917500,625.0,83.35149863760218,83.816757493188,93.39237057220708,97.13896457765668,95.26566757493188
Wilson High School,Charter,2283,1319574,578.0,83.2742006132282,83.98948751642575,93.8677179150241,96.53964082347788,95.203679369251


In [25]:
school_summary.nsmallest(5, "Overall Passing Rate", keep="first")

Unnamed: 0_level_0,School Type,Total Students,Total Budget,Per Student Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,2547363,637.0,76.84271067766942,80.74468617154288,66.36659164791197,80.22005501375344,73.2933233308327
Figueroa High School,District,2949,1884411,639.0,76.71176670057646,81.15801966768396,65.98847066802306,80.73923363852154,73.36385215327229
Huang High School,District,2917,1910635,655.0,76.62941378128214,81.18272197463148,65.68392183750429,81.31642098045938,73.50017140898183
Johnson High School,District,4761,3094650,650.0,77.07246376811594,80.96639361478681,66.0575509346776,81.2224322621298,73.6399915984037
Ford High School,District,2739,1763916,644.0,77.10259218692954,80.74625775830594,68.3096020445418,79.29901423877328,73.80430814165754
