In [2]:
# Import the dependencies 
import pandas as pd

In [3]:
# Establish a connection to the source data file and then read the data into dataframes

original_school_data = "Resources/schools_complete.csv"
original_student_data = "Resources/students_complete.csv"

school_information = pd.read_csv(original_school_data)
student_information = pd.read_csv(original_student_data)

In [4]:
# Combine the data from both files into a single dataframe 

school_student_data = pd.merge(school_information, student_information, how="left", on=["school_name", "school_name"])

# Trim the data to only include the District schools

District_df = school_student_data.loc[school_student_data["type"] == "District"]

District_df

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84
...,...,...,...,...,...,...,...,...,...,...,...
37530,13,Ford High School,District,2739,1763916,37530,William Rivera,M,9th,63,59
37531,13,Ford High School,District,2739,1763916,37531,Bernard Reid,M,9th,94,80
37532,13,Ford High School,District,2739,1763916,37532,Dawn Sawyer,F,12th,91,66
37533,13,Ford High School,District,2739,1763916,37533,Thomas Knapp,M,12th,75,70


In [5]:
# Calculate the total number of District Schools
total_Dschools_list = District_df[["School ID"]]

total_Dschools = total_Dschools_list.nunique()

# Display the total number of the schools listed  
total_Dschools

School ID    7
dtype: int64

In [6]:
# Calculate the total number of District students
total_Dstudents_list = District_df[['Student ID']]

total_Dstudents = total_Dstudents_list.nunique()

# Display the total number of students
total_Dstudents 

Student ID    26976
dtype: int64

In [10]:
# Calculate the total District budget 
District_budgets = school_information.loc[school_information['type'] == "District"]


Total_District_budget = District_budgets["budget"].sum()

Total_District_budget


17347923

In [11]:
# Calculate the average math score for the District 
District_math = District_df[["math_score"]]

avg_District_math = District_math.mean()

avg_District_math


math_score    76.987026
dtype: float64

In [12]:
# Calculate the average reading score for the District

District_reading = District_df[["reading_score"]]

avg_District_reading = District_reading.mean()

avg_District_reading

reading_score    80.962485
dtype: float64

In [58]:
# % passing math (the percentage of District students who passed math)
bins = [0,59.9,100]

group_names = ["Fail", "Pass"]

# Create a new column that will represent the records that passed or failed for math
District_df["Passing/Failing Math"] = pd.cut(District_df["math_score"], bins, labels =group_names)

District_df

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score,Passing/Failing Math
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79,Pass
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61,Pass
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60,Pass
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58,Fail
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...
37531,13,Ford High School,District,2739,1763916,37531,Bernard Reid,M,9th,94,80,Pass
37532,13,Ford High School,District,2739,1763916,37532,Dawn Sawyer,F,12th,91,66,Pass
37533,13,Ford High School,District,2739,1763916,37533,Thomas Knapp,M,12th,75,70,Pass
37534,13,Ford High School,District,2739,1763916,37534,Melissa Porter,F,9th,76,84,Pass


In [62]:
# Remove the NaN row 
District_df = District_df.dropna(how = 'all')

District_df

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score,Passing/Failing Math
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79,Pass
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61,Pass
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60,Pass
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58,Fail
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...
37530,13,Ford High School,District,2739,1763916,37530,William Rivera,M,9th,63,59,Fail
37531,13,Ford High School,District,2739,1763916,37531,Bernard Reid,M,9th,94,80,Pass
37532,13,Ford High School,District,2739,1763916,37532,Dawn Sawyer,F,12th,91,66,Pass
37533,13,Ford High School,District,2739,1763916,37533,Thomas Knapp,M,12th,75,70,Pass


In [82]:
# Filter the Passing/Failing Math column to include only the passing students in the dataframe

District_passing_math = District_df.loc[District_df["Passing/Failing Math"] == "Pass"]

District_passing_math

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score,Passing/Failing Math,Passing/Failing Reading
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79,Pass,Pass
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61,Pass,Pass
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60,Pass,Pass
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84,Pass,Pass
5,0,Huang High School,District,2917,1910635,5,Bryan Miranda,M,9th,94,94,Pass,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37529,13,Ford High School,District,2739,1763916,37529,Victoria Rodriguez,F,11th,76,69,Pass,Pass
37531,13,Ford High School,District,2739,1763916,37531,Bernard Reid,M,9th,94,80,Pass,Pass
37532,13,Ford High School,District,2739,1763916,37532,Dawn Sawyer,F,12th,91,66,Pass,Pass
37533,13,Ford High School,District,2739,1763916,37533,Thomas Knapp,M,12th,75,70,Pass,Pass


In [83]:
# Total number of passing math grades in the District
count_District_passing_math = District_passing_math['Passing/Failing Math'].value_counts()

total_District_passing_math = count_District_passing_math.sum()

total_District_passing_math

24017

In [84]:

# Calculate the total number of math students
District_Students = District_df['Passing/Failing Math'].value_counts()

total_District_math = District_Students.sum()

# Display the total number of students
total_District_math 

26976

In [85]:
# Percentage of passing math grades 

District_passing_math = '{:.2%}'.format(total_District_passing_math/total_District_Students) 

District_passing_math

'89.03%'

In [86]:
# % passing math (the percentage of District students who passed math)
bins = [0,59.9,100]

group_names = ["Fail", "Pass"]

# Create a new column that will represent the records that passed or failed for math
District_df["Passing/Failing Reading"] = pd.cut(District_df["reading_score"], bins, labels =group_names)

District_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  District_df["Passing/Failing Reading"] = pd.cut(District_df["reading_score"], bins, labels =group_names)


Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score,Passing/Failing Math,Passing/Failing Reading
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79,Pass,Pass
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61,Pass,Pass
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60,Pass,Pass
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58,Fail,Pass
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84,Pass,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37530,13,Ford High School,District,2739,1763916,37530,William Rivera,M,9th,63,59,Fail,Pass
37531,13,Ford High School,District,2739,1763916,37531,Bernard Reid,M,9th,94,80,Pass,Pass
37532,13,Ford High School,District,2739,1763916,37532,Dawn Sawyer,F,12th,91,66,Pass,Pass
37533,13,Ford High School,District,2739,1763916,37533,Thomas Knapp,M,12th,75,70,Pass,Pass


In [87]:
# Filter the Passing/Failing Reading column to include only the passing students in the dataframe

District_passing_reading = District_df.loc[District_df["Passing/Failing Reading"] == "Pass"]


# Total number of passing grades
count_District_passing_reading = District_passing_reading['Passing/Failing Reading'].value_counts()

total_District_passing_reading = count_District_passing_reading.sum()

total_District_passing_reading


26976

In [88]:
# Total number of reading grades

count_District_reading = District_df["Passing/Failing Reading"].value_counts()

total_District_reading = count_District_reading.sum()

total_District_reading

26976

In [89]:
# Percentage of passing reading grades 

District_pass_reading = '{:.0%}'.format(total_District_passing_reading/total_District_reading)

District_pass_reading

'100%'

In [90]:
total_overall_passing = '{:.0%}'.format((total_District_passing_math + total_District_passing_reading)/(total_District_math + total_District_reading))

total_overall_passing

'95%'