In [1]:
#prepare - bring in required modules
import pandas as pd
from pathlib import Path

In [2]:
#locations of the two source files, relative to the notebook's working directory
schools_csv, students_csv = (
    Path("Resources/schools_complete.csv"),
    Path("Resources/students_complete.csv"),
)

#load each csv into its own dataframe (schools first, then students)
schools_data, students_data = (
    pd.read_csv(csv_path) for csv_path in (schools_csv, students_csv)
)

In [3]:
#list the column labels of the schools dataframe so they can be compared with
#the students dataframe when choosing the column to merge on
schools_data.columns

Index(['School ID', 'school_name', 'type', 'size', 'budget'], dtype='object')

In [4]:
#list the column labels of the students dataframe; any label that also appears
#in the schools dataframe is a candidate key for the merge
students_data.columns

Index(['Student ID', 'student_name', 'gender', 'year', 'school_name',
       'reading_score', 'maths_score'],
      dtype='object')

In [5]:
#combine the two dataframes on their shared school_name column; the left join
#keeps every row of schools_data and attaches the matching student rows
schools_merged_df = schools_data.merge(students_data, how="left", on="school_name")

#preview the first rows of the merged result
schools_merged_df.head()

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,year,reading_score,maths_score
0,0,Huang High School,Government,2917,1910635,0,Paul Bradley,M,9,96,94
1,0,Huang High School,Government,2917,1910635,1,Victor Smith,M,12,90,43
2,0,Huang High School,Government,2917,1910635,2,Kevin Rodriguez,M,12,41,76
3,0,Huang High School,Government,2917,1910635,3,Richard Scott,M,12,89,86
4,0,Huang High School,Government,2917,1910635,4,Bonnie Ray,F,9,87,69


In [14]:
#run calculations on merged data
#NOTE: only the final expression of a cell is displayed, so each result is
#simply assigned here; the bare-name lines that used to follow every
#calculation produced no output and have been removed

#total schools - count each school name once
total_schools = schools_merged_df["school_name"].nunique()

#total students - count each student id once
total_students = schools_merged_df["Student ID"].nunique()

#total budget - the budget is repeated on every student row, so take a single
#row per school before summing rather than summing the whole column
first_schools = schools_merged_df.groupby("school_name").first()
total_budget = first_schools["budget"].sum()

#average maths score
av_maths_score = schools_merged_df["maths_score"].mean()

#average reading score
av_read_score = schools_merged_df["reading_score"].mean()

#pass masks - a score of 50 or higher is a pass; each comparison is evaluated
#once here and reused for the single-subject and combined pass rates
passed_maths = schools_merged_df["maths_score"] >= 50
passed_reading = schools_merged_df["reading_score"] >= 50

#percentage passing maths - 50% or higher
maths_pass = schools_merged_df.loc[passed_maths]
maths_percent_pass = (len(maths_pass)/total_students)*100

#percentage passing reading - 50% or higher
read_pass = schools_merged_df.loc[passed_reading]
read_percent_pass = (len(read_pass)/total_students)*100

#percentage passing both maths and reading
both_pass = schools_merged_df.loc[passed_maths & passed_reading]
both_percent_pass = (len(both_pass)/total_students)*100

#cell output - the overall pass percentage
both_percent_pass

72.80827163645647

In [16]:
#show calculations in a dataframe

#collect every headline figure into a single record keyed by its column label
lga_summary = [
    {
        "Total Schools": total_schools,
        "Total Students": total_students,
        "Total Budget": total_budget,
        "Average Maths Score": av_maths_score,
        "Average Reading Score": av_read_score,
        "% Passing Maths": maths_percent_pass,
        "% Passing Reading": read_percent_pass,
        "% Overall Pass": both_percent_pass,
    }
]

#build the one-row dataframe, then replace two columns with display strings:
#student count with thousands separators, budget as currency to two decimals
lga_summary_df = pd.DataFrame(lga_summary).assign(
    **{
        "Total Students": lambda frame: frame["Total Students"].astype(int).map("{:,}".format),
        "Total Budget": lambda frame: frame["Total Budget"].astype(float).map("${:,.2f}".format),
    }
)

#view
lga_summary_df


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Pass
0,15,39170,"$24,649,428.00",70.338192,69.980138,86.078632,84.426857,72.808272
