In [200]:
#prepare - bring in required modules
import pandas as pd
from pathlib import Path

In [201]:
#locate the two source files inside the Resources folder
resources_dir = Path("Resources")
schools_csv = resources_dir / "schools_complete.csv"
students_csv = resources_dir / "students_complete.csv"

#load each csv into its own dataframe
schools_data = pd.read_csv(schools_csv)
students_data = pd.read_csv(students_csv)

In [202]:
#list the school file's column names to spot the column it shares with the student file (used for merging)
schools_data.columns

Index(['School ID', 'school_name', 'type', 'size', 'budget'], dtype='object')

In [203]:
#list the student file's column names to spot the column it shares with the school file (used for merging)
students_data.columns

Index(['Student ID', 'student_name', 'gender', 'year', 'school_name',
       'reading_score', 'maths_score'],
      dtype='object')

In [204]:
#left-join the student rows onto the school details using the shared school_name column
schools_merged_df = schools_data.merge(students_data, how="left", on="school_name")

## Local Government Area Summary

In [205]:
#area-wide calculations on the merged data

#shortcuts to the two score columns used throughout this cell
maths_scores = schools_merged_df["maths_score"]
reading_scores = schools_merged_df["reading_score"]

#headline counts - distinct schools and distinct students
total_schools = schools_merged_df["school_name"].nunique()
total_students = schools_merged_df["Student ID"].nunique()

#total budget - each school's budget is repeated on every student row,
#so keep one row per school before summing
first_schools = schools_merged_df.groupby("school_name").first()
total_budget = first_schools["budget"].sum()

#average scores across all students
av_maths_score = maths_scores.mean()
av_read_score = reading_scores.mean()

#pass masks - a score of 50 or higher is a pass
passed_maths = maths_scores >= 50
passed_reading = reading_scores >= 50

#percentage passing maths
maths_pass = schools_merged_df.loc[passed_maths]
maths_percent_pass = len(maths_pass) / total_students * 100

#percentage passing reading
read_pass = schools_merged_df.loc[passed_reading]
read_percent_pass = len(read_pass) / total_students * 100

#percentage passing both maths and reading
both_pass = schools_merged_df.loc[passed_maths & passed_reading]
both_percent_pass = len(both_pass) / total_students * 100

In [206]:
#show calculations in a dataframe

#collate results - a single record keyed by the column headings to display
lga_summary = [
    {
        "Total Schools": total_schools,
        "Total Students": total_students,
        "Total Budget": total_budget,
        "Average Maths Score": av_maths_score,
        "Average Reading Score": av_read_score,
        "% Passing Maths": maths_percent_pass,
        "% Passing Reading": read_percent_pass,
        "% Overall Pass": both_percent_pass,
    }
]

#create the one-row dataframe
area_summary_df = pd.DataFrame(lga_summary)

#format for display - thousands separators on the head count, currency on the budget
area_summary_df["Total Students"] = (
    area_summary_df["Total Students"].astype(int).map(lambda count: f"{count:,}")
)
area_summary_df["Total Budget"] = (
    area_summary_df["Total Budget"].astype(float).map(lambda amount: f"${amount:,.2f}")
)

### Local Government Area Summary - Results

In [207]:
#display the formatted Local Government Area summary table
area_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Pass
0,15,39170,"$24,649,428.00",70.338192,69.980138,86.078632,84.426857,72.808272


## School Summary

In [208]:
#group the merged student rows by school name so the per-school statistics can reuse one groupby object
schools_grouped = schools_merged_df.groupby(["school_name"])

In [209]:
#calculations per school

#total students - number of distinct student ids in each school
school_students = schools_grouped["Student ID"].nunique().to_frame("Total Students")

#total school budget - school-level values repeat on every student row,
#so take the first row per school
first_school = schools_merged_df.groupby("school_name").first()
school_budget = first_school["budget"]

#per student budget - name the column directly instead of renaming an implicit column 0
school_size = first_school["size"]
per_student = (school_budget / school_size).to_frame("Per Student Budget")

#average maths score
school_av_maths_score = schools_grouped["maths_score"].mean().to_frame("Average Maths Score")

#average reading score
school_av_read_score = schools_grouped["reading_score"].mean().to_frame("Average Reading Score")

#percentage passing maths (50 or higher)
#a school with no passing students is absent from the pass counts, so reindex
#against every school and fill with 0 - otherwise the division yields NaN instead of 0%
math_group_pass_df = schools_merged_df.loc[schools_merged_df["maths_score"] >= 50]
maths_grouped_df = math_group_pass_df.groupby(["school_name"])
math_attempted = schools_grouped["maths_score"].count()
math_pass_sch = maths_grouped_df["maths_score"].count().reindex(math_attempted.index, fill_value=0)
math_percent_final = (math_pass_sch / math_attempted) * 100

#percentage passing reading (50 or higher)
read_group_pass_df = schools_merged_df.loc[schools_merged_df["reading_score"] >= 50]
read_grouped_df = read_group_pass_df.groupby(["school_name"])
read_attempted = schools_grouped["reading_score"].count()
read_pass_sch = read_grouped_df["reading_score"].count().reindex(read_attempted.index, fill_value=0)
read_percent_final = (read_pass_sch / read_attempted) * 100

#percentage passing both maths and reading
all_group_pass_df = schools_merged_df.loc[
    (schools_merged_df["maths_score"] >= 50) & (schools_merged_df["reading_score"] >= 50)
]
all_grouped_df = all_group_pass_df.groupby(["school_name"])
all_attempted = schools_grouped["Student ID"].count()
all_pass_sch = all_grouped_df["Student ID"].count().reindex(all_attempted.index, fill_value=0)
all_percent_final = (all_pass_sch / all_attempted) * 100

In [210]:
#show calculations in a dataframe

#grab required data not based off a calculation
school_type = schools_grouped["type"].first()
school_budget = schools_grouped["budget"].first()

#collate results - order here is the column order of the summary table
school_results = [
    school_type,
    school_students,
    school_budget,
    per_student,
    school_av_maths_score,
    school_av_read_score,
    math_percent_final,
    read_percent_final,
    all_percent_final,
]

#bring data together - one row per school
school_summary = pd.concat(school_results, axis=1)

#rename columns - the pass-rate series still carry the name of the column they were counted on
school_summary = school_summary.rename(
    columns={
        "type": "School Type",
        "budget": "Total School Budget",
        "maths_score": "% Passing Maths",
        "reading_score": "% Passing Reading",
        "Student ID": "% Overall Passing",
    }
)

#format data for currency
#the formatting is applied to school_summary itself: the ranking tables and the spending
#analysis further down read these columns from school_summary, and previously they only
#received the formatted values because pd.DataFrame(frame) shared data with its source.
#that sharing is not guaranteed (it does not happen under pandas Copy-on-Write)
for currency_col in ["Total School Budget", "Per Student Budget"]:
    school_summary[currency_col] = school_summary[currency_col].astype(float).map("${:,.2f}".format)

#create dataframe from collated results
per_school_summary_df = pd.DataFrame(school_summary)

### School Summary - Results

In [211]:
#display the per-school summary table
per_school_summary_df

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,Government,4976,"$3,124,928.00",$628.00,72.352894,71.008842,91.639871,87.379421,80.084405
Cabrera High School,Independent,1858,"$1,081,356.00",$582.00,71.657158,71.359526,90.850377,89.074273,80.785791
Figueroa High School,Government,2949,"$1,884,411.00",$639.00,68.698542,69.077993,81.654798,82.807731,67.650051
Ford High School,Government,2739,"$1,763,916.00",$644.00,69.091274,69.572472,82.438846,82.219788,67.46988
Griffin High School,Independent,1468,"$917,500.00",$625.00,71.788147,71.245232,91.212534,88.487738,81.33515
Hernandez High School,Government,4635,"$3,022,020.00",$652.00,68.874865,69.186408,80.949299,81.877023,66.364617
Holden High School,Independent,427,"$248,087.00",$581.00,72.583138,71.660422,89.929742,88.52459,78.922717
Huang High School,Government,2917,"$1,910,635.00",$655.00,68.935207,68.910525,81.693521,81.453548,66.712376
Johnson High School,Government,4761,"$3,094,650.00",$650.00,68.8431,69.039277,82.062592,81.978576,67.191766
Pena High School,Independent,962,"$585,858.00",$609.00,72.088358,71.613306,91.683992,86.590437,79.209979


## Top Performing Schools (By % Overall Passing)

In [212]:
#rank every school from highest to lowest overall pass rate, then preview the first five
top_schools = school_summary.sort_values(by="% Overall Passing", ascending=False)
top_schools.head(5)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Griffin High School,Independent,1468,"$917,500.00",$625.00,71.788147,71.245232,91.212534,88.487738,81.33515
Cabrera High School,Independent,1858,"$1,081,356.00",$582.00,71.657158,71.359526,90.850377,89.074273,80.785791
Bailey High School,Government,4976,"$3,124,928.00",$628.00,72.352894,71.008842,91.639871,87.379421,80.084405
Wright High School,Independent,1800,"$1,049,400.00",$583.00,72.047222,70.969444,91.777778,86.666667,79.722222
Rodriguez High School,Government,3999,"$2,547,363.00",$637.00,72.047762,70.935984,90.797699,87.396849,79.419855


## Bottom Performing Schools (By % Overall Passing)

In [213]:
#rank every school from lowest to highest overall pass rate, then preview the first five
bottom_schools = school_summary.sort_values(by="% Overall Passing")
bottom_schools.head(5)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Hernandez High School,Government,4635,"$3,022,020.00",$652.00,68.874865,69.186408,80.949299,81.877023,66.364617
Huang High School,Government,2917,"$1,910,635.00",$655.00,68.935207,68.910525,81.693521,81.453548,66.712376
Johnson High School,Government,4761,"$3,094,650.00",$650.00,68.8431,69.039277,82.062592,81.978576,67.191766
Wilson High School,Independent,2283,"$1,319,574.00",$578.00,69.170828,68.876916,82.785808,81.29654,67.455103
Ford High School,Government,2739,"$1,763,916.00",$644.00,69.091274,69.572472,82.438846,82.219788,67.46988


## Maths Scores by Year

In [214]:
#dataframe for maths results per year

#year levels to report, in the order the columns should appear
maths_year_levels = [9, 10, 11, 12]

#average maths score per school for each year level - one single-column frame per year
maths_years = [
    schools_merged_df.loc[schools_merged_df["year"] == year_level]
    .groupby("school_name")["maths_score"]
    .mean()
    .to_frame(f"Year {year_level}")
    for year_level in maths_year_levels
]

#keep the individual per-year frames available under their own names
maths_nine_school, maths_ten_school, maths_eleven_school, maths_twelve_school = maths_years

#bring data together - one row per school, one column per year level
#(this previously finished by wrapping an undefined name, which raises NameError on a fresh kernel)
maths_scores_by_year_df = pd.concat(maths_years, axis=1)

### Maths Score by Year - Results

In [215]:
#display the average maths score per school, by year level
maths_scores_by_year_df

Unnamed: 0_level_0,Year 9,Year 10,Year 11,Year 12
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,72.493827,71.897498,72.3749,72.675097
Cabrera High School,72.32197,72.437768,71.008299,70.604712
Figueroa High School,68.477804,68.331586,68.811001,69.325282
Ford High School,69.021609,69.387006,69.248862,68.617811
Griffin High School,72.789731,71.093596,71.692521,71.469178
Hernandez High School,68.586831,68.867156,69.154412,68.985075
Holden High School,70.543307,75.105263,71.640777,73.409639
Huang High School,69.081754,68.533246,69.431345,68.639316
Johnson High School,69.469286,67.99022,68.63773,69.287393
Pena High School,71.996364,72.396,72.523438,71.187845


## Reading Score by Year 

In [216]:
#dataframe for reading results per year

#year levels to report, in the order the columns should appear
reading_year_levels = [9, 10, 11, 12]

#average reading score per school for each year level - one single-column frame per year
read_years = [
    schools_merged_df.loc[schools_merged_df["year"] == year_level]
    .groupby("school_name")["reading_score"]
    .mean()
    .to_frame(f"Year {year_level}")
    for year_level in reading_year_levels
]

#keep the individual per-year frames available under their own names
read_nine_school, read_ten_school, read_eleven_school, read_twelve_school = read_years

#bring results together - one row per school, one column per year level
#(this previously finished by wrapping an undefined name, which raises NameError on a fresh kernel)
reading_scores_by_year_df = pd.concat(read_years, axis=1)

### Reading Scores by Year - Results

In [217]:
#display the average reading score per school, by year level
reading_scores_by_year_df

Unnamed: 0_level_0,Year 9,Year 10,Year 11,Year 12
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,70.90192,70.848265,70.317346,72.195525
Cabrera High School,71.172348,71.328326,71.201245,71.856021
Figueroa High School,70.261682,67.677588,69.152327,69.082126
Ford High School,69.615846,68.988701,70.735964,68.849722
Griffin High School,72.026895,70.746305,72.385042,69.434932
Hernandez High School,68.477569,70.621842,68.418199,69.244136
Holden High School,71.598425,71.096491,73.31068,70.481928
Huang High School,68.670616,69.516297,68.740638,68.671795
Johnson High School,68.719286,69.295029,69.969115,67.992521
Pena High School,70.949091,72.324,71.703125,71.513812


## Scores by School Spending 

In [218]:
#school performance based on spending

#define spending bins - pd.cut includes the right-hand edge of each bin by default
spending_bins = [0, 585, 630, 645, 680]

#define spending labels
spending_labels = ["<$585", "$585-630", "$630-645", "$645-680"]

#make sure the per student budget is numeric before binning
#the column may hold currency-formatted strings (e.g. "$628.00") or already be numeric,
#for example when this cell is re-run, so only strip the formatting when it is present
per_student_budget = school_summary["Per Student Budget"]
if not pd.api.types.is_numeric_dtype(per_student_budget):
    per_student_budget = (
        per_student_budget.astype(str)
        #regex=False: "$" is a regex anchor, so match it as a literal character
        .str.replace("$", "", regex=False)
        #thousands separators would otherwise break the float conversion
        .str.replace(",", "", regex=False)
    )
school_summary["Per Student Budget"] = per_student_budget.astype(float)

#cut data into bins
school_summary["Spending Ranges (Per Student)"] = pd.cut(
    school_summary["Per Student Budget"], spending_bins, labels=spending_labels
)


  school_summary["Per Student Budget"] = school_summary["Per Student Budget"].str.replace("$", "")


In [219]:
#calculations on the binned data

#group the schools once by spending range and reuse the groupby for every metric
#observed=False is stated explicitly: the key is categorical, and relying on the
#default emits a deprecation warning and would drop empty ranges once the default changes
spending_grouped = school_summary.groupby("Spending Ranges (Per Student)", observed=False)

#average maths score
spending_math_scores = spending_grouped[["Average Maths Score"]].mean()

#average reading score
spending_reading_scores = spending_grouped[["Average Reading Score"]].mean()

#average percentage passing maths
spending_passing_math = spending_grouped[["% Passing Maths"]].mean()

#average percentage passing reading
spending_passing_reading = spending_grouped[["% Passing Reading"]].mean()

#average percentage passing both
overall_passing_spending = spending_grouped[["% Overall Passing"]].mean()

In [220]:
#assemble the spending results into one table

#collate results - list order is the column order of the table
spending_results = [
    spending_math_scores,
    spending_reading_scores,
    spending_passing_math,
    spending_passing_reading,
    overall_passing_spending,
]

#bring data together side by side, aligned on the spending range
spending_summary = pd.concat(spending_results, axis="columns")

#create dataframe
spending_summary_df = pd.DataFrame(spending_summary)

### Scores by School Spending - Results 

In [221]:
#display the average results for each per-student spending range
spending_summary_df

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,71.364587,70.716577,88.835926,86.390517,76.721458
$585-630,72.065868,71.031297,91.518824,87.292423,79.876293
$630-645,69.854807,69.838814,84.686139,83.763585,71.004977
$645-680,68.884391,69.045403,81.56847,81.769716,66.756253


## Scores by School Size

In [222]:
#school performance based on size

#bin edges for student numbers, with one label per bin
size_bins = [0, 1000, 2000, 5000]
size_labels = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

#assign each school to a size band based on its total students
school_summary["School Size"] = pd.cut(
    school_summary["Total Students"],
    bins=size_bins,
    labels=size_labels,
)

In [223]:
#calculations on binned data

#group the schools once by size band and reuse the groupby for every metric
#observed=False is stated explicitly: the key is categorical, and relying on the
#default emits a deprecation warning and would drop empty bands once the default changes
size_grouped = school_summary.groupby("School Size", observed=False)

#average maths score
size_math_scores = size_grouped[["Average Maths Score"]].mean()

#average reading score
size_reading_scores = size_grouped[["Average Reading Score"]].mean()

#average percentage passing maths
size_passing_math = size_grouped[["% Passing Maths"]].mean()

#average percentage passing reading
size_passing_reading = size_grouped[["% Passing Reading"]].mean()

#average percentage passing both
overall_passing_size = size_grouped[["% Overall Passing"]].mean()

In [224]:
#assemble the size results into one table

#collate results - list order is the column order of the table
size_results = [
    size_math_scores,
    size_reading_scores,
    size_passing_math,
    size_passing_reading,
    overall_passing_size,
]

#bring data together side by side, aligned on the size band
size_summary = pd.concat(size_results, axis="columns")

#create dataframe
size_summary_df = pd.DataFrame(size_summary)

### Scores by School Size - Results

In [225]:
#display the average results for each school size band
size_summary_df

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),72.335748,71.636864,90.806867,87.557513,79.066348
Medium (1000-2000),71.42165,70.720164,89.84656,86.714149,78.039785
Large (2000-5000),69.751809,69.576052,84.252804,83.301185,70.293507


## Scores by School Type

In [226]:
#performance based on school type
#no binning required as the type is already given in the original data

#group the schools once by type and reuse the groupby for every metric
type_grouped = school_summary.groupby("School Type")

#average maths score
type_math_scores = type_grouped[["Average Maths Score"]].mean()

#average reading score
type_reading_scores = type_grouped[["Average Reading Score"]].mean()

#average percentage passing maths
type_passing_math = type_grouped[["% Passing Maths"]].mean()

#average percentage passing reading
type_passing_reading = type_grouped[["% Passing Reading"]].mean()

#average percentage passing both
overall_passing_type = type_grouped[["% Overall Passing"]].mean()

In [227]:
#assemble the school type results into one table

#collate results - list order is the column order of the table
type_results = [
    type_math_scores,
    type_reading_scores,
    type_passing_math,
    type_passing_reading,
    overall_passing_type,
]

#bring data together side by side, aligned on the school type
type_summary = pd.concat(type_results, axis="columns")

#create dataframe
type_summary_df = pd.DataFrame(type_summary)

### Scores by School Type - Results

In [228]:
#display the average results for each school type
type_summary_df

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Government,69.834806,69.675929,84.462375,83.587562,70.698993
Independent,71.368822,70.718933,89.204043,86.247789,76.97334


## Summary & Observations


##### - Local Government Area Summary shows the number of schools & students in the area, the overall budget and how students are performing in maths and reading.
##### - The School Summary then breaks the LGA information down to a per-school basis, making it possible to compare an individual school's results to the overall result. The top 5 and bottom 5 schools, ranked by overall passing percentage, are highlighted.
##### - Each school's maths and reading results are presented, broken down by year level. This data can assist with identifying key areas of concern at a more detailed level.
##### - Scores broken down by budget per student, school size and school type give valuable information that may assist future decision making, e.g. whether another school in the area would be beneficial to reduce overall student numbers in the government schools, or whether funding should be redistributed.

##### Larger schools are notably low performers. In "Scores by School Size" the large schools have the lowest results across the board. Also, all 5 schools in the "Bottom Performing Schools" table are large. As large schools make up 8 of the 15 schools in this LGA, it is important to look at individual schools as well: Bailey High School is the largest in the LGA but sits in the top 5 performers overall, an exception to the earlier observation.

##### Looking at "Scores by School Spending", it is interesting to note that the best performance is not attained by the schools with the higher per-student budgets. Pair this with "Scores by School Type", which shows Independent schools to be the better performers: all but 1 of the 8 Independent schools have a budget sitting in one of the bottom 2 per-student spending ranges. This data shows that, overall, an Independent school performs better than a Government school with a higher budget.