# SUSTAINABLE EDUCATION


In [578]:
#importing the required Libraries


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import r2_score
import math


import pymysql
from sqlalchemy import create_engine

from getpass import getpass

In [579]:
#importing the dataframes

grades_region = pd.read_excel('grades.xlsx')
districts = pd.read_excel('districts.xlsx')
coastal = pd.read_excel('coastal.xlsx')



In [580]:
#Visualizing the raw data

In [581]:
grades_region.head()

Unnamed: 0,Contexto,Ranking,Escola,Concelho,Média
0,Favorável,16.º,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32.º,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37.º,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38.º,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40.º,Escola Secundária Infanta D. Maria,Coimbra,3.91


In [582]:
print(grades_region.shape)

(1119, 5)


In [583]:
districts.head()

Unnamed: 0,District /,Name of municipality[a],Area (km2),Area rank[b],Pop. total (2011)[c],Pop. rank[b],PD (per km2 in 2011)[c],PD rank[b],Number of parishes[1],Ruling party 2017-2021[d][e]
0,Autonomous region,,,,,,,,,
1,Aveiro,Albergaria-a-Velha,155.4,196.0,25.497,99.0,164.0,83.0,6.0,CDS
2,Aveiro,Oliveira do Bairro,87.3,260.0,22.365,115.0,256.0,59.0,4.0,CDS
3,Aveiro,Vale de Cambra,146.5,204.0,24.761,103.0,169.0,80.0,7.0,CDS
4,Aveiro,Águeda,335.3,89.0,49.691,58.0,148.0,91.0,11.0,IND


In [584]:
districts.shape

(311, 10)

In [585]:
coastal.head()

Unnamed: 0,District,Geog
0,Lisboa,Coastal
1,Porto,Coastal
2,Braga,Coastal
3,Setúbal,Coastal
4,Aveiro,Coastal


In [586]:
coastal.shape

(18, 2)

# CLEANING THE DATA

# 1 - Dropping unnecessary columns and choosing better names




In [587]:
districts_2 = districts[['District /','Name of municipality[a]']]

districts_2 = districts_2.rename(columns={'District /': 'District','Name of municipality[a]': 'Municipality'})


districts_2


Unnamed: 0,District,Municipality
0,Autonomous region,
1,Aveiro,Albergaria-a-Velha
2,Aveiro,Oliveira do Bairro
3,Aveiro,Vale de Cambra
4,Aveiro,Águeda
...,...,...
306,Viseu,Viseu
307,Viseu,Vouzela
308,Viseu,Nelas
309,Viseu,Castro Daire


In [588]:
grades_region = grades_region.rename (columns={'Contexto':'Context','Escola':'School','Concelho':'Municipality','Média':'Grade'})

grades_region

Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16.º,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32.º,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37.º,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38.º,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40.º,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,-,Colégio São Filipe,Setúbal,2.30
1115,Privado,-,Colégio D. Filipa,Amadora,2.25
1116,Privado,-,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,-,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


In [589]:
coastal = coastal.rename (columns={'Geog':'Region'})

coastal

Unnamed: 0,District,Region
0,Lisboa,Coastal
1,Porto,Coastal
2,Braga,Coastal
3,Setúbal,Coastal
4,Aveiro,Coastal
5,Leiria,Coastal
6,Santarém,Coastal
7,Coimbra,Coastal
8,Faro,Coastal
9,Viseu,Non-Coastal


# 2 - Cleanning the NAN

In [590]:
# 2.1 - checking the % of NAN on each column
# 2.1.1 - dataset: districts_2

In [591]:
districts_2.shape



(311, 2)

In [592]:
round(districts_2.isna().sum()/len(districts_2),4)*100



District        0.64
Municipality    0.96
dtype: float64

In [593]:
# very few NAN. we have more than 300 rows, and the %NAN is similar for both columns  (likely top be related)
# so is worth to get rid of those lines with NAN

In [594]:
districts_2 = districts_2.dropna()

districts_2


Unnamed: 0,District,Municipality
1,Aveiro,Albergaria-a-Velha
2,Aveiro,Oliveira do Bairro
3,Aveiro,Vale de Cambra
4,Aveiro,Águeda
5,Aveiro,Anadia
...,...,...
306,Viseu,Viseu
307,Viseu,Vouzela
308,Viseu,Nelas
309,Viseu,Castro Daire


In [595]:
round(districts_2.isna().sum()/len(districts_2),4)*100   # all NAN cleaned  !

District        0.0
Municipality    0.0
dtype: float64

In [596]:
# 2.1.2 - dataset: grades_region

In [597]:
grades_region.shape

(1119, 5)

In [598]:
round(grades_region.isna().sum()/len(grades_region),4)*100

Context         28.87
Ranking          0.27
School           0.00
Municipality     0.00
Grade            0.00
dtype: float64

In [599]:
# For the column "Context" we have a significant % of NAN (28,87%).  

In [600]:
grades_region['Context'].value_counts(dropna=False)

NaN             323
Favorável       313
Desfavorável    297
Privado         186
Name: Context, dtype: int64

In [601]:
# The NAN are related with situations where the conext is "Normal" (neither Favourable or unfavourable. 
# This way, we can replace the NAN by "Normal"

In [602]:
grades_region['Context']=grades_region['Context'].fillna('Normal')

grades_region

Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16.º,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32.º,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37.º,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38.º,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40.º,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,-,Colégio São Filipe,Setúbal,2.30
1115,Privado,-,Colégio D. Filipa,Amadora,2.25
1116,Privado,-,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,-,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


In [603]:
grades_region['Context'].value_counts(dropna=False)

Normal          323
Favorável       313
Desfavorável    297
Privado         186
Name: Context, dtype: int64

In [604]:
# About the column ranking: we have more than 1000 rows, so we can drop the rows whith NAN in the raking column (only 0,27%)

grades_region = grades_region.dropna()

grades_region

Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16.º,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32.º,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37.º,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38.º,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40.º,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,-,Colégio São Filipe,Setúbal,2.30
1115,Privado,-,Colégio D. Filipa,Amadora,2.25
1116,Privado,-,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,-,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


In [605]:
round(grades_region.isna().sum()/len(grades_region),4)*100   # all NAN cleaned  !

Context         0.0
Ranking         0.0
School          0.0
Municipality    0.0
Grade           0.0
dtype: float64

In [606]:
# 3 - removing the strings ".º" and "-"

In [607]:
grades_region['Ranking']=grades_region['Ranking'].str.replace('.º','') 
#grades_region['Ranking']=grades_region['Ranking'].str.replace('-','') 
grades_region['Ranking']=grades_region['Ranking'].str.replace(' ','-') 
grades_region

  grades_region['Ranking']=grades_region['Ranking'].str.replace('.º','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grades_region['Ranking']=grades_region['Ranking'].str.replace('.º','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grades_region['Ranking']=grades_region['Ranking'].str.replace(' ','-')


Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,-,Colégio São Filipe,Setúbal,2.30
1115,Privado,-,Colégio D. Filipa,Amadora,2.25
1116,Privado,-,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,-,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


# Exporting the tables to SQL and CSV


In [608]:
# 1) Choosing the best order for the columns before exporting them (first column = key)

In [609]:
districts_2 = districts_2[['Municipality','District']]
districts_2

Unnamed: 0,Municipality,District
1,Albergaria-a-Velha,Aveiro
2,Oliveira do Bairro,Aveiro
3,Vale de Cambra,Aveiro
4,Águeda,Aveiro
5,Anadia,Aveiro
...,...,...
306,Viseu,Viseu
307,Vouzela,Viseu
308,Nelas,Viseu
309,Castro Daire,Viseu


In [610]:
grades_region = grades_region[['School','Context','Municipality','Grade']]
grades_region

Unnamed: 0,School,Context,Municipality,Grade
0,Escola Artística de Música do Conservatório Na...,Favorável,Lisboa,4.07
1,Escola Artística do Conservatório de Música Ca...,Favorável,Braga,3.94
2,Escola Básica Vasco da Gama,Favorável,Lisboa,3.92
3,Escola Básica e Secundária da Quinta das Flores,Favorável,Coimbra,3.92
4,Escola Secundária Infanta D. Maria,Favorável,Coimbra,3.91
...,...,...,...,...
1114,Colégio São Filipe,Privado,Setúbal,2.30
1115,Colégio D. Filipa,Privado,Amadora,2.25
1116,Instituto Jacob Rodrigues Pereira,Privado,Lisboa,2.25
1117,Colégio Maria Pia (Casa Pia de Lisboa),Privado,Lisboa,2.19


In [611]:
coastal

Unnamed: 0,District,Region
0,Lisboa,Coastal
1,Porto,Coastal
2,Braga,Coastal
3,Setúbal,Coastal
4,Aveiro,Coastal
5,Leiria,Coastal
6,Santarém,Coastal
7,Coimbra,Coastal
8,Faro,Coastal
9,Viseu,Non-Coastal


In [612]:
# exporting the data to a SQL database  

In [613]:
password=getpass()    # safe password to access the database

········


In [614]:
# creatimg the database "Educação" in the SQL server

connection_string = 'mysql+pymysql://root:'+password+'@localhost/educacao'
engine=create_engine(connection_string)


In [615]:

# creating the tables in the database Educação: districts_2 , grades_region , coastal

districts_2.to_sql(con=engine, name='municipality', if_exists='replace', index=False) 
grades_region.to_sql(con=engine, name='school', if_exists='replace', index=False) 
coastal.to_sql(con=engine, name='districts', if_exists='replace', index=False) 


In [616]:
# creating a backup of the tables in a csv file

districts_2.to_csv('municipality.csv', index=False)
grades_region.to_csv('school.csv', index=False)
coastal.to_csv('districts.csv', index=False)


# IMPORTING A SUMMARY TABLE FROM AN SQL DATABASE



In [617]:
query= 'SELECT * FROM educacao.school JOIN educacao.municipality USING (Municipality) JOIN educacao.districts USING (District);'

cleaned_data = pd.read_sql_query(query,engine)
cleaned_data

Unnamed: 0,District,Municipality,School,Context,Grade,Region
0,Lisboa,Lisboa,Escola Artística de Música do Conservatório Na...,Favorável,4.07,Coastal
1,Braga,Braga,Escola Artística do Conservatório de Música Ca...,Favorável,3.94,Coastal
2,Lisboa,Lisboa,Escola Básica Vasco da Gama,Favorável,3.92,Coastal
3,Coimbra,Coimbra,Escola Básica e Secundária da Quinta das Flores,Favorável,3.92,Coastal
4,Coimbra,Coimbra,Escola Secundária Infanta D. Maria,Favorável,3.91,Coastal
...,...,...,...,...,...,...
1106,Setúbal,Setúbal,Colégio São Filipe,Privado,2.30,Coastal
1107,Lisboa,Amadora,Colégio D. Filipa,Privado,2.25,Coastal
1108,Lisboa,Lisboa,Instituto Jacob Rodrigues Pereira,Privado,2.25,Coastal
1109,Lisboa,Lisboa,Colégio Maria Pia (Casa Pia de Lisboa),Privado,2.19,Coastal


# AGENDA TO ADDRESS 

In [618]:
# I - Does economic health of a family impact the children´s school grades?
# II - Is it true that portuguese children from non-costal regions have lower grades?
# III - Is there a strong relationship between a school´s ranking and the average grades of its students?

In [619]:
# reviewing the content of column "Context" 

cleaned_data['Context'].value_counts(dropna=False)

Normal          322
Favorável       313
Desfavorável    297
Privado         179
Name: Context, dtype: int64

In [620]:
# Calculating the average grade of the schools located in privileged locations

In [621]:
avg_grade_priv = (cleaned_data[cleaned_data['Context']=='Favorável'].agg({'Grade':sum})       \
+cleaned_data[cleaned_data['Context']=='Privado'].agg({'Grade':sum}))/       \
(cleaned_data[cleaned_data['Context']=='Favorável'].agg({'Grade':'count'})       \
+cleaned_data[cleaned_data['Context']=='Privado'].agg({'Grade':'count'}))


print("Average grade of the schools located in privileged locations :", avg_grade_priv)

Average grade of the schools located in privileged locations : Grade    3.254593
dtype: float64


In [622]:
# Calculating the average grade of the schools located in non-privileged locations

avg_grade_npriv = (cleaned_data[cleaned_data['Context']=='Desfavorável'].agg({'Grade':sum}) ) /       \
(cleaned_data[cleaned_data['Context']=='Desfavorável'].agg({'Grade':'count'}))


print("Average grade of the schools located in non-privileged locations :", avg_grade_npriv)

Average grade of the schools located in non-privileged locations : Grade    2.785993
dtype: float64


In [623]:
# I - CONCLUSION : 
# Yes, we may conclude that the economic health of a family impacts children’s progress in school.
# The schools located in geographies with privileged infrastructures and purchasing power have a positive  average grade, 
# but schools located in relatively poor places have negative average grade (below 3).

In [624]:
# reviewing the content of column "Region" 

cleaned_data['Region'].value_counts(dropna=False)

Coastal        878
Non-Coastal    233
Name: Region, dtype: int64

In [625]:
# Calculating the average grade of the schools located in Coastal regions


avg_grade_coastal = (cleaned_data[cleaned_data['Region']=='Coastal'].agg({'Grade':sum}) ) /       \
(cleaned_data[cleaned_data['Region']=='Coastal'].agg({'Grade':'count'}))


print("Average grade of the schools located in Coastal Regions :", avg_grade_coastal)

Average grade of the schools located in Coastal Regions : Grade    3.03779
dtype: float64


In [626]:
# Calculating the average grade of the schools located in Non-coastal regions


avg_grade_ncoastal = (cleaned_data[cleaned_data['Region']=='Non-Coastal'].agg({'Grade':sum}) ) /       \
(cleaned_data[cleaned_data['Region']=='Non-Coastal'].agg({'Grade':'count'}))


print("Average grade of the schools located in Non-Coastal Regions :", avg_grade_ncoastal)

Average grade of the schools located in Non-Coastal Regions : Grade    3.003734
dtype: float64


In [627]:
# Comment: The average grade is slightly higher within the Coastal regions, but we must also check where are located the 
# Districts with negative average grades, as that represents regions with lack of Governmental support


In [628]:

cleaned_data

Unnamed: 0,District,Municipality,School,Context,Grade,Region
0,Lisboa,Lisboa,Escola Artística de Música do Conservatório Na...,Favorável,4.07,Coastal
1,Braga,Braga,Escola Artística do Conservatório de Música Ca...,Favorável,3.94,Coastal
2,Lisboa,Lisboa,Escola Básica Vasco da Gama,Favorável,3.92,Coastal
3,Coimbra,Coimbra,Escola Básica e Secundária da Quinta das Flores,Favorável,3.92,Coastal
4,Coimbra,Coimbra,Escola Secundária Infanta D. Maria,Favorável,3.91,Coastal
...,...,...,...,...,...,...
1106,Setúbal,Setúbal,Colégio São Filipe,Privado,2.30,Coastal
1107,Lisboa,Amadora,Colégio D. Filipa,Privado,2.25,Coastal
1108,Lisboa,Lisboa,Instituto Jacob Rodrigues Pereira,Privado,2.25,Coastal
1109,Lisboa,Lisboa,Colégio Maria Pia (Casa Pia de Lisboa),Privado,2.19,Coastal


In [629]:
# Grouping by District and excluding the islands

cleaned_data2 = cleaned_data.groupby(['District','Region']).agg({'Grade':np.mean})

cleaned_data2


Unnamed: 0_level_0,Unnamed: 1_level_0,Grade
District,Region,Unnamed: 2_level_1
Aveiro,Coastal,3.125211
Beja,Non-Coastal,2.812593
Braga,Coastal,3.1225
Bragança,Non-Coastal,2.90375
Castelo Branco,Non-Coastal,3.045
Coimbra,Coastal,3.165179
Faro,Coastal,2.909474
Guarda,Non-Coastal,3.087826
Leiria,Coastal,3.054815
Lisboa,Coastal,3.026147


In [630]:
# how much % of the Districts having negative average grade are located in Non-Coastal regions?

In [631]:
# Total Districts with negative average grades (<3)

cleaned_data3=cleaned_data2[cleaned_data2['Grade']<3]
cleaned_data3

Unnamed: 0_level_0,Unnamed: 1_level_0,Grade
District,Region,Unnamed: 2_level_1
Beja,Non-Coastal,2.812593
Bragança,Non-Coastal,2.90375
Faro,Coastal,2.909474
Portalegre,Non-Coastal,2.81381
Setúbal,Coastal,2.792976
Évora,Non-Coastal,2.95087


In [632]:
nr_D_neg=len(cleaned_data3)
nr_D_neg

6

In [633]:
# Total Non-Coastel Districts with negative average grades (<3)

cleaned_data4 = cleaned_data2[(cleaned_data2['Grade']<3) & (cleaned_data2['Region'].isin(['Non-Coastal']))]
cleaned_data4

KeyError: 'Region'

In [None]:


# cleaned_data.to_excel('cleaned_data_rascunho.xlsx', index=False



# ensuring that the column Ranking becomes integer

# grades_region['Ranking'].value_counts(dropna=False)




# grades_region.dtypes


# converting the Ranking into an integer


# pd.to_numeric(grades_region['Ranking'], errors='coerce')
#grades_region['Ranking']=grades_region['Ranking'].fillna('NA')

# grades_region





# grades_region['Ranking'].value_counts(dropna=False)



