# SUSTAINABLE EDUCATION


In [301]:
#importing the required Libraries


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import r2_score
import math

In [302]:
#importing the dataframes

grades_region = pd.read_excel('grades.xlsx')
districts = pd.read_excel('districts.xlsx')
coastal = pd.read_excel('coastal.xlsx')



In [303]:
#Visualizing the raw data

In [304]:
grades_region.head()

Unnamed: 0,Contexto,Ranking,Escola,Concelho,Média
0,Favorável,16.º,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32.º,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37.º,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38.º,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40.º,Escola Secundária Infanta D. Maria,Coimbra,3.91


In [305]:
print(grades_region.shape)

(1119, 5)


In [306]:
districts.head()

Unnamed: 0,District /,Name of municipality[a],Area (km2),Area rank[b],Pop. total (2011)[c],Pop. rank[b],PD (per km2 in 2011)[c],PD rank[b],Number of parishes[1],Ruling party 2017-2021[d][e]
0,Autonomous region,,,,,,,,,
1,Aveiro,Albergaria-a-Velha,155.4,196.0,25.497,99.0,164.0,83.0,6.0,CDS
2,Aveiro,Oliveira do Bairro,87.3,260.0,22.365,115.0,256.0,59.0,4.0,CDS
3,Aveiro,Vale de Cambra,146.5,204.0,24.761,103.0,169.0,80.0,7.0,CDS
4,Aveiro,Águeda,335.3,89.0,49.691,58.0,148.0,91.0,11.0,IND


In [307]:
districts.shape

(311, 10)

In [308]:
coastal.head()

Unnamed: 0,District,Geog
0,Lisboa,Coastal
1,Porto,Coastal
2,Braga,Coastal
3,Setúbal,Coastal
4,Aveiro,Coastal


In [309]:
coastal.shape

(18, 2)

# CLEANING THE DATA

# 1 - Dropping unnecessary columns and choosing better names




In [310]:
districts_2 = districts[['District /','Name of municipality[a]']]

districts_2 = districts_2.rename(columns={'District /': 'District','Name of municipality[a]': 'Municipality'})


districts_2


Unnamed: 0,District,Municipality
0,Autonomous region,
1,Aveiro,Albergaria-a-Velha
2,Aveiro,Oliveira do Bairro
3,Aveiro,Vale de Cambra
4,Aveiro,Águeda
...,...,...
306,Viseu,Viseu
307,Viseu,Vouzela
308,Viseu,Nelas
309,Viseu,Castro Daire


In [311]:
grades_region = grades_region.rename (columns={'Contexto':'Context','Escola':'School','Concelho':'Municipality','Média':'Grade'})

grades_region

Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16.º,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32.º,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37.º,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38.º,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40.º,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,-,Colégio São Filipe,Setúbal,2.30
1115,Privado,-,Colégio D. Filipa,Amadora,2.25
1116,Privado,-,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,-,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


In [312]:
coastal = coastal.rename (columns={'Geog':'Region'})

coastal

Unnamed: 0,District,Region
0,Lisboa,Coastal
1,Porto,Coastal
2,Braga,Coastal
3,Setúbal,Coastal
4,Aveiro,Coastal
5,Leiria,Coastal
6,Santarém,Coastal
7,Coimbra,Coastal
8,Faro,Coastal
9,Viseu,Non-Coastal


# 2 - Cleanning the NAN

In [313]:
# 2.1 - checking the % of NAN on each column
# 2.1.1 - dataset: districts_2

In [314]:
districts_2.shape



(311, 2)

In [315]:
round(districts_2.isna().sum()/len(districts_2),4)*100



District        0.64
Municipality    0.96
dtype: float64

In [316]:
# very few NAN. we have more than 300 rows, and the %NAN is similar for both columns  (likely top be related)
# so is worth to get rid of those lines with NAN

In [317]:
districts_2 = districts_2.dropna()

districts_2


Unnamed: 0,District,Municipality
1,Aveiro,Albergaria-a-Velha
2,Aveiro,Oliveira do Bairro
3,Aveiro,Vale de Cambra
4,Aveiro,Águeda
5,Aveiro,Anadia
...,...,...
306,Viseu,Viseu
307,Viseu,Vouzela
308,Viseu,Nelas
309,Viseu,Castro Daire


In [318]:
round(districts_2.isna().sum()/len(districts_2),4)*100   # all NAN cleaned  !

District        0.0
Municipality    0.0
dtype: float64

In [319]:
# 2.1.2 - dataset: grades_region

In [320]:
grades_region.shape

(1119, 5)

In [321]:
round(grades_region.isna().sum()/len(grades_region),4)*100

Context         28.87
Ranking          0.27
School           0.00
Municipality     0.00
Grade            0.00
dtype: float64

In [322]:
# For the column "Context" we have a significant % of NAN (28,87%).  

In [323]:
grades_region['Context'].value_counts(dropna=False)

NaN             323
Favorável       313
Desfavorável    297
Privado         186
Name: Context, dtype: int64

In [324]:
# The NAN are related with situations where the conext is "Normal" (neither Favourable or unfavourable. 
# This way, we can replace the NAN by "Normal"

In [325]:
grades_region['Context']=grades_region['Context'].fillna('Normal')

grades_region

Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16.º,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32.º,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37.º,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38.º,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40.º,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,-,Colégio São Filipe,Setúbal,2.30
1115,Privado,-,Colégio D. Filipa,Amadora,2.25
1116,Privado,-,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,-,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


In [326]:
grades_region['Context'].value_counts(dropna=False)

Normal          323
Favorável       313
Desfavorável    297
Privado         186
Name: Context, dtype: int64

In [327]:
# About the column ranking: we have more than 1000 rows, so we can drop the rows whith NAN in the raking column (only 0,27%)

grades_region = grades_region.dropna()

grades_region

Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16.º,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32.º,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37.º,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38.º,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40.º,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,-,Colégio São Filipe,Setúbal,2.30
1115,Privado,-,Colégio D. Filipa,Amadora,2.25
1116,Privado,-,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,-,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


In [328]:
round(grades_region.isna().sum()/len(grades_region),4)*100   # all NAN cleaned  !

Context         0.0
Ranking         0.0
School          0.0
Municipality    0.0
Grade           0.0
dtype: float64

In [329]:
# 3 - removing the strings ".º" and "-"

In [330]:
grades_region['Ranking']=grades_region['Ranking'].str.replace('.º','') 
grades_region['Ranking']=grades_region['Ranking'].str.replace('-','') 
grades_region['Ranking']=grades_region['Ranking'].str.replace(' ','') 
grades_region

  grades_region['Ranking']=grades_region['Ranking'].str.replace('.º','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grades_region['Ranking']=grades_region['Ranking'].str.replace('.º','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grades_region['Ranking']=grades_region['Ranking'].str.replace('-','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,,Colégio São Filipe,Setúbal,2.30
1115,Privado,,Colégio D. Filipa,Amadora,2.25
1116,Privado,,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


In [331]:
# ensuring that the column Ranking becomes integer

grades_region['Ranking'].value_counts(dropna=False)

       142
16       1
421      1
379      1
380      1
      ... 
321      1
324      1
338      1
339      1
962      1
Name: Ranking, Length: 975, dtype: int64

In [332]:
grades_region.dtypes

Context          object
Ranking          object
School           object
Municipality     object
Grade           float64
dtype: object

In [333]:
# converting the Ranking into an integer


pd.to_numeric(grades_region['Ranking'], errors='coerce')
#grades_region['Ranking']=grades_region['Ranking'].fillna('Not_Available')

grades_region


Unnamed: 0,Context,Ranking,School,Municipality,Grade
0,Favorável,16,Escola Artística de Música do Conservatório Na...,Lisboa,4.07
1,Favorável,32,Escola Artística do Conservatório de Música Ca...,Braga,3.94
2,Favorável,37,Escola Básica Vasco da Gama,Lisboa,3.92
3,Favorável,38,Escola Básica e Secundária da Quinta das Flores,Coimbra,3.92
4,Favorável,40,Escola Secundária Infanta D. Maria,Coimbra,3.91
...,...,...,...,...,...
1114,Privado,,Colégio São Filipe,Setúbal,2.30
1115,Privado,,Colégio D. Filipa,Amadora,2.25
1116,Privado,,Instituto Jacob Rodrigues Pereira,Lisboa,2.25
1117,Privado,,Colégio Maria Pia (Casa Pia de Lisboa),Lisboa,2.19


In [334]:
grades_region['Ranking'].value_counts(dropna=False)

       142
16       1
421      1
379      1
380      1
      ... 
321      1
324      1
338      1
339      1
962      1
Name: Ranking, Length: 975, dtype: int64

# EXPORTING THE CLEANED DATA TO A CSV (FROM WHERE WILL BE UPLOADED TO AN SQL DATABASE)

In [335]:
#   !!!  put school, Municipality and District on the first column of each table as a Serial

In [336]:
# Questions:

# Does economic health of a family impact the children´s school grades?
# Children from non-costal regions have lower grades?
# Is there a relationship between a school´s ranking and the average grades of its students?

