# Probabilistic Network for the grades of Portuguese students

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the 'retina' figure format
# (presumably for sharper images on high-DPI screens).
%config InlineBackend.figure_format = 'retina'
# Raise pandas' column display limit to 500 so wide frames are shown in full.
pd.set_option('display.max_columns', 500)

## Preprocessing of the data
This part prepares the data for the model and learning process.

### Load and understand the dataset
The data are loaded and a histogram of the data is created to understand the data and how the features correlate with each other.

In [10]:
# Folder holding the input and output files, relative to this notebook.
# The trailing slash matters: file names are appended by plain string
# concatenation wherever DATA_PATH is used.
DATA_PATH = "../data/"

# The student file is semicolon-separated, hence the explicit `sep`.
original_data = pd.read_csv(f"{DATA_PATH}student-por_2.csv", sep=";")

# Preview the first rows to check that the file was parsed as expected.
original_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,Unnamed: 33
0,MS,M,16,R,GT3,T,1,1,at_home,other,other,father,2,1,0,no,no,no,yes,yes,yes,no,no,3,4,4,3,4,5,6,11,11,11,581558765
1,MS,F,18,R,GT3,T,2,2,other,other,other,mother,2,1,1,no,no,no,no,yes,no,yes,yes,5,5,5,1,1,3,0,8,6,0,677773943
2,MS,M,17,R,GT3,T,1,1,other,services,course,mother,2,1,0,no,yes,no,yes,no,yes,yes,yes,4,5,5,1,3,2,0,10,9,10,58860641
3,GP,M,18,U,LE3,T,1,1,other,other,course,mother,1,1,2,no,no,no,no,yes,no,yes,yes,2,3,5,2,5,4,0,11,9,0,627079796
4,GP,F,18,U,GT3,T,2,1,other,other,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,yes,4,2,5,1,2,1,8,14,14,15,459968853


In [11]:
# Print the schema of the raw frame: column names, non-null counts and dtypes.
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 34 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   school       599 non-null    object
 1   sex          599 non-null    object
 2   age          599 non-null    int64 
 3   address      599 non-null    object
 4   famsize      599 non-null    object
 5   Pstatus      599 non-null    object
 6   Medu         599 non-null    int64 
 7   Fedu         599 non-null    int64 
 8   Mjob         599 non-null    object
 9   Fjob         599 non-null    object
 10  reason       599 non-null    object
 11  guardian     599 non-null    object
 12  traveltime   599 non-null    int64 
 13  studytime    599 non-null    int64 
 14  failures     599 non-null    int64 
 15  schoolsup    599 non-null    object
 16  famsup       599 non-null    object
 17  paid         599 non-null    object
 18  activities   599 non-null    object
 19  nursery      599 non-null    

In [42]:
# Write a plain-text report next to the data files: for every column, its
# sorted distinct values followed by the frequency of each value, with a
# dashed divider between columns.
divider = "-" * 50
with open(DATA_PATH + "values_counts.txt", "w") as report:
    for column in original_data.columns:
        series = original_data[column]
        distinct = series.sort_values().unique()
        frequencies = series.value_counts()
        report.write(f"{column}: {distinct} \n{frequencies}\n{divider}\n\n")

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
original_data.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0
mean,16.752922,2.489149,2.287145,1.564274,1.944908,0.230384,3.949917,3.183639,3.168614,1.489149,2.258765,3.54591,3.597663,11.414023,11.562604,11.90985
std,1.234558,1.135973,1.093161,0.748773,0.823141,0.60129,0.945414,1.048809,1.169548,0.906756,1.27943,1.443755,4.590239,2.754541,2.922466,3.267129
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


## Create numerical scales


In [26]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Build the working frame from the raw data without touching `original_data`
# (drop returns a new frame), removing the column that is not useful.
data = original_data.drop(columns=['Unnamed: 33'])

# Two-valued columns and the 0/1 code assigned to each of their categories.
yes_no = {'no': 0, 'yes': 1}
binary_encodings = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'F': 0, 'M': 1},
    'address': {'U': 0, 'R': 1},
    'famsize': {'LE3': 0, 'GT3': 1},
    'Pstatus': {'T': 0, 'A': 1},
    'schoolsup': yes_no,
    'famsup': yes_no,
    'paid': yes_no,
    'activities': yes_no,
    'nursery': yes_no,
    'higher': yes_no,
    'internet': yes_no,
    'romantic': yes_no,
}

for column, encoding in binary_encodings.items():
    # Series.map turns every value missing from the mapping into NaN without
    # complaint, so reject unexpected categories before encoding.
    unexpected = [value for value in data[column].unique() if value not in encoding]
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains values without a 0/1 encoding: {unexpected}"
        )
    data[column] = data[column].map(encoding)

# Multi-valued categories are one-hot encoded; all other columns pass through.
categorical_columns = ['Mjob', 'Fjob', 'reason', 'guardian']
converter = make_column_transformer(
    (OneHotEncoder(), categorical_columns),
    remainder='passthrough',
    # Always return a dense array; the default threshold may yield a scipy
    # sparse matrix, which cannot be wrapped with named columns below.
    sparse_threshold=0,
    # Keep plain feature names ('Mjob_teacher', 'school', ...) instead of
    # transformer-prefixed ones, so no prefix clean-up is needed afterwards.
    verbose_feature_names_out=False,
)
converted = pd.DataFrame(
    converter.fit_transform(data),
    columns=converter.get_feature_names_out(),
    index=data.index,  # keep rows aligned with `data` / `original_data`
)

converted.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 46 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Mjob_at_home       599 non-null    float64
 1   Mjob_health        599 non-null    float64
 2   Mjob_other         599 non-null    float64
 3   Mjob_services      599 non-null    float64
 4   Mjob_teacher       599 non-null    float64
 5   Fjob_at_home       599 non-null    float64
 6   Fjob_health        599 non-null    float64
 7   Fjob_other         599 non-null    float64
 8   Fjob_services      599 non-null    float64
 9   Fjob_teacher       599 non-null    float64
 10  reason_course      599 non-null    float64
 11  reason_home        599 non-null    float64
 12  reason_other       599 non-null    float64
 13  reason_reputation  599 non-null    float64
 14  guardian_father    599 non-null    float64
 15  guardian_mother    599 non-null    float64
 16  guardian_other     599 non

In [27]:
# Pairwise correlation matrix of all encoded columns, shown as a table whose
# cell backgrounds are coloured with the 'coolwarm' colour map.
correlations = converted.corr()
correlations.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Mjob_at_home,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other,school,sex,age,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
Mjob_at_home,1.0,-0.146053,-0.413874,-0.269068,-0.185626,0.139385,-0.073671,0.009273,-0.014723,-0.086903,0.134687,-0.082434,0.065308,-0.127969,-0.01221,-0.011265,0.041217,0.220128,-0.142403,0.080722,0.170575,-0.022863,-0.041013,-0.378282,-0.211156,0.192918,-0.049678,0.107015,-0.020749,-0.010375,-0.014332,-0.071546,-0.014529,-0.147323,-0.241699,0.040575,-0.02006,-0.016846,-0.039894,-0.018582,-0.028318,-0.085877,-0.02574,-0.173009,-0.146101,-0.132064
Mjob_health,-0.146053,1.0,-0.224656,-0.146053,-0.10076,-0.049685,0.232633,-0.076196,0.023639,-0.009977,-0.110884,-0.012946,0.023542,0.1282,0.043511,-0.01295,-0.049685,-0.085286,0.016928,-0.094046,-0.093025,-0.005171,-0.025361,0.262038,0.136894,-0.12683,-0.004481,-0.054715,-0.055955,0.051247,0.0075,0.003291,0.024629,0.098235,0.095892,0.026423,-0.039266,-0.043235,0.036038,-0.088449,0.013087,0.013219,-0.093897,0.125051,0.127645,0.125394
Mjob_other,-0.413874,-0.224656,1.0,-0.413874,-0.285525,-0.035107,-0.110168,0.235844,-0.147493,-0.098181,-0.03347,0.072941,-0.051103,0.005407,0.067666,-0.1181,0.102068,0.033788,-0.019166,0.05155,0.046264,0.074968,0.02099,-0.247309,-0.206622,0.034439,-0.009008,0.007527,0.056592,-0.109376,-0.03403,-0.069441,-0.094267,-0.029905,-0.065813,0.029907,0.006052,-0.002575,0.01088,-0.011232,-0.043642,-0.012337,0.042368,-0.05035,-0.05522,-0.072376
Mjob_services,-0.269068,-0.146053,-0.413874,1.0,-0.185626,-0.024228,-0.005466,-0.13082,0.158025,0.003912,-0.037876,-0.023627,8.7e-05,0.069109,-0.02191,0.06917,-0.089673,-0.122646,0.065827,-0.035168,-0.102666,-0.022863,0.046905,0.1471,0.117914,-0.07459,0.034746,0.018636,0.032138,0.073605,0.053533,0.075502,0.037151,0.036524,0.143399,-0.069784,0.040474,0.002642,0.047486,0.058057,0.041973,0.058523,0.07133,0.046627,0.030142,0.041851
Mjob_teacher,-0.185626,-0.10076,-0.285525,-0.185626,1.0,-0.053564,0.079958,-0.143173,0.022588,0.26602,0.017888,0.035208,-0.025084,-0.037898,-0.09581,0.117536,-0.053564,-0.10739,0.114198,-0.060575,-0.082095,-0.052042,-0.018992,0.462438,0.325098,-0.101085,0.036768,-0.128467,-0.055629,0.044474,-0.004379,0.098915,0.095468,0.107931,0.148938,-0.030056,-0.003311,0.057814,-0.056139,0.039138,0.038713,0.043333,-0.046633,0.137378,0.129298,0.124342
Fjob_at_home,0.139385,-0.049685,-0.035107,-0.024228,-0.053564,1.0,-0.049716,-0.305448,-0.167009,-0.063549,0.109927,-0.047313,-0.030058,-0.061547,0.012462,-0.030928,0.035599,0.154949,-0.042188,0.053581,-0.021467,-0.037163,0.003948,-0.097606,-0.100938,-0.03192,-0.006475,0.030996,0.015721,0.046186,-0.013078,0.043254,0.029552,-0.100184,-0.119554,0.006149,-0.084931,0.048848,0.007185,-0.026322,-0.048916,-0.050236,0.02784,-0.096127,-0.079022,-0.047925
Fjob_health,-0.073671,0.232633,-0.110168,-0.005466,0.079958,-0.049716,1.0,-0.212221,-0.116036,-0.044153,-0.054496,-0.055172,0.023634,0.102965,0.074865,-0.041822,-0.049716,-0.077565,-0.038787,-0.105909,-0.085104,-0.005281,-0.011545,0.149148,0.231902,-0.115335,0.091541,-0.055802,0.086128,0.108518,-0.009086,0.002172,0.067565,0.064843,0.037409,-0.024699,1.6e-05,-0.06804,-0.010913,-0.049059,-0.04489,0.07783,-0.024219,0.083464,0.081941,0.070607
Fjob_other,0.009273,-0.076196,0.235844,-0.13082,-0.143173,-0.305448,-0.212221,1.0,-0.712901,-0.271267,-0.021054,0.074068,-0.079091,0.010528,-0.136789,0.071772,0.099338,-0.05853,-0.023302,0.053956,0.052108,-0.019787,0.085474,-0.127078,-0.220008,0.097748,-0.03409,0.038684,-0.013307,-0.061175,0.014833,-0.084343,-0.045608,-0.023973,-0.001604,0.011372,0.017751,0.028113,0.028362,-0.032796,-0.00717,0.079241,0.009836,-0.028582,-0.014676,-0.006644
Fjob_services,-0.014723,0.023639,-0.147493,0.158025,0.022588,-0.167009,-0.116036,-0.712901,1.0,-0.14832,-0.006615,-0.031949,0.077017,-0.018126,0.099689,-0.049415,-0.077689,0.05756,0.033401,-0.016542,0.004432,0.001975,-0.105063,0.005969,0.033206,-0.018866,0.01471,0.001828,-0.059548,-0.019045,0.009613,0.044462,0.014428,0.026649,0.048581,-0.016544,0.056706,-0.049121,-0.023302,0.081311,0.091652,-0.097155,-0.020587,-0.018304,-0.042657,-0.05334
Fjob_teacher,-0.086903,-0.009977,-0.098181,0.003912,0.26602,-0.063549,-0.044153,-0.271267,-0.14832,1.0,-0.018876,-0.002825,0.034951,-0.000927,0.028698,0.008271,-0.063549,-0.09603,0.062453,-0.060747,-0.031882,0.085141,0.026335,0.257361,0.345361,-0.050179,-0.020193,-0.078742,0.062026,0.034912,-0.030118,0.047323,-0.015086,0.059015,0.009319,0.020891,-0.058137,0.036297,-0.01522,-0.021737,-0.077125,0.013025,0.007891,0.139594,0.139824,0.117997
