# Probabilistic Network for the Grades of Portuguese Students

In [1]:
# Create a virtual environment in ../.env and install the packages listed in ../requirements.txt.
# NOTE(review): each `!` command presumably runs in its own subshell, so the `source` line
# below likely does not switch the environment of the running kernel, while `%pip` installs
# into the kernel's own environment — confirm which environment is actually used.
!python -m venv ../.env
!source ../.env/bin/activate
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import dataframe_image as dfi
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format = 'retina'
# Allow pandas to display up to 500 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 500)

## Preprocessing of the data
This part prepares the data for the model and learning process.

### Load and understand the dataset
The data are loaded and inspected, and a histogram is created, to understand the variables and how they correlate with each other.

In [9]:
# Directories for the input data and for generated documentation artefacts.
DATA_PATH = "../data/"
DOC_PATH = "../doc/"

# Read the semicolon-separated file used for fitting and preview its first rows.
training_csv = DATA_PATH + "student-por_2.csv"
original_data = pd.read_csv(training_csv, sep=";")
original_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,Unnamed: 33
0,MS,M,16,R,GT3,T,1,1,at_home,other,other,father,2,1,0,no,no,no,yes,yes,yes,no,no,3,4,4,3,4,5,6,11,11,11,581558765
1,MS,F,18,R,GT3,T,2,2,other,other,other,mother,2,1,1,no,no,no,no,yes,no,yes,yes,5,5,5,1,1,3,0,8,6,0,677773943
2,MS,M,17,R,GT3,T,1,1,other,services,course,mother,2,1,0,no,yes,no,yes,no,yes,yes,yes,4,5,5,1,3,2,0,10,9,10,58860641
3,GP,M,18,U,LE3,T,1,1,other,other,course,mother,1,1,2,no,no,no,no,yes,no,yes,yes,2,3,5,2,5,4,0,11,9,0,627079796
4,GP,F,18,U,GT3,T,2,1,other,other,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,yes,4,2,5,1,2,1,8,14,14,15,459968853


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 34 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   school       599 non-null    object
 1   sex          599 non-null    object
 2   age          599 non-null    int64 
 3   address      599 non-null    object
 4   famsize      599 non-null    object
 5   Pstatus      599 non-null    object
 6   Medu         599 non-null    int64 
 7   Fedu         599 non-null    int64 
 8   Mjob         599 non-null    object
 9   Fjob         599 non-null    object
 10  reason       599 non-null    object
 11  guardian     599 non-null    object
 12  traveltime   599 non-null    int64 
 13  studytime    599 non-null    int64 
 14  failures     599 non-null    int64 
 15  schoolsup    599 non-null    object
 16  famsup       599 non-null    object
 17  paid         599 non-null    object
 18  activities   599 non-null    object
 19  nursery      599 non-null    

### Check for invalid values and value counts

In [5]:
# Write, for every column, its sorted distinct values and their frequencies to a
# text report so unexpected or malformed entries can be spotted by eye.
separator = "-" * 50
with open(DATA_PATH + "values_counts.txt", "w") as report:
    for column in original_data.columns:
        series = original_data[column]
        distinct = series.sort_values().unique()
        frequencies = series.value_counts()
        report.write(f"{column}: {distinct} \n{frequencies}\n" + separator + "\n\n")

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
original_data.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0,599.0
mean,16.752922,2.489149,2.287145,1.564274,1.944908,0.230384,3.949917,3.183639,3.168614,1.489149,2.258765,3.54591,3.597663,11.414023,11.562604,11.90985
std,1.234558,1.135973,1.093161,0.748773,0.823141,0.60129,0.945414,1.048809,1.169548,0.906756,1.27943,1.443755,4.590239,2.754541,2.922466,3.267129
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


## Create numerical scales


In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Start from a fresh frame without the unnamed trailing column, which is not used.
data = original_data.drop(columns=['Unnamed: 33'])

# Two-valued columns are recoded to 0/1; all yes/no columns share one mapping.
yes_no = {'yes': 1, 'no': 0}
binary_encodings = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'F': 0, 'M': 1},
    'address': {'U': 0, 'R': 1},
    'famsize': {'LE3': 0, 'GT3': 1},
    'Pstatus': {'T': 0, 'A': 1},
    **dict.fromkeys(
        ['schoolsup', 'famsup', 'paid', 'activities',
         'nursery', 'higher', 'internet', 'romantic'],
        yes_no,
    ),
}
for column, encoding in binary_encodings.items():
    data[column] = data[column].map(encoding)

# Multi-valued nominal columns are one-hot encoded; every other column passes through.
nominal_columns = ['Mjob', 'Fjob', 'reason', 'guardian']
converter = make_column_transformer(
    (OneHotEncoder(), nominal_columns),
    remainder='passthrough',
    verbose_feature_names_out=False,
)
encoded_values = converter.fit_transform(data)
converted = pd.DataFrame(encoded_values, columns=converter.get_feature_names_out())

converted.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 46 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Mjob_at_home       599 non-null    float64
 1   Mjob_health        599 non-null    float64
 2   Mjob_other         599 non-null    float64
 3   Mjob_services      599 non-null    float64
 4   Mjob_teacher       599 non-null    float64
 5   Fjob_at_home       599 non-null    float64
 6   Fjob_health        599 non-null    float64
 7   Fjob_other         599 non-null    float64
 8   Fjob_services      599 non-null    float64
 9   Fjob_teacher       599 non-null    float64
 10  reason_course      599 non-null    float64
 11  reason_home        599 non-null    float64
 12  reason_other       599 non-null    float64
 13  reason_reputation  599 non-null    float64
 14  guardian_father    599 non-null    float64
 15  guardian_mother    599 non-null    float64
 16  guardian_other     599 non

In [11]:
# Pairwise correlations of all encoded features, colour-coded over the whole table
# (axis=None) and displayed with two decimals.
# `Styler.set_precision` is deprecated (it emits a FutureWarning) and was removed in
# pandas 2.0; `Styler.format(precision=...)` is the supported replacement.
corr_styled = (
    converted.corr()
    .style
    .background_gradient(cmap='RdYlGn', axis=None)
    .format(precision=2)
)
# Optional exports of the styled matrix (currently disabled):
# dfi.export(corr_styled, DOC_PATH + "correlation_matrix.png", max_cols=-1)
# corr_styled.to_excel(DOC_PATH + "correlation_matrix.xlsx")

  corr_styled.set_precision(2).to_excel(DOC_PATH + "correlation_matrix.xlsx")


## Create Network

In [13]:
from pgmpy.models import BayesianNetwork

# Fit on the original (non-encoded) columns, minus the unnamed trailing column.
model_data = original_data.drop(columns=['Unnamed: 33'])

# Create Network
network = BayesianNetwork()

# Every column becomes a node, including those that end up without any edge.
for column_name in model_data.columns:
    network.add_node(column_name)

# Both period grades are parents of the final grade.
for period_grade in ("G1", "G2"):
    network.add_edge(period_grade, "G3")

# The same background variables are parents of G1 and of G2.
# (A variant that routed them through an intermediate "situation" node is disabled.)
grade_parents = [
    "Walc", "Dalc", "school", "studytime",
    "higher", "failures", "Fedu", "Medu",
]
for period_grade in ("G1", "G2"):
    for parent in grade_parents:
        network.add_edge(parent, period_grade)

print(network.nodes)
print(model_data.columns)
print(network.edges)

# Fit data
network.fit(model_data)



['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']
Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')
[('school', 'G1'), ('school', 'G2'), ('Medu', 'G1'), ('Medu', 'G2'), ('Fedu', 'G1'), ('Fedu', 'G2'), ('studytime', 'G1'), ('studytime', 'G2'), ('failures', 'G1'), ('failures', 'G2'), ('higher', 'G1'), ('higher', 'G2'), ('Dalc', 'G1'), ('Dalc', 'G2'), ('Walc', 'G1'), ('Walc', 'G2'), ('G1

In [14]:
# Load the evaluation file and separate the target grade from the evidence columns.
test_data = pd.read_csv(DATA_PATH + "student-por_3.csv", sep=";")
label = test_data.pop('G3')
# G2 is withheld as well, so it is not available as evidence for the prediction.
test_data = test_data.drop(columns=['Unnamed: 33', 'G2'])

# Predict
results = network.predict(test_data, stochastic=False)

# Show each predicted final grade next to the true one.
predicted_g3 = results['G3']
for row in range(len(results)):
    print(f"Predicted: {predicted_g3[row]}, Actual: {label[row]}")

  0%|          | 0/599 [00:00<?, ?it/s]

Predicted: 11, Actual: 10
Predicted: 11, Actual: 11
Predicted: 13, Actual: 10
Predicted: 11, Actual: 11
Predicted: 11, Actual: 12
Predicted: 11, Actual: 12
Predicted: 11, Actual: 11
Predicted: 8, Actual: 8
Predicted: 13, Actual: 14
Predicted: 10, Actual: 10
Predicted: 7, Actual: 7
Predicted: 13, Actual: 15
Predicted: 11, Actual: 11
Predicted: 7, Actual: 7
Predicted: 0, Actual: 0
Predicted: 16, Actual: 17
Predicted: 0, Actual: 10
Predicted: 6, Actual: 8
Predicted: 17, Actual: 15
Predicted: 13, Actual: 11
Predicted: 10, Actual: 9
Predicted: 8, Actual: 8
Predicted: 13, Actual: 13
Predicted: 15, Actual: 14
Predicted: 11, Actual: 11
Predicted: 16, Actual: 16
Predicted: 10, Actual: 10
Predicted: 11, Actual: 14
Predicted: 13, Actual: 14
Predicted: 13, Actual: 13
Predicted: 10, Actual: 10
Predicted: 13, Actual: 15
Predicted: 9, Actual: 9
Predicted: 11, Actual: 12
Predicted: 11, Actual: 13
Predicted: 15, Actual: 15
Predicted: 11, Actual: 12
Predicted: 12, Actual: 15
Predicted: 11, Actual: 10
Pr

## Evaluation

In [21]:
# Largest absolute grade difference that still counts as a correct prediction.
MATCH_LESS_EQUAL = 0


def distance(a, b):
    """Return the absolute difference between ``a`` and ``b`` (scalars or arrays)."""
    return abs(a - b)


def _absolute_errors(results, label):
    """Return the per-row absolute error between predicted and actual G3.

    Predictions and labels are paired by position rather than by index label,
    so frames whose index is not 0..n-1 (e.g. a filtered test set) are handled.

    Raises:
        ValueError: if ``results`` is empty or its length differs from ``label``.
    """
    predicted = np.asarray(results['G3'])
    actual = np.asarray(label)
    if len(predicted) == 0:
        raise ValueError("results is empty; evaluation metrics are undefined")
    if len(predicted) != len(actual):
        raise ValueError(
            f"results has {len(predicted)} rows but label has {len(actual)}"
        )
    return distance(predicted, actual)


def accuracy(results, label):
    """Return the share of rows whose error is at most ``MATCH_LESS_EQUAL``."""
    errors = _absolute_errors(results, label)
    return float(np.count_nonzero(errors <= MATCH_LESS_EQUAL) / len(errors))


def average_error(results, label):
    """Return the mean absolute error over all rows."""
    errors = _absolute_errors(results, label)
    return float(errors.sum() / len(errors))


def average_error_without_match(results, label):
    """Return the mean absolute error over the rows that do not count as a match.

    Returns 0.0 when every prediction matches, instead of dividing by zero.
    """
    errors = _absolute_errors(results, label)
    misses = errors[errors > MATCH_LESS_EQUAL]
    if len(misses) == 0:
        return 0.0
    return float(misses.sum() / len(misses))


def loss(results, label):
    """Return the mean squared error over all rows."""
    errors = _absolute_errors(results, label)
    return float((errors ** 2).sum() / len(errors))

# Score the predictions with each metric, then report everything on one line.
ac, av_err, av_err_without = (
    metric(results, label)
    for metric in (accuracy, average_error, average_error_without_match)
)
print(f"Accuracy: {ac}, Average Error: {av_err}, Average Error Without Match: {av_err_without}, Loss: {loss(results, label)}")

Accuracy: 0.44908180300500833, Average Error: 1.2303839732888147, Average Error Without Match: 2.2333333333333334, Loss: 5.828046744574291
