# Aprendizagem Automática Avançada
## Assignment 5
### Luís Ferreirinha Nº51127
### Christopher Anaya Nº60566

# Problem 1

In [1]:
from pomegranate import *

We will start by creating the Bayesian network provided in the assignment.

In [2]:

# --- Probability tables ---------------------------------------------------
# A is the root node and gets a prior. In every conditional table a row lists
# the parent value(s) first (in the order of the parents list), then the node's
# own value, then the probability.

# P(A)
node_a = DiscreteDistribution({'T': 0.3, 'F': 0.7})

# P(B | A)
node_b = ConditionalProbabilityTable(
    [
        ['F', 'T', 0.4],
        ['F', 'F', 0.6],
        ['T', 'T', 0.8],
        ['T', 'F', 0.2],
    ],
    [node_a],
)

# P(C | A, B)
node_c = ConditionalProbabilityTable(
    [
        ['F', 'F', 'T', 0.1],
        ['F', 'F', 'F', 0.9],
        ['F', 'T', 'T', 0.7],
        ['F', 'T', 'F', 0.3],
        ['T', 'F', 'T', 0.5],
        ['T', 'F', 'F', 0.5],
        ['T', 'T', 'T', 0.99],
        ['T', 'T', 'F', 0.01],
    ],
    [node_a, node_b],
)

# P(D | B)
node_d = ConditionalProbabilityTable(
    [
        ['F', 'T', 0.55],
        ['F', 'F', 0.45],
        ['T', 'T', 0.2],
        ['T', 'F', 0.8],
    ],
    [node_b],
)

# --- Graph nodes ----------------------------------------------------------
# Wrap each distribution in a named node, in the order a, b, c, d.
s_a, s_b, s_c, s_d = [
    Node(distribution, name=label)
    for distribution, label in (
        (node_a, "a"),
        (node_b, "b"),
        (node_c, "c"),
        (node_d, "d"),
    )
]

# --- Network assembly -----------------------------------------------------
# The order in which states are added (a, b, c, d) is the order used for the
# evidence lists passed to the model in the queries.
model = BayesianNetwork("Problem 1")
model.add_states(s_a, s_b, s_c, s_d)

# Directed edges, parent -> child: A->B, A->C, B->C, B->D
for parent, child in ((s_a, s_b), (s_a, s_c), (s_b, s_c), (s_b, s_d)):
    model.add_edge(parent, child)

# Finalise the structure so the model can be queried
model.bake()

With the model created, we can now query it to compute the requested conditional probabilities.

a) $P(A = T | C = T, D = T)$

In [25]:
# predict_proba returns one entry per node for each sample, so the result also
# covers the other nodes; index [0][0] picks the first sample and node A, whose
# distribution parameters we print.
# Evidence is ordered [a, b, c, d]: C = T and D = T are observed here.
evidence_a = [None, None, 'T', 'T']
posterior_a = model.predict_proba([evidence_a])[0][0]
print(posterior_a.parameters)

[{'T': 0.5054138717420109, 'F': 0.49458612825798914}]


b) $P(A = T | D = F)$

In [18]:
# Evidence is ordered [a, b, c, d]: only D = F is observed.
# Index [0][0] selects the first sample and node A.
evidence_b = [None, None, None, 'F']
posterior_b = model.predict_proba([evidence_b])[0][0]
print(posterior_b.parameters)

[{'T': 0.34651898734177244, 'F': 0.6534810126582277}]


c) $P(B=T | C = T)$

In [19]:
# Evidence is ordered [a, b, c, d]: only C = T is observed.
# Index [0][1] selects the first sample and node B.
evidence_c = [None, None, 'T', None]
posterior_c = model.predict_proba([evidence_c])[0][1]
print(posterior_c.parameters)

[{'T': 0.8100843263425553, 'F': 0.1899156736574448}]


d) $P(B=T | A=T, C=T)$

In [22]:
# Evidence is ordered [a, b, c, d]: A = T and C = T are observed.
# Index [0][1] selects the first sample and node B.
evidence_d = ['T', None, 'T', None]
posterior_d = model.predict_proba([evidence_d])[0][1]
print(posterior_d.parameters)

[{'T': 0.8878923766816139, 'F': 0.11210762331838604}]


e) $P(C=T | A=F, B=F, D=F)$

In [23]:
# Evidence is ordered [a, b, c, d]: A = F, B = F and D = F are observed.
# Index [0][2] selects the first sample and node C.
evidence_e = ['F', 'F', None, 'F']
posterior_e = model.predict_proba([evidence_e])[0][2]
print(posterior_e.parameters)

[{'T': 0.10000000000000016, 'F': 0.8999999999999999}]


# Problem 2

In order to address this problem, we have selected the Iris dataset as the basis for comparing the performance of two classification algorithms: Naive Bayes and Random Forest.

In [24]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.model_selection import cross_val_score
# Load the Iris dataset and separate the feature matrix from the class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 33% of the samples for testing; the fixed seed keeps the split
# identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

a) 

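We will train and test a Gaussian Naive Bayes classifier. Since this model has very few hyperparameters (only the class priors and the variance-smoothing term), we will not tune it and will simply train it with the default settings.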

In [44]:
# Fit a default Gaussian Naive Bayes model on the training split
# (fit returns the estimator itself, so the calls can be chained)
gnb = GaussianNB().fit(X_train, y_train)

# Evaluate on the held-out test split
predictions = gnb.predict(X_test)
gnb_report = classification_report(y_test, predictions)
print("GaussianNB:", gnb_report, sep="\n")

GaussianNB:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.93      0.93      0.93        15
           2       0.94      0.94      0.94        16

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50



b)

Now we will perform a grid search to find the best hyperparameters for a RandomForestClassifier.

In [26]:
# Hyperparameter grid explored for the random forest: every combination of the
# values below is evaluated.
grid_param = [
    {
        "n_estimators": [10, 50, 100],
        "criterion": ["gini", "entropy"],
        "max_depth": [5, 10, 15, 20],
        "min_samples_split": [5, 10, 15, 20],
        "min_samples_leaf": [5, 10, 15, 20],
    }
]

# Random forests are stochastic (bootstrap sampling and random feature
# selection). Without a fixed seed the cross-validation scores, the selected
# best parameters and the final test metrics can change on every run, so the
# estimator is seeded to make the notebook reproducible.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    grid_param,
    scoring="accuracy",  # model selection criterion
    cv=5,                # 5-fold cross-validation on the training split
    n_jobs=-1,           # use all available CPU cores
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_depth': [5, 10, 15, 20],
                          'min_samples_leaf': [5, 10, 15, 20],
                          'min_samples_split': [5, 10, 15, 20],
                          'n_estimators': [10, 50, 100]}],
             scoring='accuracy')

In [27]:
# Show the hyperparameter combination selected by the grid search
print("Best Parameters found:", grid_search.best_params_, sep="\n")

Best Parameters found:
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 15, 'min_samples_split': 20, 'n_estimators': 10}


In [28]:
# Predict on the held-out test split with the fitted grid search and
# summarise the per-class metrics
predictions_b = grid_search.predict(X_test)
rf_report = classification_report(y_test, predictions_b)
print(rf_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        15
           2       1.00      1.00      1.00        16

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



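The Random Forest classifier outperformed the Gaussian Naive Bayes model in terms of accuracy, precision, recall, and f1-score on this test split. However, the test set contains only 50 samples, so the gap between 0.96 and 1.00 accuracy corresponds to just two misclassified samples. The perfect scores achieved by the Random Forest model should therefore be interpreted with caution: they may reflect this particular split, or overfitting, rather than genuinely better generalization on unseen data.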