In [2]:
import math

# A1.1
# Q1: Tree x3 then x1 then x2 all on the left.
# Q2: Yes, for this example, there exists a smaller tree that perfectly classify all training examples. This tree is not found by ID3 since it is a greedy algorithm (the attribute selection at a node is done without regard to the selection at children nodes).

# Q3:
# Gain = Entropy before – Sum of proportion  (Chosen / all) * entropy of chosen
def Gain(S, xi):
    sum_S = sum(S)
    p_pos, p_neg = S[0]/sum_S, S[1]/sum_S
    entropy_S = -p_pos * math.log2(p_pos) -p_neg * math.log2(p_neg)
    
    sumvalue = 0
    for v in range(len(xi)):
        mult = sum(xi[v]) / sum_S    # |S_v| / |S|
        sum_Xiv = sum(xi[v])
        p_posv, p_negv = xi[v][0]/sum_Xiv, xi[v][1]/sum_Xiv
        
        if (p_posv == 0): entropy_Sv = -p_negv * math.log2(p_negv)
        elif (p_negv == 0): entropy_Sv = -p_posv * math.log2(p_posv)
        else: entropy_Sv = -p_posv * math.log2(p_posv) -p_negv * math.log2(p_negv)
        sumvalue += (mult*entropy_Sv)
    return entropy_S - sumvalue

S = (10, 10)
xi = [(8, 2), (2, 8)]
first = Gain(S, xi)
xi = [(10, 6), (0, 4)]
second = Gain(S, xi)
print("Q3: {0:.4f}, {1:.4f}".format(first, second))

# Q4: [8+,2−] (left)  [2+,8−] (right)

# Q5:
S = (100, 10)
xi = [[80, 2], [20, 8]]
first = Gain(S, xi)
xi = [[100, 6], [0, 4]]
second = Gain(S, xi)
print("Q5: {0:.4f}, {1:.4f}".format(first, second))

# Q6: [100+,6−] (left)  [0+,4−] (right)

# Q7: information gain is sensitive to class cost imbalance.
#     with c⊕=10 and c⊖=1

# Q8: c^k^d   and then ID3: Sum(k^i * (d-i))

# Q9: The decision boundaries are parallel to the axes. Thus, the instance space is split into rectangles.
#     The number of regions depends on the number of classes.

Test: 0.1232, 0.1434
Q3: 0.2781, 0.2365
Q5: 0.0965, 0.1371


In [5]:
# A1.2
# Q1
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

train_df = pd.read_csv("WineTrain.csv", index_col=0)
test_df = pd.read_csv("WineTest.csv", index_col=0)

classifier = DecisionTreeClassifier(random_state=0)
fit = classifier.fit(train_df.drop("Quality", axis=1), train_df["Quality"])

y_train_pred = fit.predict(train_df.drop("Quality", axis=1))
y_test_pred = fit.predict(test_df.drop("Quality", axis=1))

# Q2: nb_nodes, train_accur, test_accur.
def accuracy(pred, true):
    well = 0
    total = 0
    for i in range(len(pred)):
        if pred[i] == true[i]: 
            well += 1
        total += 1
    return well / total

print("nb_nodes = {}".format(classifier.tree_.node_count))
print("train_accur = {}".format(accuracy(y_train_pred, train_df["Quality"].to_numpy())))
print("test_accur = {}".format(accuracy(y_test_pred, test_df["Quality"].to_numpy())))

# Q3: Only if the data is consistent (no 2 examples with same features x1=x2 but different outcome y1≠y2).

# Q4:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

frame = pd.DataFrame(columns = ["Run", "NodeCount", "TrainAcc", "TestAcc"])

data = {}
for i in range(100):
    data[i] = train_df.sample(int(len(train_df)*0.25), random_state=i)
    
    classifier = DecisionTreeClassifier(random_state=0)
    fit = classifier.fit(data[i].drop("Quality", axis=1), data[i]["Quality"])
    y_train_pred = fit.predict(data[i].drop("Quality", axis=1))
    y_test_pred = fit.predict(test_df.drop("Quality", axis=1))
    
    temp = {'Run': i, 'NodeCount': classifier.tree_.node_count, 'TrainAcc': accuracy(y_train_pred, data[i]["Quality"].to_numpy()), 'TestAcc': accuracy(y_test_pred, test_df["Quality"].to_numpy())}
    frame = frame.append(temp, ignore_index=True)

mean_nb_nodes = np.mean(frame["NodeCount"].to_numpy())
mean_train_accur = np.mean(frame["TrainAcc"].to_numpy())
mean_test_accur = np.mean(frame["TestAcc"].to_numpy())
print()
print(mean_nb_nodes, mean_train_accur, mean_test_accur)

# Q5: 
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

frame = pd.DataFrame(columns = ["Frac", "Run", "NodeCount", "TrainAcc", "TestAcc"])
for frac in [0.05, 0.1, 0.2, 0.5, 0.99]:
    for i in range(100):
        data = train_df.sample(int(len(train_df)*frac), random_state=i)

        classifier = DecisionTreeClassifier(random_state=0)
        fit = classifier.fit(data.drop("Quality", axis=1), data["Quality"])
        y_train_pred = fit.predict(data.drop("Quality", axis=1))
        y_test_pred = fit.predict(test_df.drop("Quality", axis=1))

        temp = {'Frac': frac, 'Run': i, 'NodeCount': classifier.tree_.node_count, 'TrainAcc': accuracy(y_train_pred, data["Quality"].to_numpy()), 'TestAcc': accuracy(y_test_pred, test_df["Quality"].to_numpy())}
        frame = frame.append(temp, ignore_index=True)

mean_nb_nodes = np.mean(frame["NodeCount"].to_numpy())
mean_train_accur = np.mean(frame["TrainAcc"].to_numpy())
mean_test_accur = np.mean(frame["TestAcc"].to_numpy())
print()
print(mean_nb_nodes, mean_train_accur, mean_test_accur)

# Q6: The test accuracy, as a function of the training set size, increases most when there are few training samples. This increase then slows down once the training sizes get larger. We may expect to have reached a plateau, meaning the accuracy won't vary much if more examples are added to the training data.
#     The number of nodes of the trees increases with the training set size because more splits are needed to classify all training examples.
#     The variance of the test accuracy decreases when the training set size increases because of the overlap between random data split increases.

nb_nodes = 445
train_accur = 1.0
test_accur = 0.5008347245409015

120.78 1.0 0.5037228714524207
     Frac   Run  NodeCount  TrainAcc   TestAcc
0    0.05   0.0       27.0       1.0  0.529215
1    0.05   1.0       25.0       1.0  0.484140
2    0.05   2.0       29.0       1.0  0.500835
3    0.05   3.0       29.0       1.0  0.509182
4    0.05   4.0       25.0       1.0  0.505843
..    ...   ...        ...       ...       ...
495  0.99  95.0      435.0       1.0  0.515860
496  0.99  96.0      443.0       1.0  0.505843
497  0.99  97.0      445.0       1.0  0.490818
498  0.99  98.0      427.0       1.0  0.515860
499  0.99  99.0      421.0       1.0  0.520868

[500 rows x 5 columns]

169.624 1.0 0.49546911519198666


In [3]:
# A1.3
# Q1
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
def accuracy(pred, true):
    well = 0
    total = 0
    for i in range(len(pred)):
        if pred[i] == true[i]: 
            well += 1
        total += 1
    return well / total

train_df = pd.read_csv("WineTrain.csv", index_col=0)
test_df = pd.read_csv("WineTest.csv", index_col=0)

temptemp = pd.DataFrame(columns = ["min_samples_split", "NodeCount", "TrainAcc", "TestAcc"])
# print("Param", "NbrNode", "TrainAcc", "TestAcc", "AllAcc")
for j in [2, 120, 1200]:
    frame = pd.DataFrame(columns = ["min_samples_split", "NodeCount", "TrainAcc", "TestAcc"])
    for i in range(100):
        data = train_df

        classifier = DecisionTreeClassifier(random_state=0, min_samples_split=j)
        fit = classifier.fit(data.drop("Quality", axis=1), data["Quality"])
        y_train_pred = fit.predict(data.drop("Quality", axis=1))
        y_test_pred = fit.predict(test_df.drop("Quality", axis=1))

        temp = {'min_samples_split': j, 'NodeCount': classifier.tree_.node_count, 'TrainAcc': accuracy(y_train_pred, data["Quality"].to_numpy()), 'TestAcc': accuracy(y_test_pred, test_df["Quality"].to_numpy())}
        frame = frame.append(temp, ignore_index=True)

    mean_nb_nodes = np.mean(frame["NodeCount"].to_numpy())
    mean_train_accur = np.mean(frame["TrainAcc"].to_numpy())
    mean_test_accur = np.mean(frame["TestAcc"].to_numpy()) 
    mean_all_accur = (mean_train_accur + mean_test_accur) / 2
    temptemp = temptemp.append({'min_samples_split': j, 'NodeCount': mean_nb_nodes, 'TrainAcc': mean_train_accur, 'TestAcc': mean_test_accur}, ignore_index=True)
    #print(j, round(mean_nb_nodes, 4), round(mean_train_accur, 4), round(mean_test_accur, 4), round(mean_all_accur, 4))
frame = temptemp
    
# 600 et + c'esr pareil on a plus que 3 nodes et à 1200 on passe à 1 node
# Entre 120 et 300, il n'y a rien d'interessant

# Param NbrNode TrainAcc TestAcc AllAcc
# 120 31.0 0.699 0.5843 0.6417 is best
# 1200 1.0 0.505 0.399 0.452 is worst
# 2 445.0 1.0 0.5008 0.7504 is inbetween, good overall because performs well on training

# Q2: Using an adequate value for min_samples_split, the number of nodes, as a function of the training set size, of the pruned tree follows the same trend as the number of nodes of the complete tree. The main difference is a multiplicative factor.
#     An adequatly pruned tree will have a higher test accuracy.
#     Pruning the tree is not useful when using a very small number of training examples.
#     Pruning the tree prevents overfitting.

# Q3
# # random_state in [42, 49, 69, 90, 120, 360, 18, 64, 128, 2, 74, 66, 33]:
print("Beginning")
best_acc = 0.0
best_param = {}
for criterion in ["gini", "entropy"]:
    for max_depth in [None, 3, 25, 50, 100, 300]:
        print("Avancement")
        for min_samples_split in [115, 120, 125]:
            for min_samples_leaf in [25, 30, 35, 40,]:
                for max_features in [None, 2, 4, 6, 8, 11]:
                    classifier = DecisionTreeClassifier(random_state=0, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, max_features=max_features)
                    fit = classifier.fit(train_df.drop("Quality", axis=1), train_df["Quality"])
                    y_test_pred = fit.predict(test_df.drop("Quality", axis=1))
                    acc = accuracy(y_test_pred, test_df["Quality"].to_numpy())
                    if best_acc < acc:
                        print(acc)
                        best_acc = acc
                        best_param["criterion"] = criterion
                        best_param["max_depth"] = max_depth
                        best_param["min_samples_split"] = min_samples_split
                        best_param["min_samples_leaf"] = min_samples_leaf
                        best_param["max_features"] = max_features
print(best_acc)
print(best_param)

# 0.6310517529215359
# {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 120, 'min_samples_leaf': 35, 'max_features': None}
# 0.6260434056761269
# {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 0.1, 'min_samples_leaf': 51, 'min_weight_fraction_leaf': 0.0, 'max_features': None, 'min_impurity_decrease': 0.0, 'class_weight': None}

def my_best_tree(train):
    classifier = DecisionTreeClassifier(random_state=0, criterion='gini', max_depth=None, min_samples_split=120, min_samples_leaf=35, max_features=None)
    fit = classifier.fit(train.drop("Quality", axis=1), train["Quality"])
    return classifier

print(train_df.columns)
print(my_best_tree(train_df).feature_importances_)
# Alcohol, TotalSulfurDioxide, Sulphates, VolatileAcidity

Beginning
Avancement
0.5742904841402338
0.5826377295492488
0.6110183639398998
0.6310517529215359
Avancement
Avancement
Avancement
Avancement
Avancement
Avancement
Avancement
Avancement
Avancement
Avancement
Avancement
0.6310517529215359
{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 115, 'min_samples_leaf': 35, 'max_features': None}
Index(['FixedAcidity', 'VolatileAcidity', 'CitricAcid', 'ResidualSugar',
       'Chlorides', 'FreeSulfurDioxide', 'TotalSulfurDioxide', 'Density', 'pH',
       'Sulphates', 'Alcohol', 'Quality'],
      dtype='object')
[0.03849672 0.05746655 0.         0.00555444 0.         0.
 0.15030912 0.0138968  0.         0.15572624 0.57855013]


In [15]:
# A1.4

# Q1
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
def accuracy(pred, true):
    well = 0
    total = 0
    for i in range(len(pred)):
        if pred[i] == true[i]: 
            well += 1
        total += 1
    return well / total
train_df = pd.read_csv("WineTrain.csv", index_col=0)
test_df = pd.read_csv("WineTest.csv", index_col=0)

wl = DecisionTreeClassifier(random_state=0, max_depth=2)
fit = wl.fit(train_df.drop("Quality", axis=1), train_df["Quality"])
y_test_pred = fit.predict(test_df.drop("Quality", axis=1))
acc = accuracy(y_test_pred, test_df["Quality"].to_numpy())
# print(acc)  #0.5191986644407346

# Q2
# result = [0]*10
# for j in range(10):
#     y_test_preds = pd.DataFrame()
#     print("Avancement")
#     for i in range(100):
#         data = train_df.sample(len(train_df), replace=True, random_state=j*i)
#         classifier = DecisionTreeClassifier(random_state=0, max_depth=2)
#         fit = classifier.fit(data.drop("Quality", axis=1), data["Quality"])
#         y_test_preds = y_test_preds.append([fit.predict(test_df.drop("Quality", axis=1))])
    
#     y_test_pred = [0]*599
#     for k in range(599):
#         high = 0
#         medium = 0
#         low = 0
#         for count in y_test_preds[k]:
#             if count == 'high': high+=1
#             elif count == 'medium': medium+=1
#             elif count == 'low': low+=1
#         best = max([high, medium, low])
#         if best == high:
#             y_test_pred[k] = 'high'
#         elif best == medium:
#             y_test_pred[k] = 'medium'
#         else:
#             y_test_pred[k] = 'low'
#     result[j] = accuracy(y_test_pred, test_df["Quality"].to_numpy())

# # print(result)
# mean_test_accur = np.mean(result)

# Q3
# Using bagging, with the weak learner and an appropriate number of trees, will yield better results than using a single tree with optimally tuned min_samples_split (like the one you used in question 1 of A1.3).
# Compared to our weak learner (a single tree), using bagging drastically improves the accuracy, even with only a few (e.g. 5) trees.
# At some point, we reach an optimal number of trees, and adding more trees will reduce the accuracy due to overfitting.

# Q4
from sklearn.ensemble import RandomForestClassifier
best_acc = 0.0
best_params = {}
for n_estimators in [55, 60, 65, 300]:
    for max_depth in [None, 5, 10, 20]:
        for min_samples_split in [5, 25, 50, 160]:
            for min_samples_leaf in [1, 5, 40]:
                classifier = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
                fit = classifier.fit(train_df.drop("Quality", axis=1), train_df["Quality"])
                y_test_pred = fit.predict(test_df.drop("Quality", axis=1))
                acc = accuracy(y_test_pred, test_df["Quality"].to_numpy())
                if best_acc < acc:
                    print(acc)
                    best_acc = acc
                    best_params["n_estimators"] = n_estimators
                    best_params["max_depth"] = max_depth
                    best_params["min_samples_split"] = min_samples_split
                    best_params["min_samples_leaf"] = min_samples_leaf
print(best_acc, best_params)

# Random State=0
# 0.66110183639399 {'n_estimators': 60, 'max_depth': 5, 'min_samples_split': 50, 'min_samples_leaf': 1}

my_best_forest = RandomForestClassifier(n_estimators=60, max_depth=10, min_samples_split=50, min_samples_leaf=1)

0.6143572621035058
0.6410684474123539
0.6494156928213689
0.6510851419031719
0.66110183639399
0.66110183639399 {'n_estimators': 55, 'max_depth': 5, 'min_samples_split': 50, 'min_samples_leaf': 1}


In [11]:
p = pd.DataFrame(np.array([[0, 0, 0, 0, 0]]))
p = p.append([[1, 2, 3, 4, 5]])
p = p.append([[1, 2, 3, 4, 5]])
p = p.append([[1, 2, 3, 4, 5]])
p = p.append([[1, 2, 3, 4, 5]])
print(p[0].to_numpy())
print(p)

[0 1 1 1 1]
   0  1  2  3  4
0  0  0  0  0  0
0  1  2  3  4  5
0  1  2  3  4  5
0  1  2  3  4  5
0  1  2  3  4  5
