# 0 - IMPORTS AND CONSTANTS

All modules, imports, libraries and constant used in this file

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from seaborn import heatmap

# All the validation metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix



INPUT_FILE_PATH = 'mnist_784.csv'
METRICS = {accuracy_score, recall_score, precision_score, f1_score}
METRICS_PARAMETER={accuracy_score:None, recall_score:{'average':None}, precision_score:{'average':None}, f1_score:{'average':None}}


# 1 - DATA READING AND PARTITIONING

Load the MNIST dataset into memory. <br>
Divide the 70,000 digits you have into:
- training set (60,000 digits) 
- test set (10,000 digits).

In [30]:
def readMnistDataset(file:str)->pd.DataFrame:
    return pd.read_csv(filepath_or_buffer=file, header=0).rename(columns={'class':'label'})

# 2 - SINGLE RANDOM TREE

Train a single decision tree (with the default parameters) on the training set, then compute its
accuracy on the test set.

In [None]:
def computeMetrics(yTrue:pd.DataFrame, yPred:pd.DataFrame, metrics:list=METRICS, options:dict[dict]=METRICS_PARAMETER,
                    printMap: bool=False)->dict[str:float]:
    if printMap:
        fig, ax = plt.subplots(1,1,figsize=(12,12))
        heatmap(confusion_matrix(yTrue, yPred), annot=True, ax=ax)
        plt.show()
    
    return pd.DataFrame({str(metric).split()[1]:metric(yTrue, yPred, **options[metric]) if options[metric] else metric(yTrue, yPred)
            for metric in metrics})
        

def createAndTestSingleRandomTree(xTrain:pd.DataFrame, xTest:pd.DataFrame, yTrain:pd.DataFrame, yTest:pd.DataFrame, 
                                metrics:list=METRICS, options:dict[dict]=METRICS_PARAMETER, printMap: bool=False)->dict[str:float]:
    
    return computeMetrics(yTest, DecisionTreeClassifier().fit(xTrain, yTrain).predict(xTest),
                            metrics=metrics, printMap=printMap, options=options)
    

# MAIN FUNCTION

This is the main function of the program, it does the following things:
<ol>
<li>Reads the mnist dataset and partitions it</li>
<li>Trains a single decision tree and test it's accuracy</li>
<li></li>
<li></li>
<li></li>
<li></li>
<li></li>
</ol> 

In [None]:
def main()->None:
    df = readMnistDataset(INPUT_FILE_PATH) # 1
    xTrain, xTest, yTrain, yTest = train_test_split(df.loc[:, df.columns.difference({'label'})], 
                                                    df['label'], test_size=1/7, shuffle=True, stratify=df['label']) # 1
    
    res = createAndTestSingleRandomTree(xTrain, xTest, yTrain, yTest, printMap=True) # 2
    print("FINAL RESULTS OF A SINGLE DECISION TREE :") # 2
    display(res) # 2
    



main()

FINAL RESULTS OF A SINGLE DECISION TREE :


Unnamed: 0,f1_score,precision_score,accuracy_score,recall_score
0,0.933198,0.931313,0.8731,0.935091
1,0.950891,0.930272,0.8731,0.972444
2,0.840345,0.851129,0.8731,0.82983
3,0.840196,0.840196,0.8731,0.840196
4,0.866222,0.865779,0.8731,0.866667
5,0.817673,0.825056,0.8731,0.810421
6,0.904348,0.90853,0.8731,0.900204
7,0.904281,0.906461,0.8731,0.902111
8,0.826378,0.830228,0.8731,0.822564
9,0.828657,0.825349,0.8731,0.831992
