{% block title %}
{% endblock %}

## Importing the libraries

In [None]:
{% block imports %}
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
{% endblock %}

## Start stopwatch

In [None]:
{% block stopwatch %}
from time import process_time
# Start of the timed region. process_time() reports CPU time consumed by this
# process (time spent sleeping is not counted), so the duration printed at the
# end of the evaluation step is CPU time rather than wall-clock time.
time_start = process_time()
{% endblock %}

## Import the dataset

In [None]:
{% block dataset %}
# Load the features and the labels from the two CSV paths supplied by the
# template context; both are read as pandas DataFrames.
# NOTE(review): each path is pasted verbatim into a single-quoted Python
# literal, so a rendered path containing a backslash or a quote would alter or
# break the literal - confirm that callers only pass safe paths.
X = pd.read_csv('{{ features_file_path }}')
y = pd.read_csv('{{ labels_file_path }}')
{% endblock %}

### Categorize dataset

In [None]:
def categorize_column(dataframe, category_threshold, column):
    """Integer-encode one column in place when it has few distinct values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that owns the column; it is modified in place.
    category_threshold : int
        Largest number of distinct values (as returned by ``unique()``) for
        which the column is still treated as categorical.
    column : hashable
        Label of the column to inspect.

    Returns
    -------
    None
        When the column qualifies, its values are replaced by the integer
        codes of its ``category`` dtype; otherwise it is left untouched.
    """
    series = dataframe[column]
    distinct_count = len(series.unique())

    # Too many distinct values: leave the column exactly as it is.
    if distinct_count > category_threshold:
        return

    dataframe[column] = series.astype('category').cat.codes
        

def categorize_dataframe(dataframe, category_threshold):
    """Run ``categorize_column`` over every column of ``dataframe``.

    The frame is modified in place and nothing is returned; the same
    ``category_threshold`` is applied to each column.
    """
    for column_name in dataframe.columns:
        categorize_column(dataframe, category_threshold, column_name)
        
# Integer-encode the low-cardinality columns of both frames (in place), using
# the threshold supplied by the template context.
categorize_dataframe(X, {{ category_threshold }})
categorize_dataframe(y, {{ category_threshold }})

# From here on X and y are plain NumPy arrays rather than DataFrames.
X, y = X.values, y.values


## Splitting the dataset into the Training set and Test set

In [None]:
{% block split %}
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. Every option comes from the
# `train_split` entry of the template context.
# - `shuffle` is driven by its own setting, not by the random seed, and falls
#   back to scikit-learn's default (True) when the context does not define it.
# - `stratify` is resolved when the rendered cell runs: "features" selects X,
#   "labels" selects y, any other value disables stratification.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size={{train_split.test_size}},
    train_size={{train_split.train_size}},
    random_state={{train_split.random_state}},
    shuffle={{train_split.shuffle | default(True)}},
    stratify=X if "{{train_split.stratify}}" == "features" else y if "{{train_split.stratify}}" == "labels" else None)
{% endblock %}

## Training the Simple Decision Tree model on the Training set

In [None]:
{% block train %}
{% endblock %}

## Predicting the Test set results

In [None]:
{% block predict %}
# Predict labels for the held-out test features.
# NOTE(review): `classifier` is not created anywhere in this template;
# presumably the "train" block is filled in elsewhere with a fitted
# estimator - confirm.
y_pred = classifier.predict(X_test)
{% endblock %}

## Evaluating the Model Performance

In [None]:
{% block evaluate %}
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# Headline classification metrics on the held-out test set. The values are
# kept in `acc`, `precision`, `recall` and `f1` because the statistics-saving
# step reads them back.
# NOTE(review): precision/recall/F1 are called with scikit-learn's default
# averaging, which targets binary labels - confirm the labels are binary.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the computed values (not the scikit-learn functions of similar name).
print(f'Accuracy Score: {acc}')
print(f'Precision Score: {precision}')
print(f'Recall Score: {recall}')
print(f'F1 Score: {f1}')

# Confusion matrix drawn as an annotated heatmap; fmt='g' keeps the counts
# from being shown in scientific notation.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

# Stop the stopwatch started earlier in the notebook and report the CPU time
# consumed since then, rounded to two decimals.
time_stop = process_time()
cpu_time = round(time_stop - time_start, 2)
print(f'Elapsed CPU Time: {cpu_time} seconds')
{% endblock %}

## Saving Model Statistics

In [None]:
{% block save %}
import os
import json

# Write the metrics and the raw predictions to statistics/<model name>.json,
# relative to the current working directory.
path = 'statistics'
# makedirs(exist_ok=True) creates the folder when it is missing and does
# nothing when it already exists, so no separate existence check is needed.
os.makedirs(path, exist_ok=True)

stats = {
    "Accuracy Score": acc,
    "Precision Score": precision,
    "Recall Score": recall,
    "F1 Score": f1,
    "cpu time": cpu_time,
    # Flattened to plain Python lists so that json can serialise them.
    "predicted": y_pred.flatten().tolist(),
    "real": y_test.flatten().tolist()
}

# ensure_ascii=False writes characters unescaped, so the file encoding is set
# explicitly instead of depending on the platform default.
with open(os.path.join(path, "{{ model_name }}.json"), "w", encoding="utf-8") as f:
    json.dump(stats, f, ensure_ascii=False, indent=4)
{% endblock %}