In [0]:
# imports and variables

# imports
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import cv2
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow import data as tf_data
import time
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, log_loss
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from typing import List, Dict, Any, Union

from utils import print_progress_bar

print("Modules imported")

In [5]:
# global variables
all_labels = ['nature', 'country', 'city']
path = "../datasets/all_data/entropy_results_augmented.json"

# hyperparameters
model_hp = {
    "test_part": 0.1,
    "epochs": 20,
    "batch_size": 64,
    "learning_rate": 0.005,
    "num_classes": len(all_labels)
}

# check gpu
devices = tf.config.list_physical_devices()
print("Available devices:")
for device in devices:
    print(device.name)
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print(f"The model will run on GPU: {physical_devices[0].name}")
else:
    print("No GPU found, the model will run on CPU.")

Available devices:
/physical_device:CPU:0
/physical_device:GPU:0
The model will run on GPU: /physical_device:GPU:0


In [2]:
# main model
class EntropyClassifier(tf.keras.Model):
    def __init__(self, possible_labels, folder, debug=False):
        super(EntropyClassifier, self).__init__()
        dwt_output_size = 10
        lvl0_output_size = 17
        lvl1_output_size = 17
        lvl2_output_size = 17
        lvl3_output_size = 153

        # embed_size = dwt_output_size + lvl0_output_size + lvl1_output_size + lvl2_output_size + lvl3_output_size  # = 214
        # 
        # heads = 1  # Choose based on your specific requirements or experimentation
        # assert embed_size % heads == 0, "Embedding size needs to be divisible by heads"

        self.possible_labels = possible_labels
        self.debug = debug
        self.folder = folder

        # input layers
        self.dwt_input_layer = tf.keras.Sequential([
            tf.keras.layers.Dense(dwt_output_size),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dropout(0.5)
        ])
        self.lvl0_input_layer = tf.keras.Sequential([
            tf.keras.layers.Dense(lvl0_output_size),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dropout(0.5)
        ])
        self.lvl1_input_layers = [tf.keras.Sequential([
            tf.keras.layers.Conv2D(1, (2, 2)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dropout(0.5)
        ]) for _ in range(lvl1_output_size)]
        self.lvl2_input_layers = [tf.keras.Sequential([
            tf.keras.layers.Conv2D(1, (2, 2)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
            tf.keras.layers.Dropout(0.5)
        ]) for _ in range(lvl2_output_size)]
        self.lvl3_input_layers = [tf.keras.Sequential([
            tf.keras.layers.Conv2D(1, (2, 2)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
            tf.keras.layers.Dropout(0.5)
        ]) for _ in range(lvl2_output_size)]

        # main nn block
        self.final_nn = tf.keras.Sequential([
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(len(possible_labels), activation='softmax')
        ])

    def call(self, inputs, training=False):
        lvl0_inputs, lvl1_inputs, lvl2_inputs, lvl3_inputs, dwt_inputs = inputs

        batch_size = tf.shape(dwt_inputs)[0]

        # Ensure inputs have a batch dimension
        if len(lvl1_inputs.shape) == 3:
            lvl1_inputs = tf.expand_dims(lvl1_inputs, axis=0)
        if len(lvl2_inputs.shape) == 3:
            lvl2_inputs = tf.expand_dims(lvl2_inputs, axis=0)
        if len(lvl3_inputs.shape) == 3:
            lvl3_inputs = tf.expand_dims(lvl3_inputs, axis=0)

        dwt_output = self.dwt_input_layer(dwt_inputs)
        lvl0_output = self.lvl0_input_layer(lvl0_inputs)

        lvl1_output = tf.concat([self.lvl1_input_layers[i](lvl1_inputs[:, :, :, i:i + 1]) for i in range(17)], axis=-1)
        lvl2_output = tf.concat([self.lvl2_input_layers[i](lvl2_inputs[:, :, :, i:i + 1]) for i in range(17)], axis=-1)
        lvl3_output = tf.concat([self.lvl3_input_layers[i](lvl3_inputs[:, :, :, i:i + 1]) for i in range(17)], axis=-1)

        concatenated_output = tf.concat([tf.reshape(dwt_output, [batch_size, -1]),
                                         tf.reshape(lvl0_output, [batch_size, -1]),
                                         tf.reshape(lvl1_output, [batch_size, -1]),
                                         tf.reshape(lvl2_output, [batch_size, -1]),
                                         tf.reshape(lvl3_output, [batch_size, -1])], axis=-1)

        # Commenting out the original self-attention layer
        # attention_output = self.self_attention(concatenated_output)

        # Using the combined dense layers
        final_output = self.final_nn(concatenated_output)

        if self.debug:
            print(dwt_output.shape)
            print(lvl0_output.shape)
            print(lvl1_output.shape)
            print(lvl2_output.shape)
            print(lvl3_output.shape)
            print(concatenated_output.shape)
            print(final_output.shape)

        return final_output

    def train_model(self, dataset, epochs=100, batch_size=64, lr=0.01):
        formatted_dataset = format_dataset(dataset)
        train_dataset = formatted_dataset.batch(batch_size)

        loss = None
        t0 = time.time()
        n = int(len(dataset) // batch_size) + 1

        criterion = CategoricalCrossentropy(from_logits=False)
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=lr,
            decay_steps=10000,
            decay_rate=0.9)
        optimizer = Adam(learning_rate=lr_schedule, clipnorm=1.0)

        checkpoint_filepath = os.path.join(self.folder, 'checkpoint.chk')
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=True,
            monitor='loss',
            mode='min',
            save_best_only=True)

        for epoch in range(epochs):
            print(f'Starting epoch {epoch + 1}/{epochs}...')
            t1 = time.time()

            for batch_idx, (data, target) in enumerate(train_dataset):
                target = tf.convert_to_tensor([tf.one_hot(t, len(self.possible_labels)) for t in target],
                                              dtype=tf.float32)

                with tf.GradientTape() as tape:
                    output = self(data, training=True)
                    loss = criterion(target, output)
                gradients = tape.gradient(loss, self.trainable_variables)
                optimizer.apply_gradients(zip(gradients, self.trainable_variables))

                print_progress_bar('Batches processed', batch_idx + 1, n, start_time=t1)

            if loss is not None:
                elps = int(time.time() - t0)
                elps_m, elps_s = divmod(int(elps), 60)
                print(f'\nEpoch {epoch + 1} completed, Loss: {loss.numpy()}, Time elapsed: {elps_m}:{elps_s}')

            # Save the model at the end of each epoch
            self.save_weights(checkpoint_filepath)

    def predict(self, inputs):
        if len(inputs[0].shape) != 4 or inputs[0].shape[0] != 1:
            raise ValueError("Input batch size should be 1")

        probabilities = self(inputs, training=False)
        max_index = tf.argmax(probabilities, axis=1)
        max_index = int(max_index[0].numpy())
        label_prob_dict = {label: prob.numpy() for label, prob in zip(self.possible_labels, probabilities[0])}

        return str(self.possible_labels[max_index]), label_prob_dict

In [3]:
# data preprocessing

# functions
def process_entry(entry):
    """Process the entropy results to extract the levels."""
    label = entry['label']
    machine_input = {0: [], 1: [], 2: [], 3: [], 'dwt': []}
    for ent in entry['entropy_results']:

        if ent['method'] == 'dwt':
            machine_input['dwt'] = tf.convert_to_tensor(ent['result'], dtype=tf.float32)
        else:
            for lvl, content in enumerate(ent['result']):
                machine_input[lvl].append(tf.convert_to_tensor(content, dtype=tf.float32))

    machine_input[0] = tf.concat(machine_input[0], axis=-1)
    machine_input[1] = tf.concat(machine_input[1], axis=-1)
    machine_input[2] = tf.concat(machine_input[2], axis=-1)
    machine_input[3] = tf.concat(machine_input[3], axis=-1)
    machine_input['dwt'] = tf.reshape(machine_input['dwt'], [1, 1, 10])

    return {'input': machine_input, 'label': label}


def format_dataset(dataset):
    """Formats and shuffles the dataset for training"""
    label_num = {'nature': 0, 'country': 1, 'city': 2}
    formatted_dataset = []
    for item in dataset:
        machine_input = item['input']
        label = label_num[item['label']]
        formatted_dataset.append((
            (
                machine_input[0],
                machine_input[1],
                machine_input[2],
                machine_input[3],
                machine_input['dwt']
            ),
            label
        ))
    return tf.data.Dataset.from_generator(
        lambda: iter(formatted_dataset),
        output_signature=(
            (
                tf.TensorSpec(shape=(1, 1, 17), dtype=tf.float32),
                tf.TensorSpec(shape=(2, 2, 17), dtype=tf.float32),
                tf.TensorSpec(shape=(4, 4, 17), dtype=tf.float32),
                tf.TensorSpec(shape=(8, 8, 17), dtype=tf.float32),
                tf.TensorSpec(shape=(1, 1, 10), dtype=tf.float32)
            ),
            tf.TensorSpec(shape=(), dtype=tf.int32),
        )
    ).shuffle(buffer_size=len(dataset))


def process_json(path_, test_part, parallel_jobs=5):
    """Process JSON data to extract dataset and features."""
    with open(path_, 'r') as f:
        metadata = json.load(f)
    print('json loaded')
    dataset = []

    t = time.time()
    n = len(metadata)

    with ThreadPoolExecutor(max_workers=parallel_jobs) as executor:
        futures = [executor.submit(process_entry, entry) for entry in metadata]
        for i, future in enumerate(as_completed(futures)):
            result = future.result()
            if result is not None:
                dataset.append(result)
            print_progress_bar('Processed entry', i + 1, n, t)

    if isinstance(test_part, float):
        i = int(test_part * len(dataset))
    elif isinstance(test_part, str):
        i = int(test_part)
    else:
        raise ValueError("Incompatible format for 'test_part'.")

    testset = dataset[-i:]
    dataset = dataset[:-i]

    return dataset, testset

In [6]:
# load data
data_set, test_set = process_json(path, model_hp['test_part'])
print('\nDataset processed.')

Processed entry: ██████████████████████████████████████████████████ |84768/84768; 100.0%| t=0:04:06/0:04:06, left~=0:00:00|1:51:49|1|
Dataset processed.


In [8]:
# training data
model_hp['dataset_length'] = len(data_set)
model_hp['testset_length'] = len(test_set)
print(f"Total number of entries in the dataset: {len(data_set)}")
print(f"Total number of entries in the test set: {len(test_set)}")

Total number of entries in the dataset: 76292
Total number of entries in the test set: 8476


In [9]:
# model creation
model_folder = f"../models/EntropyClassifier"
model_name = f"instance_e={model_hp['epochs']}_ds={model_hp['dataset_length']}_bs={model_hp['batch_size']}"
model = EntropyClassifier(all_labels, model_folder)
print('Model created')

Model created


In [10]:
# train model
model.train_model(data_set, epochs=model_hp['epochs'], batch_size=model_hp['batch_size'], lr=model_hp['learning_rate'])
print('Model trained.')

Starting epoch 1/20...
Batches processed: ██████████████████████████████████████████████████ |1193/1193; 100.0%| t=0:06:13/0:06:13, left~=0:00:00|
Epoch 1 completed, Loss: 0.7806026935577393, Time elapsed: 6:13
Starting epoch 2/20...
Batches processed: ██████████████████████████████████████████████████ |1193/1193; 100.0%| t=0:05:43/0:05:43, left~=0:00:00|
Epoch 2 completed, Loss: 0.44041168689727783, Time elapsed: 11:56
Starting epoch 3/20...
Batches processed: ██████████████████████████████████████████████████ |1193/1193; 100.0%| t=0:05:42/0:05:42, left~=0:00:00|
Epoch 3 completed, Loss: 1.0508629083633423, Time elapsed: 17:39
Starting epoch 4/20...
Batches processed: ██████████████████████████████████████████████████ |1193/1193; 100.0%| t=0:05:46/0:05:46, left~=0:00:00|
Epoch 4 completed, Loss: 0.6889659762382507, Time elapsed: 23:25
Starting epoch 5/20...
Batches processed: ██████████████████████████████████████████████████ |1193/1193; 100.0%| t=0:05:46/0:05:46, left~=0:00:00|
Epoch

In [11]:
# save model
if not os.path.exists(model_folder):
    os.mkdir(model_folder)
model.save_weights(os.path.join(model_folder, model_name))
print("Model saved")

Model saved


In [6]:
# load model
model.load_weights(os.path.join(model_folder, model_name))
print("model loaded")

model loaded


In [12]:
# model evaluation

# functions
def evaluate_model(model_, testset):
    """
    Evaluate the given model on a test dataset.
    
    Parameters:
    model_ (tf.keras.Model): The model to be evaluated
    testset (list): List of dictionaries, each containing 'input' and 'label' for a test sample
    
    Returns:
    dict: Dictionary containing 'y_true', 'y_pred', and 'y_prob' lists for true labels, 
          predicted labels, and predicted probabilities, respectively.
    """
    y_true = []
    y_pred = []
    y_prob = []

    for test in testset:
        machine_input = test['input']
        inputs = (
            tf.expand_dims(machine_input[0], axis=0),
            tf.expand_dims(machine_input[1], axis=0),
            tf.expand_dims(machine_input[2], axis=0),
            tf.expand_dims(machine_input[3], axis=0),
            tf.expand_dims(machine_input['dwt'], axis=0)
        )
        true_label = test['label']

        predicted_label, label_probs = model_.predict(inputs)

        y_true.append(true_label)
        y_pred.append(predicted_label)
        y_prob.append(label_probs)

    model_op = {'y_true': y_true, 'y_pred': y_pred, 'y_prob': y_prob}

    return model_op


def calculate_stats(model_op):
    """
    This function calculates various statistics like confusion matrix, precision, recall, F1 score, and log loss
    based on the model's output.
    
    Parameters:
    model_op (dict): A dictionary containing 'testing_data' which itself is a dictionary with 'y_true', 'y_pred', 
                     and 'y_prob' as keys for true labels, predicted labels, and predicted probabilities, respectively.
    
    Returns:
    dict: A dictionary containing all the calculated statistics including 'test_samples', 'right_predictions', 
          'success_rate', 'confusion_matrix', 'precision', 'recall', 'f1_score', and 'log_loss'.
    """

    y_true = np.array(model_op['y_true']).reshape(-1, 1)
    y_pred = np.array(model_op['y_pred']).reshape(-1, 1)
    y_prob = np.array([list(item.values()) for item in model_op['y_prob']])

    stats = {'test_samples': len(y_true), 'right_predictions': 0}

    for i in range(stats['test_samples']):
        if y_pred[i] == y_true[i]:
            stats['right_predictions'] += 1
            print(f'model predicted label {y_true[i]} correctly.')
        else:
            print(f'model gave false prediction of label {y_pred[i]}, while {y_true[i]} was true.')

    stats['success_rate'] = 100 * stats['right_predictions'] / stats['test_samples']

    stats.update({
        'confusion_matrix': confusion_matrix(y_true, y_pred).tolist(),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1_score': f1_score(y_true, y_pred, average='weighted'),
        'log_loss': compute_log_loss(y_true, y_prob)
    })

    print(f"{stats['right_predictions']} samples out of {stats['test_samples']} were predicted correctly.\n"
          f"The model's success rate is: {stats['success_rate']:.2f}%")
    print(f"Confusion Matrix: \n{stats['confusion_matrix']}")
    print(f"Precision: {stats['precision']}")
    print(f"Recall: {stats['recall']}")
    print(f"F1 Score: {stats['f1_score']}")
    print(f"Log Loss: {stats['log_loss']}")

    return stats


def compute_log_loss(y_true, y_prob):
    """
    Compute the log loss manually.
    
    Parameters:
    y_true (list): List of true labels as strings
    y_prob (numpy array): Array of shape (x, 3) representing predicted probabilities
    all_labels (list): List of all possible labels as strings in the correct order
    
    Returns:
    float: Computed log loss
    """
    label_to_index = {label: index for index, label in enumerate(all_labels)}
    n_samples = len(y_true)
    log_loss_value = 0

    for i in range(n_samples):
        true_label = str(y_true[i][0])
        true_index = label_to_index[true_label]
        prob = y_prob[i, true_index]
        log_loss_value += -np.log(prob)

    return log_loss_value / n_samples


def save_performance_to_json(folder_path, hp, stats, comment_):
    """
    Save the performance metrics and other details to a JSON file.
    
    Parameters:
    folder_path (str): The folder where the performance JSON will be saved
    hp (dict): Dictionary containing hyperparameters like learning rate, epochs, etc.
    stats_ (dict): Dictionary containing performance metrics
    comment_ (str): Any additional comments
    
    Returns:
    None
    """
    perf_json = f"{folder_path}/performance.json"

    if os.path.exists(perf_json):
        with open(perf_json, 'r') as f:
            perf = json.load(f)
    else:
        perf = []

    perf.append({
        'hyperparameters': hp,
        'performance': stats,
        'comment': comment_
    })

    with open(perf_json, 'w') as f:
        json.dump(perf, f, indent=4)

In [13]:
# evaluate model on test set
model_output = evaluate_model(model, test_set)
print('model evaluated on test set.')

model evaluated on test set.


In [14]:
# calculate stats
comment = 'new architecture. less layers.'

model_stats = calculate_stats(model_output)
save_performance_to_json(model_folder, model_hp, model_stats, comment)
print('model stats calculated.')

model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['country'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly.
model predicted label ['city'] correctly

In [24]:
print(f"nature: {len([1 for x in model_output['y_pred'] if x=='nature'])}")
print(f"city: {len([1 for x in model_output['y_pred'] if x=='city'])}")
print(f"country: {len([1 for x in model_output['y_pred'] if x=='country'])}")

nature: 398
city: 0
country: 351
