Trains a neural network on a given dataset and saves the test accuracy to a JSON file. Examines the effect of selecting only the network's "most confident" predictions and evaluating the accuracy on those alone.

In [None]:
import os
import re
import datetime

import machine_learning as ml
import preprocessing as pp
from verify_dataset_hash import verify_json_metadata, get_repo_sha1_and_diff

data_directory: directory containing measured data (train and test datasets)

debug: Set to True if you want to see the header detection result during preprocessing

use_new_data: Set to True if the data needs to be loaded from a txt file for the first time (leave it False otherwise to save time)

In [None]:
# Directory containing the measured data (train and test datasets).
data_directory = os.path.join("..", "DownloadedData", "Rev1Distance")

# Run switches: `debug` is forwarded to the preprocessing, `use_new_data`
# forces the .npy files to be (re)created, and `num_test_data_sets` is the
# number of test sets evaluated per dataset folder.
debug = False
use_new_data = False
num_test_data_sets = 3

# Header offset per hardware revision; the revision is detected from the
# (case-insensitive) data directory path, with "rev1" taking precedence.
offset_header_rev1 = -200
offset_header_rev2 = 200

_lowered_directory = data_directory.lower()
if "rev1" not in _lowered_directory and "rev2" not in _lowered_directory:
    raise NotImplementedError("Specify which offset should be used!")
offset_header = offset_header_rev1 if "rev1" in _lowered_directory else offset_header_rev2

In [None]:
# Technical details of your experimental setup
technical_details = dict(
    header_length=1,  # Number of symbols used to identify the header
    clock_freq=int(100e6),  # Clock frequency (Hz) of the sender electronics
    sample_freq=int(1e10),  # Sample frequency (Hz) used by oscilloscope
    signal_length=2_000_002,  # Total number of data points of one measurement
    steps_to_left=2,  # Cut start, in symbols to the left of the header position
    steps_to_right=3,  # Cut end, in symbols to the right of the header position
    do_normalize_data=True,  # Normalize the data to zero mean and std 1
)

# Names of all sub-directories of data_directory; each folder is one dataset
# (per the original note, the folder names are the measurement distances in cm).
positions = list(
    filter(
        lambda name: os.path.isdir(os.path.join(data_directory, name)),
        os.listdir(data_directory),
    )
)

def atoi(text):
    """Convert a string of decimal digits to ``int``; return other text unchanged.

    ``str.isdecimal`` is used rather than ``str.isdigit`` because ``isdigit``
    also accepts characters such as superscript digits ("²"), which ``int()``
    cannot parse and would raise ``ValueError`` for.

    Args:
        text: The string chunk to convert.

    Returns:
        ``int(text)`` if ``text`` is non-empty and consists only of decimal
        digits, otherwise ``text`` itself.
    """
    return int(text) if text.isdecimal() else text

def natural_keys(text):
    """Build a sort key that orders embedded numbers by value ("2cm" before "10cm").

    The text is split into alternating non-digit and digit chunks (the capture
    group keeps the digit runs in the result) and every chunk is passed
    through ``atoi``.

    Args:
        text: The string to derive a key from.

    Returns:
        A list with one entry per chunk, as returned by ``atoi``.
    """
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))

# Order the dataset folders in place using the natural (number-aware) sort key.
positions[:] = sorted(positions, key=natural_keys)

# Result containers, filled by evaluate_all_positions (one entry per dataset).
train_accuracy = dict()  # Tracks the resulting train accuracy
validation_accuracy = dict()  # Tracks the resulting validation accuracy
test_accuracy = dict()  # Tracks the resulting test accuracy

# Settings handed to ml.train_model.
machine_learning_settings = dict(
    dropout=0.1,
    training_EPOCHS=4,
)

print(positions)

# Forwarded to pp.get_datasets as `data_augmentation_halflength`.
data_augmentation_halflength = 1

# Forwarded to ml.train_model as `stop_at_val_acc`.
stop_at_val_acc = 0.99

# Label translation used for training and testing.
# Alternative: lambda labels: pp.translate_all_onlybit(labels, '0101')
label_mapping = pp.translate_all

# The full translation uses 4 classes, any other mapping uses 2.
if label_mapping == pp.translate_all:
    num_classes = 4
else:
    num_classes = 2

In [None]:
def evaluate_all_positions(positions, technical_details, num_test_data_sets, debug, num_classes):
    """Train one model per dataset folder and record its accuracies.

    For every entry of ``positions`` the training/validation data is loaded
    with ``pp.get_datasets``, a new model is created and trained, and the
    model is evaluated on ``num_test_data_sets`` test sets through
    ``ml.test_model`` with ``most_certain_fraction=0.01``. The module-level
    dicts ``train_accuracy``, ``validation_accuracy`` and ``test_accuracy``
    are updated in place and written to ``result_file_path`` after each
    dataset, so partial results are on disk before the whole run finishes.

    Besides its parameters the function reads these module-level names:
    ``data_directory``, ``offset_header``, ``data_augmentation_halflength``,
    ``use_new_data``, ``machine_learning_settings``, ``label_mapping``,
    ``stop_at_val_acc`` and ``result_file_path``.

    Args:
        positions: Names of the dataset folders inside ``data_directory``.
        technical_details: Setup description passed to the ``pp``/``ml`` helpers.
        num_test_data_sets: Number of test sets to evaluate per dataset.
        debug: Forwarded to ``pp.get_datasets``.
        num_classes: Number of output classes of the model.

    Returns:
        The module-level ``test_accuracy`` dict, mapping each dataset name to
        the list of ``ml.test_model`` results (one per test set).
    """
    print(f"Evaluating datasets: {positions}")

    for dataset_index, position in enumerate(positions):
        print(f"Loading dataset {dataset_index}: {position}")
        dataset_path = os.path.join(data_directory, position)

        (training_data, training_labels,
         validation_data, validation_labels) = pp.get_datasets(
            dataset_path,
            technical_details,
            offset_header=offset_header,
            data_augmentation_halflength=data_augmentation_halflength,
            debug=debug,
            force_create_npy_files=use_new_data,
        )

        # Alternative architecture: ml.create_model_residual(technical_details)
        model = ml.get_model_hyperparam_improved(technical_details, num_classes)

        # NOTE(review): the run name is the whole `positions` list, not this
        # dataset's name, so every run gets the same name - confirm intended.
        training_history = ml.train_model(
            model,
            training_data,
            training_labels,
            machine_learning_settings,
            validation_data=validation_data,
            validation_labels=validation_labels,
            training_run_name=str(positions),
            early_stopping_patience=4,
            label_mapping=label_mapping,
            stop_at_val_acc=stop_at_val_acc,
        )

        # Keep the metrics of the last training epoch.
        result_key = f"{position}"
        train_accuracy[result_key] = training_history.history["accuracy"][-1]
        validation_accuracy[result_key] = training_history.history["val_accuracy"][-1]

        per_test_set_results = []
        for test_index in range(num_test_data_sets):
            test_data, test_labels, test_start = pp.load_test_datasets(
                dataset_path,
                technical_details,
                data_index=test_index,
                offset_header=offset_header,
            )
            outcome = ml.test_model(
                model,
                test_data,
                test_labels,
                test_start,
                technical_details,
                label_mapping=label_mapping,
                most_certain_fraction=0.01,
            )
            per_test_set_results.append(outcome)

        test_accuracy[result_key] = per_test_set_results
        print("\n")

        # Write intermediate results after every dataset.
        snapshot = {
            "test_accuracies": test_accuracy,
            "train_accuracies": train_accuracy,
            "validation_accuracies": validation_accuracy,
        }
        pp.write_result_dict(snapshot, target_file_path=result_file_path, override=True)

    return test_accuracy

In [None]:
# Capture the repository state; git_sha1 and git_diff are printed in the
# metadata cell at the end of the notebook.
git_sha1, git_diff = get_repo_sha1_and_diff(search_parent_directories=True)

# Result file inside the data directory, time-stamped to the minute.
result_file_path = os.path.join(data_directory, f"result_TEMPEST_{datetime.datetime.now().strftime('%Y%m%d-%H%M')}.json")

# Reading the metadata is best effort: a failure must not abort the training
# run. Only `Exception` is caught so that KeyboardInterrupt / SystemExit
# still stop the notebook.
try:
    measurement_metadata = verify_json_metadata(data_directory, verbose=False)
except Exception as err:
    print(f"Could not read measurement metadata: {err!r}")
    measurement_metadata = "FAILURE_TO_GET_METADATA"

test_accuracy = evaluate_all_positions(positions, technical_details, num_test_data_sets, debug, num_classes)

# Final write: same file as the intermediate results, now including the
# metadata and the run settings.
pp.write_result_dict({
    'test_accuracies': test_accuracy, 
    'train_accuracies': train_accuracy, 
    'validation_accuracies': validation_accuracy,
    'measurement_metadata': measurement_metadata, 
    'n_classes': num_classes,
    'data_augmentation_halflength': data_augmentation_halflength,
    }, 
    target_file_path=result_file_path, override=True, )

print(" ------------ Done ---------------")

In [None]:
# Show the collected test results (dataset name -> list of test results).
print(test_accuracy)

# ------------------ Metadata ------------------

In [None]:
# repository git information (WARNING: THIS IS ONLY RELIABLE IF NO CHANGES WERE MADE BETWEEN RUNNING DIFFERENT NOTEBOOK CELLS!)
print(git_sha1)
print("\nFull git diff (only Python source files!):\n")
# A plain loop instead of a list comprehension: the comprehension was used
# only for its print side effect and left a list of None as the cell output.
for diff_entry in git_diff:
    if diff_entry.a_path.endswith(".py"):
        print(diff_entry)