## Data Preprocessing

In [88]:
import re
import pandas as pd
from typing import List

In [98]:
def read_dataset(filepath: str) -> pd.DataFrame:
    """Parse an experiment data file into a DataFrame of measurement points.

    The first token of the second line (index 1) is read as the number of
    experiments. Each experiment then begins with a header line that is
    passed, together with its index, to ``read_experiment_data``; that call
    returns the experiment's rows and the index of the last line it consumed.

    Args:
        filepath: Path of the data file to read.

    Returns:
        A DataFrame with one row per measurement point and the columns
        atom_number, beta, dose, mean_survivability, min_survivability,
        max_survivability and error_of_measurement.
    """
    column_names = [
        "atom_number",
        "beta",
        "dose",
        "mean_survivability",
        "min_survivability",
        "max_survivability",
        "error_of_measurement",
    ]

    with open(filepath) as source:
        token_rows = transform_raw_lines_to_lists(source.readlines())

    total_experiments = int(float(token_rows[1][0]))

    records = []
    # Index of the last line consumed so far; the first experiment header
    # sits on the line right after the experiment-count line.
    cursor = 1
    for _ in range(total_experiments):
        cursor += 1
        experiment_rows, cursor = read_experiment_data(token_rows, token_rows[cursor], cursor)
        records.extend(experiment_rows)

    return pd.DataFrame(records, columns=column_names)


def transform_raw_lines_to_lists(raw_lines: List[str]) -> List:
    """Split every raw text line into its whitespace-separated tokens.

    ``str.split()`` without arguments is used so that runs of spaces, tabs
    and the trailing line terminator all act as separators. Tokens therefore
    never carry a stray newline, and no empty tokens are produced.

    Args:
        raw_lines: Text lines, e.g. as returned by ``readlines()``; each may
            still end with its line terminator.

    Returns:
        One list of string tokens per input line, in input order. A blank or
        whitespace-only line yields an empty list.
    """
    return [raw_line.split() for raw_line in raw_lines]


def read_experiment_data(lines: List, experiment_header_line: List, offset: int) -> (List, int):
    """Decode an experiment header and read the measurement lines after it.

    The first three header tokens are interpreted, in order, as the number of
    measurement points, the atom number and beta.

    Args:
        lines: All tokenized lines of the file.
        experiment_header_line: Tokens of this experiment's header line.
        offset: Index in ``lines`` of the header line.

    Returns:
        The pair returned by ``read_measurement_points``: the experiment's
        rows and the index of the last line consumed.
    """
    # int(float(...)) also accepts integers written with a decimal point,
    # such as "3.0", which int() alone would reject.
    points_in_experiment = int(float(experiment_header_line[0]))
    header_atom_number = int(float(experiment_header_line[1]))
    header_beta = float(experiment_header_line[2])

    return read_measurement_points(
        experiment_count=points_in_experiment,
        lines=lines,
        offset=offset,
        atom_number=header_atom_number,
        beta=header_beta,
    )


def read_measurement_points(experiment_count: int, lines: List[str], offset: int, atom_number: int, beta: float) -> (List, int):
    """Read ``experiment_count`` consecutive measurement lines after ``offset``.

    Each line is converted with ``read_measurement_point``. Per the column
    order used for the resulting table, a row is laid out as: atom_number,
    beta, dose, mean_survivability, min_survivability, max_survivability,
    error_of_measurement.

    Args:
        experiment_count: Number of measurement lines to read.
        lines: All tokenized lines of the file.
        offset: Index of the line just before the first measurement line.
        atom_number: Value placed first in every row.
        beta: Value placed second in every row.

    Returns:
        A tuple ``(rows, last_index)`` where ``last_index`` is the index of
        the last line read, or ``offset`` unchanged when nothing was read.
    """
    rows = []
    # Stays at ``offset`` if the loop body never runs; otherwise the loop
    # variable ends on the index of the final measurement line.
    last_index = offset
    first_index = offset + 1
    for last_index in range(first_index, first_index + experiment_count):
        rows.append(read_measurement_point(lines[last_index], atom_number, beta))
    return rows, last_index


def read_measurement_point(line: List, atom_number: int, beta: float) -> List:
    """Build one table row from a tokenized measurement line.

    Args:
        line: Tokens of the measurement line; each is converted with ``float``.
        atom_number: Value placed first in the row.
        beta: Value placed second in the row.

    Returns:
        ``[atom_number, beta]`` followed by the line's tokens as floats.
    """
    row = [atom_number, beta]
    row.extend(float(token) for token in line)
    return row

              

In [99]:
# Load SUR5_Train.DAT (relative path, resolved against the current working
# directory) into a DataFrame using the parser defined above.
dataset = read_dataset("../data/SUR5_Train.DAT")