## Data Preprocessing

In [10]:
import re
import pandas as pd
from typing import List

In [11]:
def read_dataset(filepath: str) -> pd.DataFrame:
    """
    Load a data file (results of firing ions at cancer cells) from filepath
    and return it as a pandas DataFrame with one row per measurement point.

    How the file is walked:
        - all lines are read and split into tokens first (this includes
          lines that are not measurement data, e.g. the leading header)
        - the first token of the line at index 1 is the number of experiments
        - for every experiment, its header line is read and then the
          measurement point lines that belong to it
        - a running line index (cursor) is carried from one experiment to the
          next, because experiments can have different numbers of points
    """

    column_names = [
        "atom_number",
        "beta",
        "dose",
        "mean_survivability",
        "min_survivability",
        "max_survivability",
        "error_of_measurement",
    ]

    with open(filepath) as source_file:
        tokenized = transform_raw_lines_to_lists(source_file.readlines())

    # The count may be written as a float literal, hence int(float(...)).
    total_experiments = int(float(tokenized[1][0]))
    cursor = 1
    rows = []

    for _ in range(total_experiments):
        # Step onto the header line of the next experiment.
        cursor += 1
        points, cursor = read_experiment_data(tokenized, tokenized[cursor], cursor)
        rows.extend(points)

    return pd.DataFrame(rows, columns=column_names)


def transform_raw_lines_to_lists(raw_lines: List[str]) -> List:
    """
    Split every raw line of the data file into its whitespace-separated tokens.

    ``str.split()`` without an argument is used on purpose: any run of
    whitespace (spaces, tabs) counts as a single separator, and the line
    terminator is dropped instead of staying attached to the last token.
    A blank line produces an empty list, so positions in the returned list
    still match the line positions of the input.

    Returns a list containing one list of string tokens per input line,
    in the same order as the input.
    """

    return [raw_line.split() for raw_line in raw_lines]


def read_experiment_data(lines: List, experiment_header_line: List, offset: int) -> (List, int):
    """
    Parse the coefficients of one experiment from its header line and read
    the measurement points of that experiment.

    The first three header tokens are used, in this order: the number of
    measurement points, the atom number and the beta coefficient. The two
    integer values go through float first, so a token such as "5.0" is
    accepted as well.

    Returns a tuple of (list of measurement point rows, updated offset),
    exactly as produced by read_measurement_points.
    """

    header = experiment_header_line
    points_in_experiment = int(float(header[0]))
    atom_number = int(float(header[1]))
    beta = float(header[2])

    return read_measurement_points(
        experiment_count=points_in_experiment,
        lines=lines,
        offset=offset,
        atom_number=atom_number,
        beta=beta,
    )


def read_measurement_points(experiment_count: int, lines: List[str], offset: int, atom_number: int, beta: float) -> (List, int):
    """
    Read the `experiment_count` lines that directly follow position `offset`
    in `lines` and turn each of them into a measurement point row.

    Every row receives the same atom_number and beta, because these two
    coefficients belong to the experiment as a whole.

    Returns a tuple of (list of rows, offset of the last line that was read).
    When no line is read, the offset is returned unchanged.
    """

    # A non-positive count reads nothing and must leave the offset untouched.
    last_offset = offset + max(experiment_count, 0)

    # Lines are addressed by index (not sliced) so that running past the end
    # of `lines` still raises IndexError instead of silently reading less.
    rows = [
        read_measurement_point(lines[position], atom_number, beta)
        for position in range(offset + 1, last_offset + 1)
    ]

    return rows, last_offset


def read_measurement_point(line: List, atom_number: int, beta: float) -> List:
    """
    Build the row for a single measurement point.

    The two experiment coefficients come first, followed by every token of
    `line` converted to float. Order of the values in the returned list:
    atom_number, beta, dose, mean_survivability, min_survivability,
    max_survivability, error_of_measurement
    """

    measured_values = map(float, line)
    return [atom_number, beta, *measured_values]

              

In [12]:
# Parse the data file into a DataFrame (presumably the training split, judging
# by the file name). The relative path is resolved against the current working
# directory of the notebook.
dataset = read_dataset("../data/SUR5_Train.DAT")

In [13]:
dataset

Unnamed: 0,atom_number,beta,dose,mean_survivability,min_survivability,max_survivability,error_of_measurement
0,1,0.50827,0.000,100.0,100.0,100.0,1.00
1,1,0.50827,0.500,68.8,60.0,75.5,8.80
2,1,0.50827,1.000,50.0,41.4,57.8,8.60
3,1,0.50827,1.500,42.8,35.1,49.6,7.69
4,1,0.50827,2.400,23.7,22.8,25.0,0.90
...,...,...,...,...,...,...,...
210,26,0.48110,0.796,23.3,20.0,27.0,3.30
211,26,0.48110,1.190,13.3,12.6,16.0,0.70
212,26,0.48110,1.600,6.0,5.7,7.1,0.30
213,26,0.48110,2.390,2.2,2.0,2.6,0.20


In [14]:
# Column names are kept in lists so that both selections below are DataFrames
# (selecting with a list returns a frame, even for a single column).
features = ["atom_number", "beta", "dose"]
target = ["mean_survivability"]

# X holds the model inputs, y the value to be predicted.
X, y = dataset[features], dataset[target]

In [15]:
X

Unnamed: 0,atom_number,beta,dose
0,1,0.50827,0.000
1,1,0.50827,0.500
2,1,0.50827,1.000
3,1,0.50827,1.500
4,1,0.50827,2.400
...,...,...,...
210,26,0.48110,0.796
211,26,0.48110,1.190
212,26,0.48110,1.600
213,26,0.48110,2.390


In [16]:
y

Unnamed: 0,mean_survivability
0,100.0
1,68.8
2,50.0
3,42.8
4,23.7
...,...
210,23.3
211,13.3
212,6.0
213,2.2
