## Generate New Files

* Given a template and a definition of the variables, generate a set of new files

In [27]:
import csv
from pathlib import Path
import re

class TemplateProcessor:
    r"""Parse a template file and collect the variables declared in it.

    A variable is declared in the template between the opening marker
    ``<\var>`` and the closing marker ``<var>``. The text between the
    markers follows the general pattern::

        name[type, default, (distribution, par1, par2)]

    where everything inside the square brackets is optional. Variables
    without a distribution are tagged ``('table',)`` and can receive their
    values from a CSV file through :meth:`set_variables_table`.

    Attributes:
        template_path: ``Path`` of the template file.
        variables_raw: Declarations exactly as found between the markers.
        variables: Mapping of variable name to a dict with the keys
            ``'active'``, ``'distribution'`` and ``'default'`` (plus
            ``'values'`` once a table has been loaded).
        output_file_path: ``Path`` set by :meth:`set_output_file`, or None.
    """

    # Markers that delimit a variable declaration inside the template.
    _VAR_OPEN = '<\\var>'
    _VAR_CLOSE = '<var>'

    # Type names accepted in a declaration and the converter used for each.
    # A fixed whitelist replaces evaluating the type name as Python code.
    # NOTE: ``bool`` follows Python truthiness (any non-empty text is True).
    _TYPE_CONVERTERS = {
        'int': int,
        'float': float,
        'str': str,
        'bool': bool,
        'complex': complex,
    }

    def __init__(self, template, verbose=False):
        """Read ``template`` and parse every variable declared in it.

        Args:
            template: Path to the template file.
            verbose: When True, print each declaration as it is parsed.

        Raises:
            FileNotFoundError: If the template file does not exist.
            ValueError: If a declaration in the template is malformed.
        """
        self.template_path = Path(template)
        self._verbose = verbose
        if not self.template_path.exists():
            raise FileNotFoundError(f"Template file '{self.template_path}' not found.")

        # Number and types of parameters per distribution
        self._valid_distributions = {
            'uniform':    (2, ['int', 'float']),
            'normal':     (2, ['float']),
            'lognormal':  (2, ['float']),
            'triangular': (3, ['float']),
            'constant':   (1, ['int', 'float', 'str']),
            'categorical': (2, ['int', 'float', 'str'])
        }

        self.variables_raw = self._extract_variables()
        self.variables = self._parse_variables()
        self.output_file_path = None

    def _extract_variables(self):
        """Return the raw text of every declaration found in the template.

        Raises:
            ValueError: If a declaration is empty, or if a line does not
                contain the same number of opening and closing markers.
        """
        variables_raw = []
        with open(self.template_path, 'r') as file:
            for line_num, line in enumerate(file, start=1):
                # Everything before the first opening marker is plain text.
                for part in line.split(self._VAR_OPEN)[1:]:
                    var = part.split(self._VAR_CLOSE)[0].strip()
                    if var == '':
                        raise ValueError(f"Empty variable name at line {line_num}.")
                    variables_raw.append(var)
                if line.count(self._VAR_OPEN) != line.count(self._VAR_CLOSE):
                    raise ValueError(f"Unclosed <\\var> <var> at line {line_num}.")
        return variables_raw

    def _extract_contents(self, text, open=r'\(', close=r'\)'):
        """Return every (non-greedy) span of ``text`` enclosed by the delimiters.

        Args:
            text: String to search.
            open: Regular-expression fragment matching the opening delimiter.
            close: Regular-expression fragment matching the closing delimiter.

        Returns:
            List with the text found between each delimiter pair.
        """
        pattern = f'{open}(.*?){close}'
        return re.findall(pattern, text)

    def _transform_variable(self, variable, variable_type):
        """Convert ``variable`` to the type named by ``variable_type``.

        The type name comes from the template file, so it is looked up in a
        fixed whitelist instead of being evaluated as Python code.

        Returns:
            The converted value, or None when the type name is unknown or
            the value cannot be converted.
        """
        converter = self._TYPE_CONVERTERS.get(str(variable_type).strip())
        if converter is None:
            return None
        try:
            return converter(variable)
        except (ValueError, TypeError):
            return None

    def _custom_split(self, text, sep=',', open='{', close='}'):
        """Split ``text`` on ``sep``, ignoring separators nested in delimiters.

        Args:
            text: String to split.
            sep: Single-character separator.
            open: Character that starts a nested group.
            close: Character that ends a nested group.

        Returns:
            List of tokens with surrounding whitespace removed. The list
            always has at least one element.
        """
        tokens = []
        current = []
        depth = 0

        for char in text:
            if char == sep and depth == 0:
                tokens.append(''.join(current).strip())
                current = []
                continue
            if char == open:
                depth += 1
            elif char == close:
                depth -= 1
            current.append(char)

        tokens.append(''.join(current).strip())
        return tokens

    @staticmethod
    def _split_braced_list(text):
        """Split a ``{a,b,c}`` list into its items, stripped of whitespace."""
        inner = text.strip().lstrip('{').rstrip('}')
        return [item.strip() for item in inner.split(',')]

    def _convert_parameters(self, raw_values, type_name, dist_name, label):
        """Convert every item of ``raw_values`` to ``type_name``.

        Raises:
            ValueError: If an item cannot be converted; ``label`` names the
                kind of value ('Parameters' or 'Probabilities') in the message.
        """
        converted = []
        for raw in raw_values:
            value = self._transform_variable(variable=raw, variable_type=type_name)
            if value is None:
                raise ValueError(f"{label} for distribution '{dist_name}' must be of type {type_name}. Cannot transform '{raw}'.")
            converted.append(value)
        return converted

    def _parse_distribution(self, text, var_type):
        """Parse the distribution part of a declaration.

        Args:
            text: Text found between the parentheses, or None when the
                declaration has no distribution.
            var_type: Declared type name of the variable, or None. When
                None, non-categorical parameters are kept as strings.

        Returns:
            ``('table',)`` when ``text`` is None; otherwise a tuple with the
            distribution name followed by its parameters. For 'categorical'
            the parameters are a list of values and a list of float
            probabilities.

        Raises:
            ValueError: On an unknown distribution, a wrong number of
                parameters, a type not valid for the distribution, or a
                parameter that cannot be converted.
        """
        if text is None:
            return ('table',)

        dist_name, *params = self._custom_split(text)
        if dist_name not in self._valid_distributions:
            raise ValueError(f"Invalid distribution: '{dist_name}'.")
        expected_count, valid_types = self._valid_distributions[dist_name]
        if len(params) != expected_count:
            raise ValueError(f"Invalid number of parameters for distribution '{dist_name}'. Expected {expected_count}, found {len(params)}.")

        if var_type is not None and var_type not in valid_types:
            raise ValueError(f"Invalid type ({var_type}) for distribution '{dist_name}'. Valid option(s): {', '.join(valid_types)}.")

        if dist_name == 'categorical':
            # Always split the value list, even for untyped variables, so the
            # consistency check below counts items rather than characters.
            values = self._split_braced_list(params[0])
            probabilities = self._split_braced_list(params[1])
            if var_type is not None:
                values = self._convert_parameters(values, var_type, dist_name, 'Parameters')
            if len(values) != len(probabilities):
                raise ValueError(f"Inconsistent number of values in distribution '{dist_name}'. Values found: {len(values)}, associated probabilities: {len(probabilities)}.")
            probabilities = self._convert_parameters(probabilities, 'float', dist_name, 'Probabilities')
            return (dist_name, values, probabilities)

        if var_type is not None:
            params = self._convert_parameters(params, var_type, dist_name, 'Parameters')
        return (dist_name,) + tuple(params)

    def _parse_variable_options(self, text):
        """Parse the text found between the square brackets of a declaration.

        Accepted forms are ``default``, ``type, default`` and either of them
        followed by ``(distribution, ...)``, which must come last.

        Returns:
            Dict with the keys 'active', 'distribution' and 'default'.

        Raises:
            ValueError: If the options are malformed or the default value
                cannot be converted to the declared type.
        """
        default = None
        var_type = None

        distribution_text = self._extract_contents(text=text, open=r'\(', close=r'\)')
        if len(distribution_text) > 1:
            raise ValueError(f"Bad distribution options format in: '{text}'. Only one distribution with ( and ) can be defined.")

        options = self._custom_split(text=text, open='(', close=')')
        if len(distribution_text) == 1:
            if '(' not in options[-1] or ')' not in options[-1]:
                raise ValueError(f"Distribution options with ( and ) must be the last information in: '{options}'.")
            options = options[:-1]
            distribution_text = distribution_text[0].strip()
        else:
            distribution_text = None

        if len(options) > 2:
            raise ValueError(f"Bad options format in: '{options}'. Too many options.")
        if options:
            default = options[-1].strip()
            if len(options) == 2:
                var_type = options[0].strip()
                default = self._transform_variable(variable=default, variable_type=var_type)
                if default is None:
                    raise ValueError(f"Default value ({options[-1]}) must be of the defined type ({var_type}).")

        distribution = self._parse_distribution(distribution_text, var_type)

        return {'active': True, 'distribution': distribution, 'default': default}

    def _parse_variable(self, text):
        """Split one raw declaration into its name and its options dict.

        General pattern: ``Variable[type, default, (distribution, par1, par2)]``.

        Returns:
            Tuple ``(key, options_dict)``.

        Raises:
            ValueError: If the name is missing or more than one ``[...]``
                group is present.
        """
        key = text.split('[')[0].strip()
        if len(key) == 0:
            raise ValueError(f"Undefined variable name in: '{text}'.")

        options = self._extract_contents(text=text, open=r'\[', close=r'\]')
        if len(options) > 1:
            raise ValueError(f"Bad options format in: '{text}'. Only one list with [ and ] can be defined.")

        if len(options) == 1:
            return key, self._parse_variable_options(options[0])
        return key, {'active': True, 'distribution': ('table',), 'default': None}

    def _parse_variables(self):
        """Parse every raw declaration into the ``variables`` mapping.

        A name declared more than once keeps the options of its last
        declaration.
        """
        variables = {}
        for var in self.variables_raw:
            if self._verbose:
                print(f'Command found: {var}')
            key, options = self._parse_variable(var)
            variables[key] = options
            if self._verbose:
                print(f'  key: {key}')
                print(f'  options: {options}')
        return variables

    def set_output_file(self, output_file_path):
        """Store the path of the output file as a ``Path``."""
        self.output_file_path = Path(output_file_path)

    def set_variables_table(self, csv_path):
        """Load values for 'table' variables from the columns of a CSV file.

        The first row is the header and names the variables. Columns that
        are not in the template, or whose variable already has a
        distribution, are reported and skipped. Values are stored as strings
        under the ``'values'`` key of the variable. A missing or empty file
        is reported with a message and leaves the variables untouched.

        Raises:
            ValueError: If a data row has fewer columns than the header.
        """
        if not Path(csv_path).exists():
            print(f"CSV file '{csv_path}' not found.")
            return

        # newline='' is what the csv module requires for correct parsing of
        # quoted fields that contain line breaks.
        with open(csv_path, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            header = next(reader, None)
            if header is None:
                print(f"CSV file '{csv_path}' is empty.")
                return
            # Read the data once: the reader is a one-shot iterator, so it
            # cannot be consumed again for every column. Blank lines are skipped.
            rows = []
            for row in reader:
                if not row:
                    continue
                if len(row) < len(header):
                    raise ValueError(f"Row {reader.line_num} of '{csv_path}' has {len(row)} column(s), expected {len(header)}.")
                rows.append(row)

        for key in header:
            if key not in self.variables:
                print(f"Variable '{key}' not found in template, ignoring data.")
                continue
            if self.variables[key]['distribution'][0] != 'table':
                print(f"Variable '{key}' already has a distribution, ignoring data.")
                continue
            # index() returns the first column with this name.
            column = header.index(key)
            self.variables[key]['values'] = [row[column] for row in rows]

# Parse the error-free sample template; verbose mode prints every
# declaration together with the key and options extracted from it.
template = TemplateProcessor(
    verbose=True,
    template=r'..\template\no_errors.dat',
)


Command found: No_Default
  key: No_Default
  options: {'active': True, 'distribution': ('table',), 'default': None}
Command found: With_Default_12[12]
  key: With_Default_12
  options: {'active': True, 'distribution': ('table',), 'default': '12'}
Command found: With_Type_int[int,42]
  key: With_Type_int
  options: {'active': True, 'distribution': ('table',), 'default': 42}
Command found: normal_variable[float,0.5,(normal,0, 2.5)]
  key: normal_variable
  options: {'active': True, 'distribution': ('normal', 0.0, 2.5), 'default': 0.5}
Command found: normal_variable_no_type[0.5,(normal,0, 2.5)]
  key: normal_variable_no_type
  options: {'active': True, 'distribution': ('normal', '0', '2.5'), 'default': '0.5'}
Command found: List_int[int,3,(categorical,{1,3,7,-12},{0.2,0.15,0.25,0.400})]
  key: List_int
  options: {'active': True, 'distribution': ('categorical', [1, 3, 7, -12], [0.2, 0.15, 0.25, 0.4]), 'default': 3}
Command found: List_float[float,7.,(categorical,{1,3,7,-12},{0.2,0.15,0.2