In [10]:
from grmpy.read.read import read
import yaml
import pprint
import numpy as np

In [11]:
%%file example.ini
SIMULATION

    agents                      165
    seed                       8842
    source                     data
    
ESTIMATION

    agents                     165
    file                       data.grmpy.txt
    optimizer                  SCIPY-BFGS
    start                      init
    maxiter                    6383
    dependent                  Y
    indicator                  D
    comparison                 0
    output_file                est.grmpy.info



TREATED

    coeff          x1          1.2303
    coeff          x2         -2.3070

UNTREATED

    coeff          x1         -1.7192
    coeff          x2         -1.6871

CHOICE

    coeff           x1         2.4869
    coeff           x3        -0.3900

DIST

    coeff                    2.1559
    coeff                   -0.2055
    coeff                    1.6975
    coeff                    2.8355
    coeff                   -1.6985
    coeff                    2.8874

SCIPY-BFGS

    gtol                    1.3798690190939666e-05
    eps                     1.4901161193847655e-08

SCIPY-POWELL

    xtol                     9.147777614048603e-05
    ftol                     9.749582129043358e-05

Overwriting example.ini


In [13]:
# Parse the .ini specification with grmpy's reader and pretty-print the result.
pp = pprint.PrettyPrinter()
attr_dict = read('example.ini')
pp.pprint(attr_dict)

{'AUX': {'init_values': [1.2303,
                         -2.307,
                         -1.7192,
                         -1.6871,
                         2.4869,
                         -0.39,
                         2.1559,
                         -0.2055,
                         1.6975,
                         2.8355,
                         -1.6985,
                         2.8874],
         'num_covars_cost': 2,
         'num_covars_treated': 2,
         'num_covars_untreated': 2,
         'num_paras': 10,
         'types': ['nonbinary', 'nonbinary', 'nonbinary']},
 'CHOICE': {'all': array([ 2.4869, -0.39  ]),
            'order': [1, 3],
            'types': ['nonbinary', 'nonbinary']},
 'DETERMINISTIC': False,
 'DIST': {'all': array([ 2.1559, -0.2055,  1.6975,  2.8355, -1.6985,  2.8874])},
 'ESTIMATION': {'agents': 165,
                'comparison': 0,
                'dependent': 'Y',
                'file': 'data.grmpy.txt',
                'indicator': 'D',
        

In [3]:
# Scratch (not executed): how a dict could be dumped to a YAML file.
# test = {'SIMULATION': {'agents': 165, 'seed': 8842, 'source': 'data'}, 'bla': [1, 2, 3]}
# with open('bla.yaml', 'w') as y:
#     yaml.dump(test, y, default_flow_style=False)

In [4]:
%%file example.yaml
# YAML counterpart of example.ini: same sections and values, with the
# per-covariate `coeff` lines replaced by parallel `names` / `params` lists.
SIMULATION:
    agents: 165
    seed: 8842
    source: data

ESTIMATION:
    agents: 165
    file: data.grmpy.txt
    optimizer: SCIPY-BFGS
    start: init
    maxiter: 6383
    dependent: Y
    indicator: D
    comparison: 0
    output_file: est.grmpy.info

# For TREATED, UNTREATED and CHOICE, names[i] is the label of params[i].
TREATED:
    names: [x1, x2]
    params: [1.2303, -2.3070]

UNTREATED:
    names: [x1, x2]
    params: [-1.7192, -1.6871]

CHOICE:
    names: [x1, x3]
    params: [2.4869, -0.3900]
        
# Same six values, in the same order, as the DIST `coeff` lines in example.ini.
DIST: [2.1559, -0.2055, 1.6975, 2.8355, -1.6985, 2.8874]
    
SCIPY-BFGS:
    gtol: 1.3798690190939666e-05
    eps: 1.4901161193847655e-08

SCIPY-POWELL:
    xtol: 9.147777614048603e-05
    ftol: 9.749582129043358e-05
        
# NOTE(review): VARTYPES has no counterpart in example.ini and is not read by
# init_dict_to_attr_dict -- TODO confirm how variable types should be consumed.
VARTYPES:
    x1: binary


Overwriting example.yaml


In [14]:
# safe_load only constructs plain Python objects (dict, list, str, numbers),
# which is all this specification file contains. Calling yaml.load without an
# explicit Loader is deprecated and is rejected by PyYAML >= 6.
with open('example.yaml') as y:
    init_dict = yaml.safe_load(y)
init_dict

{'CHOICE': {'names': ['x1', 'x3'], 'params': [2.4869, -0.39]},
 'DIST': [2.1559, -0.2055, 1.6975, 2.8355, -1.6985, 2.8874],
 'ESTIMATION': {'agents': 165,
  'comparison': 0,
  'dependent': 'Y',
  'file': 'data.grmpy.txt',
  'indicator': 'D',
  'maxiter': 6383,
  'optimizer': 'SCIPY-BFGS',
  'output_file': 'est.grmpy.info',
  'start': 'init'},
 'SCIPY-BFGS': {'eps': 1.4901161193847655e-08, 'gtol': 1.3798690190939666e-05},
 'SCIPY-POWELL': {'ftol': 9.749582129043358e-05,
  'xtol': 9.147777614048603e-05},
 'SIMULATION': {'agents': 165, 'seed': 8842, 'source': 'data'},
 'TREATED': {'names': ['x1', 'x2'], 'params': [1.2303, -2.307]},
 'UNTREATED': {'names': ['x1', 'x2'], 'params': [-1.7192, -1.6871]}}

In [6]:
def init_dict_to_attr_dict(init):
    """Convert a YAML-style initialization dict into an attribute dict.

    Parameters
    ----------
    init : dict
        Must provide ``TREATED``, ``UNTREATED`` and ``CHOICE`` (each a dict
        with the parallel lists ``names`` and ``params``), ``DIST`` (a list of
        numbers) and the sections ``ESTIMATION``, ``SCIPY-BFGS``,
        ``SCIPY-POWELL`` and ``SIMULATION``. ``init`` itself is not modified.

    Returns
    -------
    dict
        - ``TREATED`` / ``UNTREATED`` / ``CHOICE``: ``{'all': ndarray of the
          params, 'order': list of names}``.
        - ``DIST``: ``{'all': ndarray}``.
        - ``DETERMINISTIC``: ``True`` when every ``DIST`` entry equals zero.
        - ``ESTIMATION``, ``SCIPY-BFGS``, ``SCIPY-POWELL``, ``SIMULATION``:
          the very same objects as in ``init`` (not copies).
        - ``varnames``: unique covariate names in order of first appearance.
        - ``AUX``: start values and parameter counts.

    Raises
    ------
    KeyError
        If a required section or entry is missing from ``init``.
    """
    equations = ['TREATED', 'UNTREATED', 'CHOICE']

    attr = {}
    for key in equations:
        attr[key] = {'all': np.array(init[key]['params']),
                     'order': init[key]['names']}

    attr['DIST'] = {'all': np.array(init['DIST'])}
    # ndarray.all() yields a numpy bool; store a plain Python bool instead.
    attr['DETERMINISTIC'] = bool((attr['DIST']['all'] == 0).all())

    for key in ['ESTIMATION', 'SCIPY-BFGS', 'SCIPY-POWELL', 'SIMULATION']:
        attr[key] = init[key]

    # Unique covariate names, keeping the order in which they first appear.
    varnames = []
    for key in equations:
        for name in init[key]['names']:
            if name not in varnames:
                varnames.append(name)
    attr['varnames'] = varnames

    # Start values: all equation params followed by the DIST entries.
    # `init_values` is a fresh list, so `+=` never touches the lists in `init`.
    init_values = []
    for key in equations:
        init_values += init[key]['params']
    init_values += init['DIST']

    # NOTE(review): `num_paras` is known to disagree with the reference reader,
    # which reports 10 for the example file, i.e. the three covariate counts
    # plus 2 + 2, rather than len(init_values) == 12 -- TODO confirm the
    # definition before relying on it.
    # NOTE(review): whether `num_covars_cost` should be the number of CHOICE
    # coefficients is still an open question -- TODO confirm.
    attr['AUX'] = {'init_values': init_values,
                   'num_covars_cost': len(attr['CHOICE']['all']),
                   'num_covars_treated': len(attr['TREATED']['all']),
                   'num_covars_untreated': len(attr['UNTREATED']['all']),
                   'num_paras': len(init_values)}

    return attr

In [15]:
# Convert the YAML-loaded dict and pretty-print it so it can be compared with
# the attribute dict produced from example.ini.
attr2 = init_dict_to_attr_dict(init_dict)
pp = pprint.PrettyPrinter()
pp.pprint(attr2)

{'AUX': {'init_values': [1.2303,
                         -2.307,
                         -1.7192,
                         -1.6871,
                         2.4869,
                         -0.39,
                         2.1559,
                         -0.2055,
                         1.6975,
                         2.8355,
                         -1.6985,
                         2.8874],
         'num_covars_cost': 2,
         'num_covars_treated': 2,
         'num_covars_untreated': 2,
         'num_paras': 12},
 'CHOICE': {'all': array([ 2.4869, -0.39  ]), 'order': ['x1', 'x3']},
 'DETERMINISTIC': False,
 'DIST': {'all': array([ 2.1559, -0.2055,  1.6975,  2.8355, -1.6985,  2.8874])},
 'ESTIMATION': {'agents': 165,
                'comparison': 0,
                'dependent': 'Y',
                'file': 'data.grmpy.txt',
                'indicator': 'D',
                'maxiter': 6383,
                'optimizer': 'SCIPY-BFGS',
                'output_file': 'est.grmpy.info

# TODO

- Add `types_` to `AUX` and to `TREATED`, `UNTREATED`, and `CHOICE`.
    

# Questions

- Is it possible to determine the `types_` directly from the dataset instead of from the specification file? If so, I would do that because it saves all the checking.
- What is `num_paras`? Why is it not `len(init_values)` but `num_covars_treated + num_covars_untreated + num_covars_cost + 2 + 2`?
- What are the additional columns in coefficient lines (`type_`, `categories`, `prob`)? Could they be determined from the data?
- Is `num_covars_cost` correct?

# Remarks

- use list and tuple unpacking
- use meaningful names:
    - `attr_dict` or `attributes` instead of `dict_`
    - `line_entries` instead of `list_`
    - `for variable in covariates:` instead of `for i in covariates:`
- don't store redundant information (types are stored in several places; a `type_dict` could solve that)
- **never** require a user to specify redundant information (types?, coeff)
- Try to integrate with the rest of the Python world (.yaml instead of .ini if there is no good reason for .ini)
- pull the data loading or reading from file out of the function for better testability
- label-based access is almost always better than position-based access; exception: performance-critical parts of numerical code

In [9]:
# list unpacking example
line_entries = ['coeff', 'x1', 1.5]

# Unpack the whole list in one assignment instead of indexing each position.
key, name, val = line_entries
print(key, name, val)

# To unpack only the leading entries, slice first; print the names that were
# just assigned so the example shows the result of this unpacking.
key_, name_ = line_entries[:2]
print(key_, name_)

coeff x1 1.5
coeff x1
