In [1]:
from lxml import etree
import pandas as pd
import networkx as nx
import re
import os
import base64 # to extract images from base64 strings (as they are stored in xml files)
from datetime import datetime
from itertools import combinations
from numpy import linspace

import graphtools as gt
from loadcalculations import loadcalc
import cleanhtml as ch #my own helper functions
from combinecalculates import calcombo
from treetodataframe import treetodataframe
import caretaker_advice as ca
import qualitychecks_pd as qcpd
import multiheadlinesplit as mhsplit
import odk_helpers as oh

In [2]:
# Load parameters (defined in the merge script)

In [3]:
%store -r form_id testing multiple_labels summaryfile drugsfile cafile inputfile_dx inputfile_tt \
dxfile ttfile output form_title input_trans updated_trans diagnose_order

In [4]:
# Flag marking this run as the treatment (TT) flow. It is not read in the visible part of
# this notebook - presumably consumed further down or by a helper module (TODO confirm).
treatment_flow=True

### Parsing draw.io file

In [5]:
# Parse the draw.io XML file and collect the mxCell elements of every tab into one flat list
data = etree.parse(inputfile_tt)    # ElementTree wrapping the whole document
root = data.getroot()               # top-level element of that tree
pages = root.findall('.//diagram')  # one <diagram> element per tab of the drawing

objects = []  # mxCell elements of all pages, concatenated in page order

for page in pages:
    page_id, page_name = page.attrib['id'], page.attrib['name']
    print('Page ID:', page_id, 'Page name:', page_name)
    objects_in_page = page.findall('.//mxGraphModel//mxCell')
    objects += objects_in_page

OSError: Error reading file '../forms-clinical/ped/release20221128/ped_tt.drawio': failed to load external entity "../forms-clinical/ped/release20221128/ped_tt.drawio"

In [None]:
# Flatten the collected mxCell elements into a dataframe; missing entries become empty strings
df_raw = treetodataframe(objects).fillna('')

# maintain compatibility with old script:
df_raw['tag'] = ''
has_label = df_raw['label'] != ''
df_raw.loc[has_label, 'value'] = df_raw['label']  # a non-empty 'label' overrides 'value'
df_raw['xml-parent'] = df_raw['parent']

# keep only the columns used by the rest of the notebook, in a fixed order
kept_columns = [
    'tag', 'id', 'value', 'style', 'xml-parent',
    'source', 'target', 'name', 'odk_type', 'min', 'max', 'required',
    'constraint_message', 'x', 'y',
]
df_raw = df_raw[kept_columns]

### Quality checks

In [None]:
# Sanity checks on the raw drawing data; the checks themselves live in the project module `qualitychecks_pd`
qcpd.check_node_type(df_raw) # check if all objects have an odk_type
qcpd.check_rhombus_refer(df_raw) # check if all rhombus refer to an existing node
qcpd.check_edge_connection(df_raw) # check if all edges are well connected
# node types whose outgoing edges are expected to carry a Yes/No label
types = ['rhombus', 'select_one yesno']
qcpd.check_edge_yesno(df_raw, types) # check if all edges leaving rhombus and select_one yesno have Yes/No

### Pause function

In [None]:
# Identify break points for the PAUSE function: nodes whose style carries the fill colour #cdeb8b
is_breakpoint = df_raw['style'].str.contains('fillColor=#cdeb8b', na=False)
df_pause = df_raw.loc[is_breakpoint, ['id', 'name', 'odk_type']].assign(flowtype='treatment')

# rows are appended without a header, so every run adds to the existing 'breakpoints.csv'
df_pause.to_csv('breakpoints.csv', mode='a', index=False, header=False)

### Constraint column

In [None]:
# Build the 'constraint' column from the 'min' / 'max' attributes of each node.
# Work on a copy so that df_raw keeps the untouched drawing data (including 'x' and 'y').
df = df_raw.copy()
df.drop(columns=['x', 'y'], inplace=True)

has_min = df['min'] != ''
has_max = df['max'] != ''

df['constraint'] = ''
# lower bound only (the upper bound, if any, is appended below)
df.loc[has_min, 'constraint'] = '.>=' + df['min']
# upper bound only: must not start with a dangling ' and '
df.loc[has_max & ~has_min, 'constraint'] = '.<=' + df['max']
# both bounds: '.>=min and .<=max'
df.loc[has_max & has_min, 'constraint'] = df['constraint'] + ' and .<=' + df['max']

# 'min' and 'max' are now encoded in 'constraint'
df.drop(columns=['min', 'max'], inplace=True)

### Required fields
if integers and decimals are not REQUIRED, the expression towards the downstream fields must be removed. See below under **Expression for each row**

In [None]:
# Translate the drawing's 'yes' into the expression 'true()'; every other value of 'required' is left untouched
df.loc[df['required']=='yes','required']='true()'

### Clean html

In [None]:
# remove html formatting everywhere, except in 'note' and 'help-messages' (not allowed there in CHT)
# m selects every row that is neither a 'note' nor a 'help-message'
m = ~df['odk_type'].isin(['note','help-message'])
df.loc[m,'value'] = df.loc[m,'value'].apply(lambda x: ch.remove_html(x) if x!=None else None)

# clean html in 'note' and 'help-messages'
# NOTE(review): the mask computed on the next line is never used - ch.remove_html_value is applied to
# EVERY row of 'value', not only to notes and help-messages. Confirm whether this is intended or whether
# the assignment should be restricted with df.loc[m, 'value'].
m = df['odk_type'].isin(['note','help-message'])
df['value'] = df['value'].apply(lambda x: ch.remove_html_value(x) if x!=None else None)

# only for somalia: remove all html and split notes with multiple headings
if form_id == 'almsom':
    # here m is rebound to the 'note' rows only
    m = df['odk_type']=='note'
    df.loc[m,'value'] = df.loc[m,'value'].apply(lambda x: ch.clean_multi_headings(x))    

### for Somalia: split multiheadline nodes into singletons

In [None]:
if form_id == 'almsom':
    df = mhsplit.split_mh(df)

    # get rid of junk characters around the heading like 'TT Box «':
    # keep only the text enclosed in «...»; values without such a pair stay unchanged
    heading_pattern = re.compile('(?<=«).*?(?=»)')

    def _extract_heading(text):
        """Return the text between the first «...» pair of `text`, or `text` itself if there is none."""
        found = heading_pattern.search(text)
        return text if found is None else found.group(0)

    df['value'] = [_extract_heading(text) for text in df['value']]

    # clean all html from the entire df
    df['value'] = df['value'].apply(ch.html2plain)

### Give name to diagnoses

In [None]:
# properly name the diagnose calculates in the TT drawing
# diagnose_order is a CSV path restored via %store; the code below reads its 'Name' and 'id' columns
diagnose_hierarchy = pd.read_csv(diagnose_order)

# 'map' is a temporary join key: the cleaned diagnose name on one side, the cleaned node text on the other
diagnose_hierarchy['map']= diagnose_hierarchy['Name'].apply(ch.clean_name) 
df['map'] = df['value'].astype(str)
df['map'] = df['map'].apply(ch.clean_name)

# m: 'calculate' rows whose cleaned text matches a known diagnose (m is reused by the following cell)
m = df['map'].isin(diagnose_hierarchy['map']) & (df['odk_type']=='calculate')
# reset_index() keeps the original row labels in an 'index' column so the result can be written back
dfa = df.loc[m].reset_index()
dfa = dfa[['index','map']].merge(diagnose_hierarchy[['id','map']],how='left',on='map')
dfa.set_index('index',inplace=True)
# the diagnose 'id' from the CSV becomes the 'name' of the matching calculate
dfa.rename(columns={'id':'name'},inplace=True)
# update() aligns on the row labels and overwrites df with the non-NA values of dfa
df.update(dfa)

# the join key is no longer needed
df.drop(columns=['map'],inplace=True)

In [None]:
# get diagnose hierarchy expressed as id's
# NOTE: relies on the mask m computed in the previous cell (matched diagnose calculates)
dfa = df.loc[m,['id','name']].set_index('name') # slice of df containing the diagnoses (unsorted)
# reindex() puts the slice into the order of the CSV; diagnoses missing from the drawing become NaN
dfa = dfa.reindex(list(diagnose_hierarchy['id']))['id'] # that slice, but sorted and 'nan' dropped 
# (these are diagnoses that exist in dx but not in tt, mostly non-severe ones that have no TT)
diagnose_id_hierarchy = list(dfa.dropna())

In [None]:
# warn when at least one 'calculate' still carries the bare name 'd_'
unmatched_diagnoses = (df['name'] == 'd_') & (df['odk_type'] == 'calculate')
if unmatched_diagnoses.any():
    print('There were unmatched diagnose names!')

### Resolve name duplicates

In [None]:
# Duplicate names are not allowed except for rhombus, calculates (which will be combined later) and select_options 
# THIS IS BAD PRACTICE!
# (the automatic renaming below is disabled: duplicate names are currently left as they are)
#m = df.duplicated(subset=['name'],keep=False) & ~df['odk_type'].isin(['calculate', 'rhombus', 'select_option'])
#df.loc[m, 'name']=df['name']+df['id']

# from here on df is indexed by the object id ('id' is no longer a column of df)
df.set_index('id',inplace=True)

### Make dataframe with edges

In [None]:
# an object with both a 'source' and a 'target' is a connector (arrow); 'value' holds its label
df_arrows=df.loc[(df['source']!='') & (df['target']!=''),['source','target','value']]

# drop arrows from df
df.drop(df_arrows.index,inplace=True)

### take into account shortcuts

In [None]:
# take into account shortcuts
# A 'goto' object stands in for another node; arrows pointing at it are redirected to that node.
dfa = df_raw.loc[df_raw['odk_type']=='goto'].copy() # extract shortcut elements and put in a new dataframe
# the first 9 characters of the name are cut off (presumably a prefix such as 'shortcut_' - TODO confirm)
dfa.loc[dfa['odk_type']=='goto','name'] = dfa.loc[dfa['odk_type']=='goto','name'].str[9:] # remove prefix
# merge with raw-data to get the id of the exit element
# after the merge 'id_x' is the id of the goto object and 'id_y' the id of the node with the same name
dfa = dfa.reset_index().merge(df_raw.reset_index()[['id','name']],how = 'left', on='name') 
exitmap = dict(zip(dfa['id_x'],dfa['id_y'])) # convert into a dictionnary 
df_arrows['target'] = df_arrows['target'].replace(exitmap) # replace the shortcut elements by the exit-element in df_arrows
df.drop(df.loc[df['odk_type']=='goto'].index,inplace=True) # drop shortcuts from df_survey

### extract images

In [None]:
# make sure the folder for images and other media exists; an already existing folder is left as it is
# (later change to make folder compliant to CHT)
os.makedirs('media', exist_ok=True)

In [None]:
# finding png images that belong to container-hint-media (not included are those that belong to select_options)
# m: objects whose style embeds a png; their odk_type is rewritten to 'png-image<name>.png',
# which doubles as the file name used below
m = df['style'].str.contains("image/png",na=False)
df.loc[m,'odk_type']='png-image'+df.loc[m,'name']+'.png'

# getting a dataframe with png-images only (better for joining with df later)
# images:rows where 'xml-parent' is inside the index of rows that have the entry 'container_hint_media' in odk_type column, 
# of those rows we extract those where the 'type' column contains the substring 'png-image'
# and of the result we just take the columns 'xml-parent', 'odk_type' and 'style'
# 'xml-parent' is the container it belongs to and the line that will contain the info about the image
# 'odk_type' contains also the file name .png
# 'style' contains the actual image data

df_png=df.loc[df['xml-parent'].isin(df.loc[df['odk_type']=='container_hint_media'].index) 
              & df['odk_type'].str.contains('png-image',na=False),
              ['xml-parent','odk_type','style']] # images that are in 'containers_hint_media'

# getting image data from 'style' column for all images (from containers AND select_options) and storing it to disk
df_pngAll=df.loc[df['odk_type'].str.contains('png-image',na=False),['xml-parent','odk_type','style']]
for index, row in df_pngAll.iterrows():
    string = row['style'] 
    # NOTE(review): re.search returns None when no ';' follows the image data, which would raise here
    img_data=re.search('image/png,(.+?);',string).group(1) # extract image data from 'style' column using regex
    with open('media/'+row['odk_type'], "wb") as fh:
        fh.write(base64.decodebytes(img_data.encode('ascii'))) # decode the base64 text into bytes and save

# turn df_png into a lookup 'container id -> image file name'
df_png.rename({'xml-parent':'container_id','odk_type':'image::en'},axis=1,inplace=True)
index_delete=df_png.index # ids of the image objects themselves (captured before the index is replaced)
df_png.set_index('container_id',inplace=True)
df_png.drop('style',axis=1,inplace=True)

# joinging df and df_png (this adds the media-image column to df)
df=df.join(df_png)

# remove the rows with those 'png messages' in df as they are no longer needed
df.drop(index_delete,inplace=True)

In [None]:
# finding jpeg images that belong to container-hint-media (not included are those that belong to select_options)
# objects whose style embeds a jpeg get the odk_type 'jpeg-image<name>.jpeg', which doubles as the file name;
# `df.name` is attribute access to the 'name' column
df.loc[df['style'].str.contains("image/jpeg",na=False),'odk_type']='jpeg-image'+df.name+'.jpeg'

# getting a dataframe with jpeg-images only (the variable names df_png / df_pngAll are reused from the png cell)
# images:rows where 'xml-parent' is inside the index of rows that have the entry 'container_hint_media' in odk_type column, 
# of those rows we extract those where the 'odk_type' column contains the substring 'jpeg-image'
# and of the result we just take the columns 'xml-parent', 'odk_type' and 'style'
# 'xml-parent' is the container it belongs to and the line that will contain the info about the image
# 'odk_type' contains also the file name .jpeg
# 'style' contains the actual image data

df_png=df.loc[df['xml-parent'].isin(df.loc[df['odk_type']=='container_hint_media'].index) 
              & df['odk_type'].str.contains('jpeg-image',na=False),
              ['xml-parent','odk_type','style']] # images that are in 'containers_hint_media'

# getting image data from 'style' column for all images (from containers AND select_options) and storing it to disk
df_pngAll=df.loc[df['odk_type'].str.contains('jpeg-image',na=False),['xml-parent','odk_type','style']]
for index, row in df_pngAll.iterrows():
    string = row['style'] 
    # NOTE(review): re.search returns None when no ';' follows the image data, which would raise here
    img_data=re.search('image/jpeg,(.+?);',string).group(1) # extract image data from 'style' column using regex
    with open('media/'+row['odk_type'], "wb") as fh:
        fh.write(base64.decodebytes(img_data.encode('ascii'))) # decode the base64 text into bytes and save

# turn df_png into a lookup 'container id -> image file name'
df_png.rename({'xml-parent':'container_id','odk_type':'image::en'},axis=1,inplace=True)
index_delete=df_png.index # ids of the image objects themselves (captured before the index is replaced)
df_png.set_index('container_id',inplace=True)
df_png.drop('style',axis=1,inplace=True)

# write the jpeg file names into df; unlike the png cell this uses update(), which only overwrites
# columns that already exist in df, so it relies on 'image::en' having been created by the png cell
df.update(df_png)

# remove the rows with those 'jpeg messages' in df as they are no longer needed
df.drop(index_delete,inplace=True)

### Create and populate 'help' & 'hint' columns

In [None]:
# For each message type, move the text of the message objects into a column of their parent row
for s in ['hint-message', 'help-message']:

    dfa=df_raw.loc[df_raw['odk_type']==s,['xml-parent','value']] # dataframe with help-fields / hint-fields only
    drop_index = df_raw.loc[df_raw['odk_type']==s, 'id'] # ids of the message objects themselves
    dfa.set_index('xml-parent', inplace = True) # in order to join dfa and df on index
    sa = s[:-8]+'::en' # strip the 8 characters of '-message' -> 'hint::en' / 'help::en'
    dfa.rename(columns = {'value':sa}, inplace = True) 
    df=df.join(dfa) # this adds the help message column to df
    df.drop(drop_index, inplace = True) # remove 'help' rows from df (that data is now in the 'help' column)
    
# rows without a hint / help message get an empty string instead of NaN
df.fillna('', inplace = True)

In [None]:
# make a dataframe that will be needed later to replace sources in df_arrows which are inside a container, by the container itself
# rows: every object whose parent is a 'container_hint_media' or a 'container_page' (index = id of the object)

df_new_arrow_sources = df.loc[df['xml-parent'].isin(df.loc[df.odk_type=='container_hint_media'].index) 
                              | df['xml-parent'].isin(df.loc[df.odk_type=='container_page'].index),['xml-parent','odk_type']]
df_new_arrow_sources.rename({'xml-parent':'container_id','odk_type':'odk_type_of_content'},axis=1,inplace=True)

# add also the type of the container (page or hint-image)
# after this merge the column 'odk_type' holds the type of the CONTAINER, not of the object
df_new_arrow_sources = df_new_arrow_sources.merge(df[['odk_type']],how='left',left_on='container_id',right_index=True)

### replace 'container_hint_media' labels with those of their children & drop children from df

In [None]:
# ids of all hint-media containers, taken from the raw data
container_ids = df_raw[df_raw['odk_type']=='container_hint_media']['id']
# m: children of those containers that are neither images nor hint / help messages, i.e. the label objects
m = df_raw['xml-parent'].isin(container_ids) & ~df_raw['style'].str.contains('image',na=False) & ~df_raw['odk_type'].isin(['hint-message', 'help-message'])
label_ids = list(df_raw[m]['id']) # used for dropping the labels from df after uploading info to container rows
df_label = df_raw.loc[m, ['xml-parent','value','odk_type','name', 'id']] # all the label-children of containers 
# index by the container id so that update() below writes into the container rows of df
df_label.set_index('xml-parent', inplace=True)
# ATTENTION! df_raw still has duplicate names -> duplicates in df_label['name'], so fix it now:
# (every repeated name after its first occurrence gets the object id appended)
df_label.loc[df_label.duplicated(subset = ['name']), 'name'] = df_label['name'] + df_label['id']
df.update(df_label) # update the containers' 'value', 'odk_type' and 'name'

df.drop(label_ids, inplace = True) # drop the children from df

In [None]:
# for connectors where the source is inside a container-hint-media, replace the source with the container itself
# only the objects whose container is a 'container_hint_media' are considered here
df_hint_media_objects = df_new_arrow_sources.loc[df_new_arrow_sources['odk_type']=='container_hint_media']
df_arrows = df_arrows.merge(df_hint_media_objects,how='left',left_on='source',right_index=True)
df_arrows.rename(columns={'odk_type':'container_type'},inplace=True)
m=(df_arrows['container_type']=='container_hint_media')
df_arrows.loc[m,'source']=df_arrows.loc[m,'container_id']
# this assignment introduces the 'source_type' column of df_arrows
df_arrows.loc[m,'source_type']=df_arrows.loc[m,'odk_type_of_content']
# the helper columns of the merge are no longer needed
df_arrows.drop(columns=['container_id','odk_type_of_content','container_type'],inplace=True)
df_arrows.fillna('',inplace=True)

In [None]:
# making a dataframe with all choice options for all valueSets (choices tab)
# all elements whose 'xml-parent' is the 'id' of elements that have 'select_xxx' in type
# these are all options (elements of valuesets)
df_choices=df.loc[df['odk_type']=='select_option']
# look up the name and type of the parent select_xxx ('name_y' / 'odk_type_y' after the merge)
df_choices=df_choices.merge(df[['name','odk_type']],how='left',left_on='xml-parent',right_index=True)
# NOTE: the rows referenced by `index_delete` were already dropped from df in the cell that set that
# variable, so they are deliberately not dropped again here (a second drop would raise a KeyError)
df_choices=df_choices[['name_y','name_x','value','odk_type_y']]

# info: the 'odk_type' is kept because it will be necessary for making the logic (relevance column)
df_choices.rename({'name_y':'list_name','name_x':'name','value':'label::en','odk_type_y':'odk_type'},axis=1,inplace=True)

# remove the rows with 'choices' in df as they are no longer needed
df.drop(df_choices.index,inplace=True)

# make a dataframe that contains only remaining image objects (those that belong to options)
df_png = df.loc[df['odk_type'].str.contains('-image',na=False),'odk_type'].to_frame()
# drop the select_option images from df (inplace: a plain df.drop(...) would discard its result)
df.drop(df_png.index,inplace=True)
# merge with df_arrows to add the option each image points to (the 'target' of the image's arrow)
df_png = df_png.merge(df_arrows[['source','target']],how='left',left_index=True,right_on='source')
df_png.rename(columns={'odk_type':'image::en'},inplace=True)
# add the image name to df_choices
df_choices = df_choices.reset_index().merge(df_png[['image::en','target']],\
                                            how='left',left_on='id',right_on='target').set_index('id')
# drop the target column
df_choices.drop(columns=['target'],inplace=True)

# drop the remaining unspecified objects (pure xml formating related elements or drawing artefacts) 
df.drop(df.loc[df.value==''].index,inplace=True)

# add rows for yesno
# (column order: list_name, name, label::en, odk_type, image::en)
yes=['yesno','Yes','Yes','select_one','']
no=['yesno','No','No','select_one','']
df_choices.loc['zzz_yes']=yes
df_choices.loc['zzz_no']=no

In [None]:
# preparing df_arrows for logic part:
# result: every arrow gets 'source_type', 'source_name' and (for select options) 'select_option'

# rename index of df_arrows to reduce confusion
df_arrows.index.rename('Arrow ID',inplace=True)

# make a logical expression for each arrow

# add names of the source from df (for the case when the source is NOT a select_xxx) (names are the odk id's)
# the value is only needed for the rhombus

'''
First we merge with df and then again with df_choices. The reason for that: at this stage, 
the arrows originate from select_xxx options (opt1,opt2,...), but do not point to them. 
However, at a later stage, those arrows are modified so they originate from the select_xxx itself. If that step was done 
before, we would not need to have to merge twice here. When improving the form builder, consider changing this. 
'''
# merging with df to get the odk_type
df_arrows=df_arrows.merge(df[['name','odk_type']],how='left',left_on='source',right_index=True)
# moving the type of the source into the column 'source_type'
# (only where 'source_type' is still empty, i.e. it was not already set from a hint-media container)
df_arrows.loc[df_arrows['source_type']=='','source_type']=df_arrows.loc[df_arrows['source_type']=='','odk_type']
# droping the 'odk_type' column, it is no longer needed
df_arrows.drop(columns=['odk_type'],inplace=True)
df_arrows.fillna('',inplace=True)

# merging with df_choices to get the odk_type for when the source is a select_xxx
# both frames have a 'name' column, so the merge produces 'name_x' (from df) and 'name_y' (option name)
df_arrows=df_arrows.merge(df_choices[['list_name','name','odk_type']],how='left',left_on='source',right_index=True)
# as before for df, moving the type of the source into the column 'source_type'
df_arrows.loc[df_arrows['source_type']=='','source_type']=df_arrows.loc[df_arrows['source_type']=='','odk_type']
df_arrows.fillna('',inplace=True)

# merge names from df and df_choices into one column
# (string concatenation; this yields one name as long as at most one of the two columns is non-empty per arrow)
df_arrows['source_name']=df_arrows['name_x']+df_arrows['list_name']
df_arrows.drop(['name_x','list_name','odk_type'],axis=1,inplace=True)
df_arrows.rename(columns={'name_y':'select_option'},inplace=True)

### Expression for each row

In [None]:
# One logical expression per arrow, chosen by the type of the arrow's source.
# Source types not handled below keep an empty expression.
df_arrows['expression']=''

# add connectors to virtual objects (loaded objects)

# expression for yes no questions
# (the arrow label in 'value' is compared against the answer: ${name}='<label>')
df_arrows.loc[df_arrows['source_type']=='select_one yesno','expression'] = '${'+df_arrows['source_name'] + '}=' + '\'' + df_arrows.value + '\''

# expression for integers and decimals
# (currently disabled: arrows leaving integer / decimal fields get no expression)
#df_arrows.loc[(df_arrows['source_type']=='integer') | (df_arrows['source_type']=='decimal'),'expression'] = '${'+df_arrows['source_name'] + '}!=' + '\'\''
# for integers and decimals that are NOT required, the expression must be removed:
#m1 = df_arrows['source_type'].isin(['integer', 'decimal'])
#m2 = df_raw['odk_type'].isin(['integer', 'decimal'])
#df_arrow_int = df_arrows.loc[m1].reset_index().merge(df_raw.loc[m2, ['name', 'required']], how = 'left', left_on = 'source_name', right_on = 'name').set_index('Arrow ID')
# merge with df_raw to get the 'required'
#rowIDs = df_arrow_int.loc[df_arrow_int['required']=='no'].index
#df_arrows.loc[rowIDs, 'expression']=''

# expression for text-entry fields
# (true as soon as the field is not empty)
df_arrows.loc[df_arrows['source_type']=='text','expression'] = '${'+df_arrows['source_name'] + '}!=' + '\'\''

# expression for all the other select_one
df_arrows.loc[df_arrows['source_type']=='select_one','expression'] = '${'+df_arrows['source_name'] + '}=' + '\'' + df_arrows['select_option'] + '\''

# expression for select_multiple
df_arrows.loc[df_arrows['source_type']=='select_multiple','expression'] = 'selected(${'+df_arrows['source_name'] + '},\'' + df_arrows['select_option'] + '\')'

# expression for source being a calculate
df_arrows.loc[df_arrows['source_type']=='calculate','expression'] = '${'+df_arrows['source_name'] + '}=1'

In [None]:
# expression for target being a count---> in this case the expression depends not on the source but on the target!
counters = df[df['odk_type'] == 'count'].index  # ids of all 'count' objects
points_to_counter = df_arrows['target'].isin(counters)  # connectors that point to 'count' objects

# wrap the expression of every such connector into number(...)
df_arrows.loc[points_to_counter, 'expression'] = (
    'number(' + df_arrows.loc[points_to_counter, 'expression'] + ')'
)

# add arrow weight to counter: a label of 1, 2 or 3 on the connector becomes a multiplier
m = df_arrows['value'].isin(['1', '2', '3']) & points_to_counter
weighted_expression = df_arrows.loc[m, 'value'] + ' * ' + df_arrows.loc[m, 'expression']
df_arrows.loc[m, 'expression'] = weighted_expression

# for counters you must combine the expression of all icoming arrows into the one expression of that counter. 
# from there on, a rhombus, referring to a counter can lookup the entire expression

### Expression for rhombus

In [None]:
# Collect, for every arrow leaving a rhombus, the type of the field the rhombus refers to and the rhombus text
m = df_arrows['source_type']=='rhombus'
dfa = df_arrows.loc[m].copy()
# remove prefix 'stored_'
# ATTENTION! There is a BUG in pandas, replace(.... inplace = True) is not working!
# (m is indexed like df_arrows; it covers every row of dfa, so the whole 'source_name' column of dfa is rewritten)
dfa.loc[m, 'source_name'] = dfa.loc[m, 'source_name'].replace(r'^stored_', r'', regex = True)


# look up the odk_type that the rhombus is refering to
# (matched by name: the rhombus 'source_name' against the 'name' of the objects in df)
dfa = dfa.reset_index().merge(df[['odk_type','name']],how='left',left_on='source_name',right_on='name').set_index('Arrow ID')
# get rid of the 'name' column (was just needed for merging) and rename 'odk_type' column, to avoid confusion
dfa.drop('name',axis=1,inplace=True)
dfa.rename(columns={'odk_type':'rhombus_refer_to_odk_type'},inplace=True)

# look up the value of the rhombus, it contains info about the logic
# ('value_x' is the arrow label, 'value_y' the text of the rhombus itself)
dfa = dfa.merge(df[['value']],how='left',left_on='source',right_index=True)
dfa.rename(columns={'value_x':'value','value_y':'value_of_rhombus'},inplace=True)

# set all 'NaN' to empty strings
df_arrows=df_arrows.fillna('')
# create the two new columns first: update() only writes into columns that already exist
df_arrows['rhombus_refer_to_odk_type']=''
df_arrows['value_of_rhombus']=''

# copy the non-NA values of dfa into the matching rows of df_arrows (aligned on 'Arrow ID')
df_arrows.update(dfa)

In [None]:
# when rhombus refers to a an integer or decimal
m = (df_arrows['source_type']=='rhombus') & (df_arrows['rhombus_refer_to_odk_type'].isin(['integer','decimal']))
# only keep what comes after <,= or >
df_arrows.loc[m,'value_of_rhombus'] = df_arrows.loc[m,'value_of_rhombus'].str.replace(r'^[^<=>]+','',regex=True)
# remove the '?' at the end
df_arrows.loc[m,'value_of_rhombus'] = df_arrows.loc[m,'value_of_rhombus'].str.replace('?','',regex=False)
df_arrows.loc[m,'expression'] = '${'+df_arrows['source_name'] + '}' + df_arrows['value_of_rhombus']
# for the 'No' arrow the comparison operator is inverted
# NOTE(review): this replace() runs over ALL columns of the selected rows, not only 'expression', and the
# result of a dict + regex replacement with overlapping patterns ('<' is part of '<=') depends on how pandas
# applies the pairs - confirm with the pandas version in use. A plain '=' is not inverted here.
df_arrows.loc[m & (df_arrows['value']=='No')] = df_arrows.loc[m & (df_arrows['value']=='No')].replace({'<=':'>','>=':'<','<':'>=','>':'<='},regex=True)

# when rhombus refers to a select_one yesno
m = (df_arrows['source_type']=='rhombus') & (df_arrows['rhombus_refer_to_odk_type']=='select_one yesno')
df_arrows.loc[m,'expression'] = '${'+df_arrows['source_name'] + '}=' + '\'' + df_arrows.value + '\''

# now the real select_ones:
# (m covers rhombi that refer to a select_one AND those that refer to a select_multiple)
m = (df_arrows['source_type']=='rhombus') & df_arrows['rhombus_refer_to_odk_type'].isin(['select_one', 'select_multiple'])
# the option label is the text between square brackets in the rhombus text
df_arrows.loc[m,'value_of_rhombus'] = df_arrows.loc[m,'value_of_rhombus'].str.extract(r'\[(.*?)\]',expand=False)
# merge again with df_choices to get the 'name' of the selected option (also needed for select_multiple!)
df_arrows = df_arrows.reset_index().merge(df_choices[['list_name','name','label::en']], \
                how='left',left_on=['source_name','value_of_rhombus'],right_on=['list_name','label::en']).set_index('Arrow ID')
# when the outgoing arrow is YES (means that what is in RHOMBUS is TRUE)
df_arrows.loc[m & (df_arrows['value']=='Yes'),'expression'] =  '${'+df_arrows['source_name'] + '}=' + '\'' + df_arrows['name'] + '\''
# when the outgoing arrow is NO (means that what is in RHOMBUS is FALSE)
df_arrows.loc[m & (df_arrows['value']=='No'),'expression'] =  '${'+df_arrows['source_name'] + '}!=' + '\'' + df_arrows['name'] + '\''

# when rhombus refers to select_multiple
# NOTE(review): the two assignments below use the same mask m as the two above, so they overwrite the
# '=' / '!=' expressions for select_one as well; every rhombus on a select_one or select_multiple ends
# up with a selected(...) expression. Confirm whether separate masks were intended.
# when the outgoing arrow is YES (means that what is in RHOMBUS is TRUE)
df_arrows.loc[m & (df_arrows['value']=='Yes'),'expression'] = 'selected(${'+df_arrows['source_name'] + '},\'' + df_arrows['name'] + '\')'
# when the outgoing arrow is NO (means that what is in RHOMBUS is FALSE)
df_arrows.loc[m & (df_arrows['value']=='No'),'expression'] = 'not(selected(${'+df_arrows['source_name'] + '},\'' + df_arrows['name'] + '\'))'

In [None]:
# when rhombus refers to calculate
# (the referenced calculate is compared against 1 for 'Yes' and against 0 for 'No')
m = (df_arrows['source_type']=='rhombus') & (df_arrows['rhombus_refer_to_odk_type']=='calculate')
# when the outgoing arrow is YES (means that what is in RHOMBUS is TRUE)
df_arrows.loc[m & (df_arrows['value']=='Yes'),'expression'] = '${'+df_arrows['source_name'] + '}=1'
# when the outgoing arrow is NO (means that what is in RHOMBUS is False)
df_arrows.loc[m & (df_arrows['value']=='No'),'expression'] = '${'+df_arrows['source_name'] + '}=0'

In [None]:
# when rhombus refers to a count (in this case we must combine all 'expressions' of the incoming arrows into the count object 
# with ' + ') and put the result into the 'expression' of the rhombus that is refering to it
m = (df_arrows['source_type']=='rhombus') & (df_arrows['rhombus_refer_to_odk_type']=='count')
df_arrows.loc[m,'value_of_rhombus'] = df_arrows.loc[m,'value_of_rhombus'].str.replace(r'^[^<=>]+','',regex=True) # only keep what comes after <,= or >
df_arrows.loc[m,'value_of_rhombus'] = df_arrows.loc[m,'value_of_rhombus'].str.replace('?','',regex=False) # remove the '?' at the end

# new mask to get the df_arrows of all connectors that point to counters
m1 = df_arrows['target'].isin(df.loc[df['odk_type']=='count'].index) # mask for connectors that point to 'count' objects
gk = df_arrows.loc[m1].groupby('target') # group them by counters

for elem, group in gk:
    # for each counter (elem), combine the expressions of all incoming arrows into a single one, concatenated with +
    # (filter(None, ...) skips empty expressions)
    full_expression=' + '.join(filter(None,group['expression']))
    # put result into brackets, because comparison is executed BEFORE +
    full_expression = '(' + full_expression + ')'
    
    # lookup the 'name' of the counter in df, based on the id = target
    counter_name = df.loc[elem,'name']
    
    # check in df_arrows where the source_name is 'counter_name'
    # for the 'No' arrow we invert >, < and = of 'value of rhombus'
    # NOTE(review): the patterns of this dict + regex replacement overlap ('=' is part of '<=', '>=' and '!='),
    # so the outcome depends on how pandas applies the pairs - confirm with the pandas version in use
    m2 = (df_arrows['source_name']==counter_name) & (df_arrows['value']=='No')
    df_arrows.loc[m & m2,'value_of_rhombus'] = df_arrows.loc[m & m2,'value_of_rhombus'].replace({'<=':'>','>=':'<','<':'>=','>':'<=','=':'!=','!=':'='},regex=True)
    # final expression of the rhombus: '(<sum of the incoming expressions>)<comparison>'
    df_arrows.loc[m & (df_arrows['source_name']==counter_name),'expression'] = full_expression + df_arrows['value_of_rhombus']

In [None]:
# also drop the arrows that point to counters:
# only arrows whose target is an object of df that is not a 'count' are kept
non_counter_ids = df.loc[df['odk_type'] != 'count'].index

# columns that were only needed to build the expressions
obsolete_columns = ['value', 'value_of_rhombus', 'source_name', 'rhombus_refer_to_odk_type',
                    'list_name', 'label::en', 'name']

df_arrows = df_arrows.loc[df_arrows['target'].isin(non_counter_ids)].drop(columns=obsolete_columns)

# also drop count objects from df, they are no longer needed
df.drop(df[df['odk_type'] == 'count'].index, inplace=True)

In [None]:
'''A rhombus can refer to a field that is not in the drawing. For instance, in the TT flow, where values like fever are used
but not calculated. Or in CHT, when patient info or hospital info is loaded into the input section. 
For this, the symbols are drawn in the beginning of the flow, pointing to the note field 'Load Data'. 
Once this is done, it is handled correctly by the script and they get included. '''

### Change sources that are 'select_options' to the 'select_xxx' itself

In [None]:
# get the select_xxx for each select_option:
dfa = df_raw.loc[df_raw['odk_type']=='select_option',['id', 'xml-parent']]
# some select_xxx are in a container-hint-media, their ids have been replaced with the ids of the containers
# therefore lookup the xml-parent of the select_xxx:
# (columns after the merge: 'id', 'xml-parent' of the option and 'id_y', 'xml-parent_y' of its select_xxx)
dfa = dfa.merge(df_raw[['id', 'xml-parent']], how = 'left', left_on='xml-parent', right_on = 'id', suffixes=('', '_y'))
# and if it is a container-hint-media, replace the 'xml-parent' of the select_option with the id of the container
container_ids = list(df_raw.loc[df_raw['odk_type']=='container_hint_media', 'id'])
m = dfa['xml-parent_y'].isin(container_ids)
dfa.loc[m, 'xml-parent'] = dfa.loc[m, 'xml-parent_y']

# make a dictionnary for replacing sources in df_arrows: option id -> id of its select_xxx (or of the container)
d = dict(zip(dfa['id'], dfa['xml-parent']))
# assign the result back instead of calling replace(..., inplace=True) on the column selection:
# the inplace call on a selected column is not guaranteed to write through to df_arrows
df_arrows['source'] = df_arrows['source'].replace(d)

In [None]:
# for connectors where the source is inside a container-hint-media, replace the source with the container itself
# the merge adds 'container_id', 'odk_type_of_content' and the container's 'odk_type' to every arrow
# whose source lives inside a hint-media or page container
df_arrows = df_arrows.merge(df_new_arrow_sources,how='left',left_on='source',right_index=True)
df_arrows.fillna('',inplace=True)
df_arrows.rename(columns={'odk_type':'container_type'},inplace=True)
m=(df_arrows['container_type']=='container_hint_media')
df_arrows.loc[m,'source']=df_arrows.loc[m,'container_id']
df_arrows.loc[m,'source_type']=df_arrows.loc[m,'odk_type_of_content']

In [None]:
# Connect every page container to the objects at which the flow inside that page starts
# get container_ids of pages
# (only pages that are the container of at least one arrow source are found this way)
container_ids = df_arrows.loc[df_arrows['container_type']=='container_page','container_id'].unique()

# the ids of objects which are inside the page - containers
page_objects = df.loc[df['xml-parent'].isin(container_ids)].index

# get those page_objects which are the starting point of the flow INSIDE the page
# (objects that are not the target of any arrow)
page_starts = page_objects[~page_objects.isin(df_arrows['target'])]

# get the page_starts that are a rhombus (needed for later)
page_starts_rhombus = df.loc[page_starts].loc[df['odk_type']=='rhombus'].index

# get the page_objects where all objects in a single page are notes (needed for later)

# get page_start - container_id pairs
# (one new connector per page start: the page is the source, the start object the target)
dfnew_connectors = df.loc[page_starts,['xml-parent']].reset_index().rename(columns={'id':'target','xml-parent':'source'})

# add missing columns
dfnew_connectors = dfnew_connectors.reindex(columns=['source','target','source_type','expression','container_id','container_type'])
dfnew_connectors['source_type']='page'
dfnew_connectors.fillna('',inplace=True)

# concat that to df_arrows
df_arrows = pd.concat([df_arrows,dfnew_connectors])

# adding 'target_type' to df_arrows
df_arrows = df_arrows.merge(df['odk_type'],how='left',left_on='target',right_index=True)
df_arrows.rename(columns={'odk_type':'target_type'},inplace=True)

### Build DAG

In [None]:
# make a directed graph 
# (one edge per row of df_arrows, from 'source' to 'target'; no other column is attached to the edges)
dag = nx.from_pandas_edgelist(df_arrows, source='source', target='target', create_using=nx.DiGraph)

In [None]:
# check if the new graph is a DAG -> should evaluate to True
# (only a message is printed when the graph has cycles; execution continues)
if not nx.is_directed_acyclic_graph(dag):
    print('Your graph has loops. Please open them and repeat')

### Build relevance

In [None]:
# This is necessary because there are pages that contain 'note' fields only. 
# In this case notes that point out of the page have no 'expression'. This interrupts the flow. 
# The solution is to give those 'notes' as expression the 'relevant' of the page

# all objects whose parent is a 'container_page'
df_pageObjects = df.loc[df['xml-parent'].isin(df.loc[df['odk_type']=='container_page'].index)]

# get ids of pages that ONLY contain 'notes'
pure_note_pages=[]
gk = df_pageObjects.groupby('xml-parent')
for elem,frame in gk: 
    # a page qualifies when the number of its 'note' children equals the number of all its children
    if len(frame.index) == len(frame.loc[frame['odk_type']=='note']):
        pure_note_pages.append(elem)

# get all the 'notes' that point out pages:
# (source is a note inside a page, target is not an object inside any page)
df_notes_out_pages = df_arrows.loc[df_arrows['source'].isin(df_pageObjects.index) & \
                                    ~df_arrows['target'].isin(df_pageObjects.index) & (df_arrows['source_type']=='note')]

# among those get those notes that belong to 'pure_note_pages' - these are the notes you are looking for
df_notes_outof_pure_notes_pages = df_notes_out_pages.loc[df_notes_out_pages['container_id'].isin(pure_note_pages)]
# result: a lookup 'note id -> id of its page', indexed by the note id
df_notes_outof_pure_notes_pages = df_notes_outof_pure_notes_pages[['source','container_id']]
df_notes_outof_pure_notes_pages.set_index('source',inplace=True)

In [None]:
# sort the objects 
# this is not the correct sorting for the flow, but any topological sort is good for making 'relevance'
# the real sorting for the form is done later, after nodes have been combined
# (node_hierarchy is a list of node ids in which every source comes before its targets)
node_hierarchy=list(nx.lexicographical_topological_sort(dag)) 

In [None]:
# start with an empty 'relevant' for every object
df['relevant']=''

Building the logic: 
1. It must be done for each object independently, not for all at once, so there is a for loop
2. Start on the very top and go down the tree. This is the reason why we have topologically sorted df in the previous step
3. For each object lookup all sources in df_arrows (get all rows from df_arrows where the object is the target). 
4. Each source -> target arrow has a logic expression and the entire 'relevant' of the target is just the logic expressions of all incoming arrows, combined with a OR. 
5. A particular attention must be paid when a source is a 'note'. For those sources the 'expression' is empty. That is because there is no decision taken for those objects. A note is just an info to the user and forward to the next field. There is also only one arrow coming out from a note. In this case we must use the relevant of the 'note' and 'calculate' source itself as the expression of note -> target. This would also be the case for 'calculate' objects, but their 'expression' has been populated already. If we do not do that, then the target would pop up independently of the 'note/calculate' condition. That would be wrong. Therefore, in df_source, the 'expression' for 'note' and 'calculate' is the 'relevant' of those sources. To get those into df_sources, we merge it with df accordingly. Therefore it is also important to do the logic from top to bottom, to assure that the relevant of the previous objects has already been done. 
6. Another special case is the rhombus (previously entered data). Here we also need the relevant of the rhombus 
    itself, because it must be combined with the expression by an AND. The rhombus itself is not shown to the user, 
    so the logic depends on its relevant. For the terms to be evaluated in the right order, the 'relevant' must be put 
    into brackets first. 
7. After those steps we have a df_sources dataframe where the 'expression' is correct for each of the arrows (each row). 
    As said in (4) they are combined with OR and written into the 'relevant' of the object we are looking at. 
8. Another major problem are pages that contain ONLY notes. As objects inside a page automatically inherit the relevant 
    of the page itself, their expression is entirely empty. The exit note then points to a target outside the page and 
    has no expression at all. The following object would then always be displayed 
    (or never, if there are other arrows pointing to it). 
    To deal with this we identify all notes that point out of a page containing only notes and use the relevant of that page as their expression. 
9. Another problem is when the first object in a page is a rhombus. It also gets no relevant generated. As a consequence, 
    we would get just the expression with 'and ()'. In that case the expression is combined with the relevant of the page instead. 

In [None]:
# Build 'relevant' node by node, top-down, so that the 'relevant' of every source is
# already final when its targets are processed.
for elem in node_hierarchy:
    # all arrows pointing to 'elem'
    points_to_elem = df_arrows['target']==elem
    df_sources = df_arrows.loc[points_to_elem,['source','source_type','expression']]
    # pull in the 'relevant' of each source;
    # 'xml-parent' is needed for a rhombus at the beginning of a page
    df_sources = df_sources.merge(df[['relevant','xml-parent']],how='left',left_on='source',right_index=True)
    # second merge: 'relevant' of the parent of each source, available as 'relevant_page'
    df_sources = df_sources.merge(df[['relevant']],how='left',left_on='xml-parent',right_index=True,suffixes=('', '_page'))

    # the masks only read columns that are never modified below ('expression' is the only one written)
    source_is_rhombus = df_sources['source_type'].isin(['rhombus'])
    source_lacks_relevant = df_sources['relevant']==''
    source_has_relevant = df_sources['relevant']!=''
    source_on_page = df_sources['xml-parent'].isin(container_ids)
    source_passes_relevant = df_sources['source_type'].isin(['note', 'integer', 'decimal'])
    source_leaves_pure_note_page = df_sources['source'].isin(df_notes_outof_pure_notes_pages.index)

    # rhombus on a page without a relevant of its own: AND the expression with the relevant of the page
    m = source_is_rhombus & source_lacks_relevant & source_on_page
    df_sources.loc[m,'expression'] = df_sources.loc[m,'expression'] + ' and (' + df_sources.loc[m,'relevant_page'] + ')'

    # rhombus with a relevant of its own: AND the expression with that relevant
    m = source_is_rhombus & source_has_relevant
    df_sources.loc[m,'expression'] = df_sources.loc[m,'expression'] + ' and (' + df_sources.loc[m,'relevant'] + ')'

    # note, integer or decimal: no decision is taken there, so the expression is the relevant of the source
    df_sources.loc[source_passes_relevant,'expression'] = df_sources.loc[source_passes_relevant,'relevant']

    # note pointing out of a page that only contains notes: the expression is the relevant of the page
    df_sources.loc[source_leaves_pure_note_page,'expression'] = df_sources.loc[source_leaves_pure_note_page,'relevant_page']

    # counters add up their incoming expressions (number + number), everything else is OR-ed
    joiner = ' + ' if df.loc[elem,'odk_type']=='count' else ' or '
    df.loc[elem,'relevant'] = joiner.join(filter(None,df_sources['expression']))

    # report expressions with an empty operand
    combined = df.loc[elem,'relevant']
    if any(fragment in combined for fragment in ('( and (', '( or (', '()')):
        print(elem, 'error!')
        print(combined)

### Remove rhombus nodes from graph and df

In [None]:
# Rhombus objects are not fields of the form: cut them out of the graph (gt.cut_node is
# applied once per rhombus to the edge list), rebuild the graph and drop their rows from df.
rhombus_id = df.index[df['odk_type']=='rhombus']

new_edges=list(dag.edges)
for node in rhombus_id:
    new_edges = gt.cut_node(new_edges,node)

dag = nx.from_edgelist(new_edges, create_using=nx.DiGraph)
df.drop(index=rhombus_id, inplace=True)

### write node attributes

In [None]:
# Copy 'relevant', 'name', 'type' and 'text' onto the graph nodes.
# Each entry: (node attribute, node ids, values). 'type' is taken from df_raw, the others from df.
node_attribute_sources = [
    ('relevant', df.index, df['relevant']),
    ('name', df.index, df['name']),
    ('type', df_raw['id'], df_raw['odk_type']),
    ('text', df.index, df['value']),
]
for attribute, node_ids, values in node_attribute_sources:
    n = dict(zip(node_ids, values))
    nx.set_node_attributes(dag, n, name = attribute)

### make a select-multiple for diagnosis and connect it with the dataloader

In [None]:
# connecting the diagnosis to a select multiple and to the dataloader should happen
# before the relevant is built!

In [None]:
# Add a 'select_multiple' node called 'select_diagnosis' to the graph and connect the
# diagnosis calculates (diagnose_id_hierarchy) to it, using the graphtools helper.
# A relevant is added to the diagnosis: it is needed to deal with multiple entry nodes in the TT diagram.
# It will also be kept for a standalone TT where the user selects the diagnosis.
n = 'select_diagnosis'
n_attrib = {'name':'select_diagnosis', 'relevant':'', 'type':'select_multiple', 'text':'Select diagnosis'}
dag = gt.add_calculate_selector(dag, n, n_attrib, diagnose_id_hierarchy)
# add the diagnoses as choices (id, name, text) of the list 'select_diagnosis' to df_choices
n_diagnoses = [(i, dag.nodes[i]['name'], dag.nodes[i]['text']) for i in diagnose_id_hierarchy]
list_name = 'select_diagnosis'
df_choices = oh.add_calculate_to_choices(dag, n_diagnoses, list_name, df_choices)

In [None]:
# connect the dataloader with the 'select_multiple diagnosis' node
# this ensures that the dataloader elements show up on top of the form and not at the bottom
# the dataloader is the object whose text is exactly 'Load Data'; .iloc[0] raises an IndexError
# when the drawing contains no such object
id_dataloader = df_raw.loc[df_raw['value']=='Load Data', 'id'].iloc[0] # get ID of the dataloader
dag.add_edge(id_dataloader, 'select_diagnosis') # connect dataloader to select_diagnosis

In [None]:
# add a 'select_dataload' multiple choice that has the calculates of the dataloader as select_options
# this will allow to set them on startup
# adapt also the relevant of those calculates so they react to the select_multiple dataloader 
n = 'select_dataload'
n_attrib={'name':'select_dataload', 'relevant':'', 'type':'select_multiple', 'text':'Select previous data'}
# calculates of the dataloader: nodes of type 'calculate' with an edge pointing to the dataloader
dataloader_calculates = [i for (i,j) in dag.in_edges(id_dataloader) if dag.nodes[i]['type']=='calculate']
dag = gt.add_calculate_selector(dag, n, n_attrib, dataloader_calculates)
# add the dataloader calculates as choices (id, name, text) of the list 'select_dataload' to df_choices
list_name = 'select_dataload'
n_dataload_calculates = [(i, dag.nodes[i]['name'], dag.nodes[i]['text']) for i in dataloader_calculates]
df_choices = oh.add_calculate_to_choices(dag, n_dataload_calculates, list_name, df_choices)

### Calculate the longest path and select_option hierarchy for topological sorting

In [None]:
# as indicated in the conceptual document, this step must be done before contracting the nodes, in order
# to avoid that diagnosis branches are mixed up after contracted nodes. 

In [None]:
# 1. Diagnosis and data_loader calculates can now be selected in a select_multiple.
#    Given the right order, the graph will be sorted correctly; for this to work we
#    simulate a select_option ordering of the diagnoses: evenly spaced, decreasing
#    priorities ending at 0.01, assigned in the order of diagnose_id_hierarchy.
# 2. The other select_options of the TT graph (if existent) are merged in; on equal keys
#    the value from the form wins.
# -> we have a complete TT graph that can be sorted by the same functions as the DX graph
n_priorities = len(diagnose_id_hierarchy)+1
a = linspace(n_priorities/100, 0.01, num=n_priorities)
d = dict(zip(diagnose_id_hierarchy, a))  # diagnosis priority
opt_prio = d | gt.hierarchy_select_options(df_raw)  # diagnosis priorities + select_options of the form

In [None]:
# get a graph entry point (typically a node pointing to the dataloader)
# it is the start node for the longest-path calculation that follows
rootelement = gt.get_graph_entry_point(dag, df_raw)

In [None]:
# longest-path lengths as computed by the graphtools helper, starting at rootelement and
# using the select_option priorities; presumably a mapping node -> length, since it is
# passed to nx.set_node_attributes afterwards (confirm in graphtools)
dist = gt.get_longest_path_lengths(dag, rootelement, opt_prio, df_raw)

In [None]:
# assign distance (longest_path_length) to the nodes in dag
# stored as node attribute 'distance_from_root'; read again when sorting after node contraction
nx.set_node_attributes(dag, dist, name = 'distance_from_root')

### Contract nodes

As you combine two duplicates, you must add the 'relevance' of the 'predecessor' node to the relevance of the successor node before contraction

In [None]:
# rows with an empty odk_type are elements that do not belong to the form
not_in_form = df.index[df['odk_type']=='']
df.drop(index=not_in_form, inplace=True)
# cleaned name of the object text; used (with odk_type) to group duplicates for contraction
df['filename']=df['value'].apply(ch.clean_name)

In [None]:
# how to do it: 
# groupby (filename, odk_type) combo
# iterate over all groups
# make a list of all pairwise combinations of nodes per group and iterate
# contract each pair and see if you still have a DAG
# Hint: in networkx nx.contracted_nodes(G,a,b) merges the nodes a and b to one new node CALLED 'a', 
# the attributes of 'b' end up as the 'contraction' attribute in 'a'
# if yes, take the relevant of all the predecessor nodes of k
# combine with OR and write it to all successor nodes with AND

In [None]:
# Contract duplicate nodes: objects with the same cleaned name ('filename') and the same
# odk_type are merged pairwise, as long as the merged graph is still a DAG.
#
# The passes are repeated until a complete pass merges nothing. A pair that was rejected can
# become mergeable after another contraction has shortened the path between its nodes.
# The previous condition `while dag != dag2` compared graph objects by identity, so the
# loop never ended when the last attempted pair was rejected or when no pair was attempted.
merged_in_pass = True
while merged_in_pass:
    merged_in_pass = False
    g = df.groupby(['filename', 'odk_type'])
    for i, frame in g:
        if len(frame)>1:
            nodes = list(frame.index)
            for j, k in combinations(nodes, 2):
                # nodes already merged into another node are no longer in the graph
                if j in dag.nodes() and k in dag.nodes():
                    # merges k into j; the attributes of k end up in the 'contraction' attribute of j
                    dag2 = nx.contracted_nodes(dag, j,k, self_loops=False)
                    if nx.is_directed_acyclic_graph(dag2):
                        # 1. Get the relevant of all predecessors of k
                        r = ['(' + dag.nodes[p]['relevant'] + ')' for p in dag.predecessors(k) if dag.nodes[p]['relevant'] !='']
                        if len(r)>0:
                            for s in dag.successors(k):
                                # 2. Append elements of 'r' joined with 'or' to all the successors 's' of 'k' with an 'and'
                                # NOTE(review): this writes to the graph from BEFORE the contraction. dag2 is a copy
                                # with its own attribute dicts, so the value is lost when dag is replaced below.
                                # Left unchanged because writing to dag2 instead would alter the generated
                                # relevance - confirm the intent before changing it.
                                dag.nodes[s]['relevant']= '(' + dag.nodes[s]['relevant'] + ')' + ' and (' + ' or '.join(r) + ')'
                        # 3. replace dag by dag2
                        dag = dag2
                        merged_in_pass = True
                    else:
                        print('In node', i, 'did not merge', j, 'and', k)

In [None]:
# make a combined relevance attribute out of the original 'relevant' and the 'contraction' attribute
# for every node that absorbed another node. A plain loop replaces the list comprehension:
# the helper's return value is not used, and the comprehension echoed a list as cell output.
for n in dag.nodes:
    if 'contraction' in dag.nodes[n]:
        gt.make_node_relevant(dag, n)

In [None]:
# take the max longest path from the original 'distance_from_root' and those from the contracted nodes
# for every node that absorbed another node. A plain loop replaces the list comprehension:
# the helper's return value is not used, and the comprehension echoed a list as cell output.
for n in dag.nodes:
    if 'contraction' in dag.nodes[n]:
        gt.make_node_distance(dag, n)

In [None]:
# get the CDSS compatible topological sorting of the graph,
# based on the node attribute 'distance_from_root' that was written before the contraction
topo_order = gt.topo_sort_cdss_attrib(dag, 'distance_from_root') # the complete sorting of the graph

In [None]:
# bring the rows of df into the order of topo_order; labels of topo_order that are not
# in df become rows filled with NaN (behaviour of DataFrame.reindex)
df=df.reindex(topo_order)

### Sorting nodes

In [None]:
# 1. diagnosis and data_loader calculates can now be selected in a select_multiple. 
# If the right order given, the graph will be correctly sorted
# for this to work we simulate a select_option ordering of the diagnosis
# 2. To this we concat the other select_options of the TT graph (if existent)
# -> we have a complete TT graph that can be sorted by the same functions as the DX graph
# NOTE(review): this repeats, statement for statement, the cell that computes opt_prio before
# the node contraction; it only reads diagnose_id_hierarchy and df_raw - confirm whether the
# recomputation is needed.
a = linspace((len(diagnose_id_hierarchy)+1)/100, 0.01, num=len(diagnose_id_hierarchy)+1)
d = dict(zip(diagnose_id_hierarchy, a))  # diagnosis priority 
opt_prio = gt.hierarchy_select_options(df_raw) # hierarchy of select_options in the form
opt_prio = d | opt_prio # combine both

In [None]:
# get a graph entry point (typically a node pointing to the dataloader)
# determined again here, this time on the graph after node contraction
rootelement = gt.get_graph_entry_point(dag, df_raw)

In [None]:
# get the CDSS compatible topological sorting of the graph
# this overwrites the topo_order that was derived from 'distance_from_root' after the contraction
topo_order = gt.get_topo_sort_cdss(dag, rootelement, opt_prio, df_raw) # the complete sorting of the graph

In [None]:
# bring the rows of df into the order of the new topo_order (replaces the earlier ordering)
df=df.reindex(topo_order)

### Taking into account pages

In [None]:
# The topological sorting does not take into account pages (page-containers). Objects that are
# on the same page must be grouped in order to wrap them up in begin_group ... end_group in odk.
# The topological sort does not know that. Therefore df is re-sorted: all objects that belong to
# a page are lined up below their page container, preserving their overall order in df.
pageids = df.loc[df['odk_type']=='container_page'].index
df.reset_index(drop=False, inplace=True)
# current row position as sort key; float from the start because the members of a page get
# fractional positions below (writing fractions into an integer column relies on silent upcasting)
df['new_index']=pd.to_numeric(df.index).astype(float)
# every object whose parent is a row of df takes over the position of that parent
df = df.merge(df[['id','new_index']], how='left', left_on='xml-parent', right_on='id', suffixes = ('', '_p'))
df.loc[df['new_index_p'].notna(), 'new_index'] = df['new_index_p']
df.drop(columns=['id_p','new_index_p'], inplace=True)

# objects on a real page (not in root): add a small step per object, in their current order,
# so that they sort right below the page container and keep their order within the page
in_page = df['xml-parent'].isin(pageids)
position_in_page = df.loc[in_page].groupby('xml-parent').cumcount() + 1
df.loc[in_page, 'new_index'] = df.loc[in_page, 'new_index'] + position_in_page * 0.001

df.set_index('new_index', drop=True, inplace = True)
df.sort_index(inplace=True)
df.set_index('id', drop=True, inplace=True)

In [None]:
# Add an 'end group' row right after the last object of every page.
df.reset_index(drop=False, inplace=True)
# last row of each 'xml-parent' value, restricted to parents that are pages;
# +0.1 puts the new row between that object and the next one
closes_its_parent = ~df.duplicated(subset='xml-parent', keep='last')
parent_is_page = df['xml-parent'].isin(pageids)
index_endgrouprows = df.index[closes_its_parent & parent_is_page]+0.1

# the 'end group' rows only have a type and an id (their fractional position)
df_endgroup = pd.DataFrame(index=index_endgrouprows).assign(odk_type='end group', id=index_endgrouprows)

df = pd.concat([df_endgroup, df]).sort_index()
df.set_index('id', drop=True, inplace=True)

### update df via dag

In [None]:
# Give df the columns of the 'survey' tab of an xls form.
# These columns start out empty; note that an existing 'required' column is blanked as well.
for blank_column in ['repeat_count','appearance','required','required message::en','calculation']:
    df[blank_column] = ''

survey_columns = ['odk_type','name','value','help::en','hint::en','appearance','relevant','constraint',
                  'constraint_message','required','required message::en','calculation','repeat_count','image::en']
xlsform_names = {'odk_type':'type','value':'text','constraint_message':'constraint message::en'}
df = df[survey_columns].rename(columns=xlsform_names)

In [None]:
# Write the node attributes of the graph back into df: DataFrame.update overwrites the cells
# of df that have a matching row label and column name in dfa (non-NA values only).
d = dict(dag.nodes(data=True))  # node id -> attribute dict
dfa = pd.DataFrame.from_dict(d, orient='index')
df.update(dfa)
df = df.rename(columns={'text':'label::en', 'relevant':'relevance'})

### make df look according to xform standard

In [None]:
# short term workaround: select_one / select_multiple get their own name as list_name
m = df['type'].isin(['select_one','select_multiple'])
df.loc[m,'type'] = df.loc[m,'type'] + ' ' + df.loc[m,'name']

# pages become groups that are shown on one screen ('field-list')
df = df.replace({'container_page':'begin group'})
df.loc[df['type']=='begin group','appearance']='field-list'

# 'calculate' rows: the logic moves from 'relevance' to 'calculation', wrapped in number()
# to fit with odk, and 'relevance' is emptied
is_calculate = df['type']=='calculate'
df.loc[is_calculate,'calculation'] = 'number(' + df.loc[is_calculate,'relevance'] + ')'
df.loc[is_calculate,'relevance'] = ''

### make df_choices look according to xform standard

In [None]:
# the 'choices' tab of an xls form has no 'odk_type' column
df_choices = df_choices.drop(columns=['odk_type'])

### make a settings tab

In [None]:
# One-row 'settings' tab; the version is the time of the run, formatted as YYYYMMDDhhmm.
now = datetime.now()
version = f'{now:%Y%m%d%H%M}'
indx=[[1]]

settings = dict(form_title=form_title, form_id=form_id, version=version, default_language='en', style='pages')
df_settings=pd.DataFrame(settings,index=indx)
df_settings.head()

### make a summary

In [None]:
# build the summary with the project module 'summary', from the survey rows, the choices,
# the diagnosis hierarchy and the summary file given in the parameters
# NOTE(review): this import is not in the import cell at the top of the notebook
import summary
df_summary = summary.make_summary(df, df_choices, diagnose_id_hierarchy, summaryfile)

In [None]:
# persist df_summary with IPython's %store so that another notebook can load it with `%store -r`
%store df_summary

### make constraint message

In [None]:
# every select_multiple gets a constraint: 'opt_none' is only valid when it is the only selected option
is_select_multiple = df['type'].str.contains('select_multiple',na=False)
df.loc[is_select_multiple,'constraint']='.=\'opt_none\' or not(selected(.,\'opt_none\'))'
df.loc[is_select_multiple,'constraint message::en']='**None** cannot be selected together with symptoms.'

### load additional rows from external xls form 

In [None]:
# add the rows from the external xls form (drugsfile) to df; done by the project helper loadcalc
df = loadcalc(df, drugsfile, form_id)

### make a countdown timer

In [None]:
'''
From CHT Docs

Countdown Timer: A visual timer widget that starts when tapped/clicked, and has an audible alert when done. 
To use it create a note field with an appearance set to countdown-timer. 
The duration of the timer is the field’s value, which can be set in the XLSForm’s default column. 
If this value is not set, the timer will be set to 60 seconds.

Currently not implemented in TRICC, but hard coded here
'''
# hard-coded rule: every row whose label contains 'START' gets the countdown-timer appearance
# (rows without a label are skipped through na=False)
df.loc[df['label::en'].str.contains('START',na=False),'appearance']='countdown-timer'

### Caretaker advice

In [None]:
# caretaker advice: get the expressions from the drawing and the caretaker-advice file,
# then let the helper update the relevance in df with them (both steps live in caretaker_advice)
d = ca.ca_expressions(df_raw, cafile)
df = ca.update_ca_relevance(df, d)

### add a diagnose message immediately after the diagnose (for MSFeCARE)

In [None]:
if form_id!='almsom':
    # Show the detected diagnose right on detection: a coloured 'note' row is inserted
    # directly below every diagnosis row.

    # read the diagnoses and the corresponding ids
    df_diagnoses = pd.read_csv(diagnose_order)
    diagnoses_dict=dict(zip(df_diagnoses.Name,df_diagnoses.id))

    # positional index from here on; the previous index is kept in the column 'index'
    df.reset_index(inplace=True)
    df.fillna('',inplace=True)
    I = df.loc[df['name'].isin(diagnoses_dict.values())].index

    if len(I)>0:
        # one message per diagnosis row, shown when the diagnosis evaluates to 1;
        # all rows are built first and concatenated once instead of growing df inside the loop
        message_rows = [{'index':df.loc[i,'index']+'_dm','type': 'note',
                         'name':'dm_' + df.loc[i,'name'],
                         'label::en':'Treatment for Diagnose: ' + df.loc[i,'label::en'],
                         'relevance':'number(${'+df.loc[i,'name']+'})=1'} for i in I]
        # position + 0.1 places each message right after its diagnosis once the index is sorted
        d_message = pd.DataFrame(message_rows, index=I+0.1)
        df = pd.concat([d_message, df], ignore_index=False)

    # colorize the dm message
    # NOTE(review): this matches every name that CONTAINS 'dm_', not only the rows added above
    m = df['name'].str.contains('dm_',na=False)
    df.loc[m,'label::en'] = '<span style="color: rgb(68, 28, 28);">' + df.loc[m,'label::en'] + '</span>'

    # sort rows and reset index
    df = df.sort_index()
    df.set_index('index',inplace=True)

### Change appearance of help fields

In [None]:
if form_id!='almsom':
    # change the appearance of help fields with the project helper 'helpfields';
    # skipped for the form 'almsom'
    from helpfields import helpfields
    df = helpfields(df)

### add required = true to all data entry fields

In [None]:
# All data entry fields become mandatory, unless 'required' has been set already.
# Types that take no mandatory input from the user are left out.
not_mandatory_types = ['note','calculate','begin group','end group','text', 'acknowledge', '']
needs_required = ~df['type'].isin(not_mandatory_types) & (df['required']=='')
df.loc[needs_required,'required']='true()'
# but not to contextual parameters
df.loc[df['name']=='data_load','required']=''

### combine multiple instances of a calculate

In [None]:
# Combine multiple instances of a calculate (project helper calcombo).
# Probably no longer necessary: calculates should have been combined by node contraction.
# Only those where node contraction would have created a loop remain, but there are probably none left.
df = calcombo(df, df_raw)

### handle duplicates in df

In [None]:
# names must be unique: every row whose name occurs more than once gets its index appended
shares_name = df.duplicated(subset=['name'],keep=False)
df.loc[shares_name,'name'] = df['name']+df.index

### write xls form

In [None]:
# Write the three tabs of the xls form to ttfile, using XlsxWriter as the engine.
# The context manager saves and closes the workbook when the block is left, also when an
# error occurs, so the generated xlsx file does not remain open (which was a problem on Windows).
# It replaces the explicit writer.save() / writer.close(); ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter(ttfile, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='survey',index=False)
    df_choices.to_excel(writer, sheet_name='choices',index=False)
    df_settings.to_excel(writer, sheet_name='settings',index=False)

In [None]:
# what's left to be done: 
# 1. if TT is to be merged with dx: handle the dataloader -> see if this is not done at the merge script level
# 2. Put data_load into a group

# 4. for somalia: integrate html files
# 5. for msf: contract nodes based on 'name' not on 'text' -> adapt drawing accordingly
# 6. at the very end: handle duplicate names

# 8. for alm som: make the CA advice, and get rid of the 'come back' messages
# 9. Write output into a logfile, rather than into the jupyter notebook

In [None]:
# for almanach somalia only

In [None]:
# make the global flow: write survey, choices and settings as an xlsform into the CHT config folder
# NOTE(review): this cell is not guarded by form_id although the preceding cell says it is for
# almanach somalia only, and the target is an absolute path on one developer's machine.
from formconverters import df2xlsform # makes xlsforms out of dataframes
df2xlsform(df, df_choices, df_settings, '/home/rafael/Documents/git/cht-core/config/ecare/forms/app/almsom.xlsx')

In [None]:
%%bash
# Convert and upload the 'almsom' app form with the cht command line tool.
# The instance URL (which contains the credentials) and the config directory can be overridden
# through the environment variables CHT_URL and CHT_CONFIG_DIR; the defaults are the values
# that were hard-coded in this cell before.
# NOTE(review): the default URL still embeds a password - prefer setting CHT_URL outside the notebook.
CHT_URL="${CHT_URL:-https://medic:password@localhost}"
CHT_CONFIG_DIR="${CHT_CONFIG_DIR:-/home/rafael/Documents/git/cht-core/config/ecare/}"
(cd "$CHT_CONFIG_DIR" && cht --url="$CHT_URL" --accept-self-signed-certs convert-app-forms upload-app-forms -- almsom)