* Read lists of CMG keywords (extracted from 2023.10 manuals)
* Compile sets of keywords:
    * By simulator
    * All keywords
    * Unique per simulator
    * Common to exactly two simulators (one set per pair)
    * Common among all simulators
* Write lines in the VS Code language syntax format.
    * Keywords are grouped by first letter to keep each line in the JSON file short.

In [72]:
# Keyword list file for each simulator, keyed by simulator name.
filenames = {
    'GEM': 'GEM.txt',
    'IMEX': 'IMEX.txt',
    'STARS': 'STARS.txt',
}
# File that receives the generated syntax lines.
output_filename = 'keywords.txt'

In [73]:
# Load each simulator's keyword file (one keyword per line) into a set.
keywords = {}
for simulator, filename in filenames.items():
    with open(filename, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # Skip blank / whitespace-only lines: they would otherwise be stored
        # as an empty-string keyword, inflating the counts and breaking any
        # later step that looks at a keyword's first character.
        keywords[simulator] = {keyword for keyword in stripped_lines if keyword}

In [74]:
# Every unordered pair of distinct simulators, stored as a name-sorted tuple
# so that (A, B) and (B, A) collapse into a single entry.
pairs = {
    tuple(sorted((first, second)))
    for first in filenames
    for second in filenames
    if first != second
}

In [75]:
# Union of the keywords of every simulator.
keywords['ALL'] = set().union(*(keywords[simulator] for simulator in filenames))

# Keywords present in every simulator.
keywords['COMMON'] = keywords['ALL'].intersection(
    *(keywords[simulator] for simulator in filenames)
)

# Keywords shared by both simulators of a pair but not by all simulators.
# `pairs` is a set, so it is iterated in sorted order to keep the order of
# these entries (and of everything derived from `keywords`) the same on
# every run.
for first, second in sorted(pairs):
    keywords[f'Only {first}+{second}'] = (
        keywords[first] & keywords[second]
    ) - keywords['COMMON']

# Keywords that appear in exactly one simulator. `difference` always returns
# a new set, so the result never aliases the simulator's own keyword set.
for simulator in filenames:
    other_sets = (keywords[other] for other in filenames if other != simulator)
    keywords[f'Only {simulator}'] = keywords[simulator].difference(*other_sets)

# Summary: number of keywords in each group.
for simulator, keyword_set in keywords.items():
    print(f'{simulator}\t{len(keyword_set)} keywords')


GEM	1247 keywords
IMEX	1050 keywords
STARS	1276 keywords
ALL	1952 keywords
COMMON	704 keywords
Only GEM+STARS	95 keywords
Only GEM+IMEX	95 keywords
Only IMEX+STARS	23 keywords
Only GEM	353 keywords
Only IMEX	228 keywords
Only STARS	454 keywords


In [76]:
def subsets_by_first_char(input_set):
    """Group strings by their first character.

    Args:
        input_set: Iterable of strings to group.

    Returns:
        A dict mapping each first character to the set of items that start
        with it. Keys appear in the order their first item is encountered.
        Empty strings have no first character and are skipped.
    """
    subsets = {}
    for item in input_set:
        if not item:
            # An empty string would raise IndexError on item[0].
            continue
        subsets.setdefault(item[0], set()).add(item)
    return subsets

In [77]:
# Write one section per keyword group: a header line with the group name,
# then one name/match entry per first letter, then a blank separator line.
with open(output_filename, 'w') as out:
    for group_name, group_keywords in keywords.items():
        out.write(f"'{group_name}'\n")
        by_first_char = subsets_by_first_char(sorted(group_keywords))
        for letter_keywords in by_first_char.values():
            # Alternation of all keywords sharing this first character.
            alternation = '|'.join(str(keyword) for keyword in sorted(letter_keywords))
            out.write(
                '\t\t\t\t},{\n'
                '\t\t\t\t\t"name": "entity.name.function",\n'
                f'\t\t\t\t\t"match": "^\\\\s*\\\\*?({alternation})(?=(\\\\s|$))"\n'
            )
        out.write('\n')