# Build Keyword Lists

## Description

* Read CMG keywords
    * Extract from 2022.10 manuals  
    * Search in the `\Data\Index_Chunk<N>.js` files (`<N>` ranges from 0 to 9)
        * Regex:

        `'([^']+)':\{l:\[\{u:'([^']+)',t:'([^']+')`
        
        * First group: keyword.
            * Add to keyword list.
        * Second group: htm file with full description.
            * Ignore anything including and after '#' and read file.
            * Search all keywords and options inside DEFINITIONS section in the file:

            `<span class="keyword">[^*]*\*([^<]*\w)</span>`
        
            * Ignore any keyword that is found in the short description, add all others as options.
        * Third group: short description, with all keywords described in the htm file
            * Extract all associated keywords to be used in options search.
        
            `\*((\w|-)+)`

* Compile sets with keywords and options.
* Write lines in the VS Code language syntax format.
    * Keywords are divided by first letter to reduce the size of each line in the JSON file.

## Requirements

In [42]:
from pathlib import Path
import re
import json

In [43]:
# The first line of the shared settings file holds the base folder; each
# simulator's sub-folder below it is scanned for index files later on.
with open(r'../../../common_folder.txt', mode='r', encoding='utf-8') as settings_file:
    common_folder = Path(settings_file.readline().strip())

# Simulator sub-folders to scan.
simulators = {'GEM', 'IMEX', 'STARS'}

# Destination files for the generated keyword / option match lines.
output_keywords = 'keywords.txt'
output_options = 'options.txt'

## Common Functions

In [44]:
def write_set_to_text_file(file_path, input_set):
    """Write every item of *input_set* to *file_path*, one item per line.

    Items are converted with ``str`` and joined with newlines (no trailing
    newline). The file is written as UTF-8. Any exception raised while
    building or writing the content is reported with ``print`` instead of
    being propagated.
    """
    target = Path(file_path)
    try:
        lines = [str(item) for item in input_set]
        target.write_text('\n'.join(lines), encoding='utf-8')
    except Exception as error:
        # Best effort: report the problem and carry on.
        print(f"An error occurred while writing to '{target}': {error}")

def read_text_file(file_path):
    """Return the UTF-8 text of *file_path*, or ``None`` if it does not exist.

    A missing file is reported with ``print``; other errors propagate.
    """
    source = Path(file_path)
    try:
        return source.read_text(encoding='utf-8')
    except FileNotFoundError:
        print(f"File '{source}' not found.")
    return None
    
def read_file_into_set(file_path):
    """Read *file_path* and return its lines as a set of strings.

    The content is split on ``'\\n'``. A missing file (``read_text_file``
    returns ``None``) or an empty file yields an empty set, so the result is
    always a ``set``; previously a list was returned on success and a set
    only on failure.
    """
    content = read_text_file(file_path)
    if not content:
        # Covers both the missing-file case (None) and an empty file (''),
        # which would otherwise produce a set holding one empty string.
        return set()
    return set(content.split('\n'))

## Parse Files

In [52]:
def find_index_files(folder_path):
    """Return every regular file below *folder_path* named ``Index_Chunk*.js``.

    The search is recursive (``rglob``); matching directories are skipped.
    """
    return [
        candidate
        for candidate in folder_path.rglob('Index_Chunk*.js')
        if candidate.is_file()
    ]

def parse_additional_options(content):
    """Collect extra option names from the tables found in *content*.

    Only ``<table>`` elements matched by the table pattern are inspected;
    that pattern requires a ``>condition</span>`` marker. Within each such
    table two kinds of names are gathered:

    * the leading upper-case token (letters, digits, ``-``, ``_``) of a
      ``<td>`` cell, optionally wrapped in ``<p>`` and optionally followed by
      one more word;
    * the ``name`` attribute of ``<a name="...">`` anchors.

    Returns a set of strings (empty when nothing matches).
    """
    table_pattern = r'<table[^>]*>(?:(?!<\/table>)[\s\S])*?>(?=.*>condition<\/span>)(?:(?!<\/table>)[\s\S])*<\/table>'
    cell_pattern = r"<td>(\s|\n)*(<p>)*([-_A-Z0-9]+)(\s+'?\w+'?)?(</p>)*(\s|\n)*</td>"
    anchor_pattern = r'<a name="(\w+)">[^<]*</a>'

    found = set()
    for table_html in re.findall(table_pattern, content):
        # The third capture group of the cell pattern holds the token itself.
        cell_names = (groups[2].strip() for groups in re.findall(cell_pattern, table_html))
        found.update(name for name in cell_names if name != '')

        anchor_names = (name.strip() for name in re.findall(anchor_pattern, table_html))
        found.update(name for name in anchor_names if name != '')

    return found

def parse_description_file(file_path, keywords):
    """Extract the long description and the options from a description file.

    Parameters:
        file_path: path of the htm file to read.
        keywords: names already known as keywords for this entry; matches
            found in the DEFINITIONS section that appear here are not
            reported as options.

    Returns:
        A ``(long_description, options)`` tuple. ``long_description`` is the
        text following the PURPOSE heading with tags removed and whitespace
        collapsed (``''`` when absent). ``options`` is a set of option names
        found in the DEFINITIONS section, including those reported by
        ``parse_additional_options``. When the file cannot be read,
        ``('', set())`` is returned.
    """
    content = read_text_file(file_path)
    if content is None:
        # read_text_file has already reported the missing file; without this
        # guard re.findall would raise TypeError on None.
        return '', set()

    long_description = ''
    purpose = re.findall(r'>PURPOSE:?</(h3|p)>(.*?)(?=<(h3)>|$)', content, re.DOTALL)
    if purpose:
        # Second group is the section body: drop tags, then normalise spacing.
        text = re.sub(r'<[^>]*>', '', purpose[0][1])
        text = re.sub(r'\n', '', text).strip()
        long_description = re.sub(r'\s+', ' ', text)

    options = set()
    definitions = re.findall(r'>DEFINITIONS:</h3>(.*?)(?=<h3>|$)', content, re.DOTALL)
    if definitions:
        section = definitions[0]
        matches = re.findall(r'<span class="keyword">[^*]*\*([^<]*\w)</span>', section)
        # Keep names that are neither known keywords nor purely numeric.
        options = {
            match for match in matches
            if match not in keywords and not match.isdigit()
        }
        options = options.union(parse_additional_options(section))

    return long_description, options

def parse_index_file(file_path):
    """Parse one index file into a dict keyed by keyword.

    Every entry matched in the index file yields three groups: the keyword,
    the target htm file (possibly with a ``#fragment``), and the short
    description text. For each entry the referenced htm file is parsed with
    ``parse_description_file``.

    Returns:
        ``{keyword: {'file', 'description', 'long description', 'keywords',
        'options'}}``. An empty dict is returned when the index file cannot
        be read.
    """
    content = read_text_file(file_path)
    if content is None:
        # read_text_file has already reported the missing file; without this
        # guard re.findall would raise TypeError on None.
        return {}

    matches = re.findall(r"'([^']+)':\{l:\[\{u:'([^']+)',t:'([^']+')", content)

    data = {}
    for keyword, target, short_text in matches:
        # Drop the '#fragment' part to get the htm file itself.
        file = target.split('#')[0]
        # Text before the first ' *' is the description proper.
        # NOTE(review): the third group includes the closing quote, so a
        # short text without any ' *' keeps a trailing "'" - confirm intent.
        description = short_text.split(' *')[0]
        # Names written as '*NAME' in the short text.
        keywords = [x[0] for x in re.findall(r"\*((\w|-)+)", short_text)]
        # Plain string concatenation: presumably `file` starts with its own
        # path separator - verify against the index files.
        htm_file = Path(str(file_path.parent.parent) + file)
        long_description, options = parse_description_file(htm_file, keywords)
        data[keyword] = {'file': file,
                         'description': description,
                         'long description': long_description,
                         'keywords': keywords,
                         'options': options}
    return data

def read_index(folder):
    """Parse every index file below *folder* and merge the results.

    Progress is reported with ``print``. Entries from index files processed
    later overwrite earlier entries with the same keyword.

    Returns the merged ``{keyword: entry}`` dict.
    """
    print(f'Reading {folder.name}')
    index_files = find_index_files(folder)
    print(f'  Found {len(index_files)} index files')

    merged = {}
    for path in index_files:
        for keyword, entry in parse_index_file(path).items():
            merged[keyword] = entry

    print(f'  {len(merged)} keywords found.')
    return merged

In [53]:
# Parsed index entries per simulator: {simulator: {keyword: entry}}.
data = {}
for simulator in simulators:
    data[simulator] = read_index(common_folder / simulator)

Reading STARS
  Found 9 index files
  1253 keywords found.
Reading GEM
  Found 10 index files
  1223 keywords found.
Reading IMEX
  Found 7 index files
  1038 keywords found.


### Output to a Json file

In [54]:
def convert_sets_to_lists(data):
    """Return *data* with every set replaced by a sorted list.

    Dicts are rebuilt recursively (values only); sets become sorted lists;
    any other value, including lists, is returned unchanged.
    """
    if isinstance(data, set):
        return sorted(data)
    if isinstance(data, dict):
        converted = {}
        for key, value in data.items():
            converted[key] = convert_sets_to_lists(value)
        return converted
    return data

In [55]:
# json cannot serialise sets, so convert them to sorted lists first.
data_with_lists = convert_sets_to_lists(data)

with open('data.json', 'w') as json_file:
    json.dump(data_with_lists, json_file, indent=4)

## Create Lists

In [56]:
# Keyword names per simulator, plus their union under 'ALL'.
keywords = {'ALL': set()}
for simulator in simulators:
    keywords[simulator] = set(data[simulator])
    keywords['ALL'] |= keywords[simulator]

for simulator, keyword_set in keywords.items():
    print(f'{simulator}\t{len(keyword_set)} keywords')

print('')

# Option names per simulator, plus their union under 'ALL'.
options = {'ALL': set()}
for simulator in simulators:
    simulator_options = set()
    for entry in data[simulator].values():
        simulator_options.update(entry['options'])
    options[simulator] = simulator_options
    options['ALL'].update(simulator_options)

for simulator, options_set in options.items():
    print(f'{simulator}\t{len(options_set)} options')

ALL	1935 keywords
STARS	1253 keywords
GEM	1223 keywords
IMEX	1038 keywords

ALL	2071 options
STARS	1064 options
GEM	1316 options
IMEX	1268 options


## Write files

In [57]:
def subsets_by_first_char(input_set):
    """Group the items of *input_set* by their first character.

    Returns a dict mapping each first character to the set of items that
    start with it; keys appear in the order they are first encountered.
    """
    grouped = {}
    for entry in input_set:
        grouped.setdefault(entry[0], set()).add(entry)
    return grouped

def output_set(values, file_path, type_name, is_keyword):
    """Write name/match line pairs for every group in *values* to *file_path*.

    Parameters:
        values: mapping of group name (e.g. a simulator) to a collection of
            names.
        file_path: destination text file (overwritten).
        type_name: value written into each ``"name"`` line.
        is_keyword: when true the match pattern is anchored at the start of
            the line; otherwise it only requires surrounding whitespace or a
            line boundary.

    Groups are written in sorted order, each preceded by a quoted header
    line. Within a group the names are split by first character and each
    subset becomes one alternation.
    """
    with open(file_path, 'w') as file:
        for simulator in sorted(values):
            file.write(f"'{simulator}'\n")
            by_first_char = subsets_by_first_char(sorted(values[simulator]))
            for sub_keyword_set in by_first_char.values():
                file.write('\t\t\t\t},{\n')
                file.write(f'\t\t\t\t\t"name": "{type_name}",\n')
                # NOTE(review): only '+' is escaped here; names containing
                # other regex metacharacters would be written unescaped.
                alternatives = [str(item) for item in sorted(sub_keyword_set)]
                set_str = '|'.join(alternatives).replace('+','\\\\+')
                if is_keyword:
                    match_line = f'\t\t\t\t\t"match": "^\\\\s*\\\\*?({set_str})(?=(\\\\s|$))"\n'
                else:
                    match_line = f'\t\t\t\t\t"match": "(?<=(\\\\s|^))\\\\*?({set_str})(?=(\\\\s|$))"\n'
                file.write(match_line)
            file.write('\n')

In [58]:
# Keywords get start-of-line anchored patterns; options do not.
output_set(values=keywords, file_path=output_keywords,
           type_name="entity.name.function", is_keyword=True)
output_set(values=options, file_path=output_options,
           type_name="entity.name.type", is_keyword=False)