# Build Keyword Lists

## Description

* Read CMG keywords
    * Extract from 2022.10 manuals  
    * Search in the \Data\Index_Chunk0.js files (0 to 9)
        * Regex:

        `'([^']+)':\{l:\[\{u:'([^']+)',t:'([^']+')`
        
        * First group: keyword.
            * Add to keyword list.
        * Second group: htm file with full description.
            * Strip the '#' and everything after it, then read the file.
            * Search all keywords and options inside file:

            `(<td>|<dt>|<tr>|<p>|\| |, |; )\(?<span class="keyword">[^*]*\*([^<]+)</span>`
        
            * Ignore any keyword that is found in the short description, add all others as options.
        * Third group: short description, with all keywords described in the htm file
            * Extract all associated keywords to be used in options search.
        
            `\*((\w|-)+)`

* Compile sets with keywords and options.
* Write lines in the VS Code language syntax.
    * Keywords are divided by first letter to reduce the length of each line in the JSON file.

## Requirements

In [6]:
from pathlib import Path
import re

In [85]:
# The first line of common_folder.txt names the folder that is later joined
# with each simulator name to locate its index files.
common_folder = Path(
    Path(r'../../../common_folder.txt')
    .read_text(encoding='utf-8')
    .partition('\n')[0]
    .strip()
)
# Simulator names; each is used as a sub-folder name and as a key in the
# result dictionaries.
simulators = {'GEM', 'IMEX', 'STARS'}
# Destination files for the generated keyword and option lines.
output_keywords = 'keywords.txt'
output_options = 'options.txt'

## Common Functions

In [63]:
def write_set_to_text_file(file_path, input_set):
    """Write every item of ``input_set`` to a UTF-8 text file, one per line.

    Args:
        file_path: Destination path (string or ``Path``).
        input_set: Iterable of items; each one is converted with ``str``.
            Items are written in iteration order, without a trailing newline.

    Returns:
        None. Any error raised while building or writing the content is
        reported with ``print`` instead of being propagated.
    """
    target = Path(file_path)
    try:
        lines = [str(item) for item in input_set]
        target.write_text('\n'.join(lines), encoding='utf-8')
    except Exception as error:
        # Best effort: report the problem and carry on.
        print(f"An error occurred while writing to '{target}': {error}")

def read_text_file(file_path):
    """Return the whole content of a UTF-8 text file.

    Args:
        file_path: Path of the file to read (string or ``Path``).

    Returns:
        The file content as a string, or ``None`` when the file does not
        exist (a message is printed in that case).
    """
    source = Path(file_path)
    try:
        return source.read_text(encoding='utf-8')
    except FileNotFoundError:
        print(f"File '{source}' not found.")
    return None
    
def read_file_into_set(file_path):
    """Read a text file and return its non-empty lines as a set.

    Args:
        file_path: Path of the file to read (string or ``Path``).

    Returns:
        set: One entry per distinct non-empty line (split on ``'\\n'``).
        An empty set is returned when ``read_text_file`` reports the file
        as missing (it returns ``None``) or when the file has no content.
    """
    content = read_text_file(file_path)
    if content is None:
        return set()
    # Always return a set (the original returned a list here) and skip empty
    # lines, so an empty file or a trailing newline does not produce a ''
    # entry.
    return {line for line in content.split('\n') if line}

## Parse Files

In [100]:
def find_index_files(folder_path):
    """List every ``Index_Chunk*.js`` file found anywhere below a folder.

    Args:
        folder_path: ``Path`` of the folder to search recursively.

    Returns:
        list: Paths of the matching regular files, in the order produced by
        ``rglob``; directories with a matching name are left out.
    """
    return [
        candidate
        for candidate in folder_path.rglob('Index_Chunk*.js')
        if candidate.is_file()
    ]

def parse_description_file(file_path, keywords):
    """Collect the option names mentioned in a keyword description file.

    The file is searched for ``<span class="keyword">`` elements that follow
    one of ``<td>``, ``<dt>``, ``<tr>``, ``<p>``, ``| ``, ``, `` or ``; ``
    (optionally with an opening parenthesis in between); the text after the
    ``*`` inside the span is the candidate name.

    Args:
        file_path: Path of the htm file to scan.
        keywords: Names already known as keywords for this entry; candidates
            found in this collection are not reported.

    Returns:
        set: Candidate names that are neither in ``keywords`` nor made only
        of digits. An empty set is returned when the file cannot be read.
    """
    content = read_text_file(file_path)
    if content is None:
        # read_text_file returns None for a missing file; without this guard
        # re.findall raises TypeError and aborts the whole index parse.
        return set()

    matches = re.findall(
        r'(<td>|<dt>|<tr>|<p>|\| |, |; )\(?<span class="keyword">[^*]*\*([^<]+)</span>',
        content,
    )
    # Each match is (prefix, name); only the name is of interest.
    return {
        name
        for _prefix, name in matches
        if name not in keywords and not name.isdigit()
    }

def parse_index_file(file_path):
    """Parse one ``Index_Chunk*.js`` file into a keyword dictionary.

    Every ``'<keyword>':{l:[{u:'<link>',t:'<title>'`` entry in the file
    produces one item in the result.

    Args:
        file_path: ``Path`` of the index file.

    Returns:
        dict: Maps each keyword to a dict with the keys
        ``'file'`` (the link with ``#`` and everything after it removed),
        ``'description'`` (the title text before the first ``' *'``),
        ``'keywords'`` (list of names that follow a ``*`` in the title) and
        ``'options'`` (set returned by ``parse_description_file``).
        An empty dict is returned when the index file cannot be read.
    """
    content = read_text_file(file_path)
    if content is None:
        # read_text_file returns None for a missing file; nothing to parse.
        return {}

    # The closing quote of the title sits outside the third group, so the
    # captured description no longer ends with a stray "'".
    entries = re.findall(r"'([^']+)':\{l:\[\{u:'([^']+)',t:'([^']+)'", content)

    # The link is appended as text to the folder two levels above the index
    # file. NOTE(review): presumably the link starts with a path separator;
    # confirm against the index files.
    manual_root = str(file_path.parent.parent)

    data = {}
    for keyword, link, title in entries:
        file = link.split('#')[0]
        description = title.split(' *')[0]
        keywords = re.findall(r"\*([\w-]+)", title)
        htm_file = Path(manual_root + file)
        options = parse_description_file(htm_file, keywords)
        data[keyword] = {
            'file': file,
            'description': description,
            'keywords': keywords,
            'options': options,
        }
    return data

def read_index(folder):
    """Parse every index file below ``folder`` and merge the results.

    Progress is reported with ``print``: the folder being read, the number
    of index files found and the number of keywords collected.

    Args:
        folder: ``Path`` of the folder searched by ``find_index_files``.

    Returns:
        dict: Union of the dictionaries returned by ``parse_index_file``;
        when the same keyword appears in several files, the entry from the
        file processed last is kept.
    """
    print(f'Reading {folder}')
    chunk_files = find_index_files(folder)
    print(f'  Found {len(chunk_files)} index files')

    merged = {}
    for parsed_chunk in map(parse_index_file, chunk_files):
        merged.update(parsed_chunk)

    print(f'  {len(merged)} keywords found.')
    return merged

In [101]:
# Parsed index of each simulator, keyed by simulator name.
data = {
    simulator: read_index(common_folder / simulator)
    for simulator in simulators
}

Reading C:\Users\tiago.LENOVO-I7\Unicamp\Documentos\05.Livros\Manuais\CMG\GEM
  Found 10 index files
  1223 keywords found.
Reading C:\Users\tiago.LENOVO-I7\Unicamp\Documentos\05.Livros\Manuais\CMG\IMEX
  Found 7 index files
  1038 keywords found.
Reading C:\Users\tiago.LENOVO-I7\Unicamp\Documentos\05.Livros\Manuais\CMG\STARS
  Found 9 index files
  1253 keywords found.


## Create Lists

In [102]:
# Keyword names per simulator, plus their union under 'ALL'.
keywords = {'ALL': set()}
for simulator in simulators:
    keywords[simulator] = set(data[simulator])
    keywords['ALL'] |= keywords[simulator]

for simulator, keyword_set in keywords.items():
    print(f'{simulator}\t{len(keyword_set)} keywords')

print('')

# Option names per simulator (merged over all of its keywords), plus their
# union under 'ALL'.
options = {'ALL': set()}
for simulator in simulators:
    options[simulator] = set().union(
        *(entry['options'] for entry in data[simulator].values())
    )
    options['ALL'] |= options[simulator]

for simulator, options_set in options.items():
    print(f'{simulator}\t{len(options_set)} options')

ALL	1935 keywords
GEM	1223 keywords
IMEX	1038 keywords
STARS	1253 keywords

ALL	2108 options
GEM	1227 options
IMEX	1248 options
STARS	1065 options


## Write files

In [113]:
def subsets_by_first_char(input_set):
    """Group items by their first character.

    Args:
        input_set: Iterable of non-empty strings.

    Returns:
        dict: Maps each first character to the set of items starting with
        it. Keys appear in the order in which each character is first met.
    """
    grouped = {}
    for entry in input_set:
        grouped.setdefault(entry[0], set()).add(entry)
    return grouped

def output_set(values, file_path):
    """Write the name sets as ``"match"`` pattern entries to a text file.

    For every key of ``values`` a header line with the quoted key is
    written, followed by one ``},{`` / ``"name"`` / ``"match"`` entry per
    first character of the names, and a blank line.

    Args:
        values: Dict mapping a label (simulator name or ``'ALL'``) to an
            iterable of names.
        file_path: Destination file; it is overwritten.
    """
    # Explicit UTF-8, like every other read/write in this notebook, so the
    # output does not depend on the platform's default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        for simulator, keyword_set in values.items():
            file.write(f"'{simulator}'\n")
            # Sorting first makes the groups come out ordered by first
            # character.
            for sub_keyword_set in subsets_by_first_char(sorted(keyword_set)).values():
                file.write('\t\t\t\t},{\n')
                file.write('\t\t\t\t\t"name": "entity.name.function",\n')
                # '-/+' becomes the alternation '(-|+)'; after that every
                # '+' is prefixed with two backslashes in the written text.
                set_str = (
                    '|'.join(map(str, sorted(sub_keyword_set)))
                    .replace('-/+', '(-|+)')
                    .replace('+', '\\\\+')
                )
                file.write(f'\t\t\t\t\t"match": "^\\\\s*\\\\*?({set_str})(?=(\\\\s|$))"\n')
            file.write('\n')

In [114]:
# Write the keyword patterns first, then the option patterns.
for collection, destination in (
    (keywords, output_keywords),
    (options, output_options),
):
    output_set(collection, destination)