In [None]:
!pip install openai

In [282]:
import os
import re

import openai
from tqdm import tqdm

## Read

In [166]:
# Load the whole raw word file into memory as a single UTF-8 decoded string.
with open('../files/words_data.txt', 'r', encoding='utf-8') as f:
    words = f.read()

In [223]:
# One entry per line of the raw text; blank lines become empty strings here.
words_list = words.split('\n')

## Cleaning

In [305]:
# Take the emphasis mark straight from the data: the character at index 5 of
# the second line, echoed below for a visual check.
# NOTE(review): this depends on that exact position in the input file holding
# the mark — confirm whenever the source file changes.
emphasis_symbol = words_list[1][5]
words_list[1][5]

'́'

In [306]:
def process_record(word_record):
    """Normalise one raw dictionary line.

    Removes the emphasis mark stored in the module-level ``emphasis_symbol``,
    drops square brackets, applies the literal substitutions listed below and
    finally trims leading/trailing whitespace.

    Args:
        word_record: a single raw line as a string.

    Returns:
        The cleaned string.
    """
    # Applied strictly in this order: each pair works on the previous result.
    substitutions = (
        (emphasis_symbol, ''),
        ('[', ''),
        (']', ''),
        ('í', 'і'),
        ('á', 'а'),
        ('é', 'е'),
        ('ý', 'у'),
        ('ó', 'о'),
        ('й/і', 'ї'),
        ("'/а", '/я'),
        ("'/у", '/ю'),
        ("й/а", 'я'),
        ("й/у", 'ю'),
    )
    cleaned = word_record
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()
    

In [307]:
# Clean every line longer than one character; empty and single-character
# lines are dropped.
clean_words_list = [process_record(word) for word in words_list if len(word) > 1]

In [308]:
# Persist the cleaned records, one per line.
with open("../files/polyga_clean_word_data.txt", 'w', encoding='utf-8') as f:
    f.write('\n'.join(clean_words_list))

## Parsing word forms

In [310]:
# Captures, non-greedily, the text between "(~" and the next ")".
word_forms_pattern = r'\(~(.*?)\)'

In [311]:
def extract_word_forms(word_record):
    """Split a cleaned record into its head word and its listed forms.

    Every match of the module-level ``word_forms_pattern`` is broken on
    ``', ~'`` and the pieces are collected into one flat list. The head word
    is the text before the first space.

    Args:
        word_record: a cleaned dictionary line.

    Returns:
        A ``(word, forms)`` tuple where ``forms`` is a list of strings
        (empty when the record has no matching group).
    """
    groups = re.findall(word_forms_pattern, word_record)
    forms = [piece for group in groups for piece in group.split(', ~')]
    head_word = word_record.split(' ')[0]
    return (head_word, forms)

In [312]:
# Pair every cleaned record with the forms extracted from it.
words_with_forms = [extract_word_forms(word_record) for word_record in clean_words_list]

In [313]:
# Preview the first ten (word, forms) pairs.
words_with_forms[:10]

[('абажур/н/ий', []),
 ('абажур/чик', []),
 ('абат/ис/а', []),
 ('абат/ств/о', []),
 ('абетк/а', ['тц/і', 'ток']),
 ('абетк/ов/ий', []),
 ('абіссин/ськ/ий', []),
 ('абіурієнт/к/а', ['т/ц/і', 'т/ок']),
 ('аблакт/ува/ти', []),
 ('аблакц/і/он/ізм', [])]

#### Script

In [314]:
def find_last_occurence(text, target_symbol):
    """Return the index of the last item of ``text`` equal to ``target_symbol``.

    Args:
        text: an indexable sequence (a string in this notebook).
        target_symbol: the item to look for.

    Returns:
        The zero-based index of the last match, or -1 when nothing matches.
    """
    for indx in range(len(text) - 1, -1, -1):
        if text[indx] == target_symbol:
            return indx
    return -1


def explode_word_forms(word_with_forms):
    """Build full word forms from a ``(word, forms)`` pair.

    Each form is spliced onto the word at the last position where the form's
    first character occurs in the word: everything before that position is
    kept and the form is appended. Constructed entries carry the
    ``'\\t ---------'`` suffix; the base word is returned first without it.

    Args:
        word_with_forms: a ``(word, forms)`` tuple, ``forms`` being a list of
            strings.

    Returns:
        A list starting with ``word`` followed by one constructed entry per
        non-empty form.
    """
    word, forms = word_with_forms
    constructed_words = [word]
    for form in forms:
        if not form:
            # An empty form has no first character to anchor on; indexing it
            # would raise IndexError, so there is nothing to construct.
            continue
        word_last_indx = find_last_occurence(word, form[0])
        # NOTE(review): when the anchor character is absent the index is -1,
        # so the slice below drops only the last character of the word before
        # appending the form — confirm this is the intended fallback.
        constructed_words.append(word[:word_last_indx] + form + '\t ---------')
    return constructed_words

In [315]:
# Spot-check a single entry (index 4) and display its word and forms.
word, forms = words_with_forms[4]
word, forms

('абетк/а', ['тц/і', 'ток'])

In [316]:
# Flatten the per-record results of explode_word_forms into one list.
all_exploded_words = []

for processed_word in words_with_forms:
    exploded_words = explode_word_forms(processed_word)
    all_exploded_words.extend(exploded_words)

In [317]:
# Total number of entries: base words plus every constructed form.
len(all_exploded_words)

45108

In [None]:
# Writes clean_words_list to the cleaned-data file, one record per line.
# NOTE(review): this repeats the earlier save of clean_words_list to the same
# path, while all_exploded_words built in this section is never written —
# confirm whether the exploded list (and a different file) was intended.
with open("../files/polyga_clean_word_data.txt", 'w', encoding='utf-8') as f:
    f.write('\n'.join(clean_words_list))

### Parsing with LLM

In [329]:
# Read the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be typed into (or saved with) the notebook. The placeholder is
# kept only as a fallback when the variable is not set.
api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY')

client = openai.OpenAI(api_key=api_key)

In [330]:
# Request template; the `{input}` placeholder is filled with str.format.
# NOTE(review): "intup" in the text looks like a typo for "input"; it is left
# untouched because the string is sent to the model verbatim.
prompt = "I need to encode words forms from intup and get all forms as a list. \n I have an input of a word with proposed additional forms. Here is an example: 'абон/ент/к/а (т/ц/і, т/ок)'. I need to convert such input to a list of corresponding words, in this case 'абон/ент/к/а, абон/ент/ц/і, абон/ент/ок'. I need only words separated by comma as output. Now encode this for me - {input}"

In [331]:
def send_prompt_to_gpt(prompt, model="gpt-3.5-turbo", temperature=0.1):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client`` to create a chat completion capped at
    500 tokens.

    Args:
        prompt: the full text of the user message.
        model: name of the chat model to query.
        temperature: sampling temperature passed to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
        temperature=temperature,
        max_tokens=500,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [333]:
# Collect one output line per record: the bare word when it has no forms,
# otherwise the model's reply for "word (form, form, ...)".
all_words = []

for word, forms in tqdm(words_with_forms):
    if not forms:
        # Nothing to expand, so no API call is needed.
        all_words.append(word)
        continue
    formatted_input = f"{word} ({', '.join(forms)})"
    response = send_prompt_to_gpt(prompt.format(input=formatted_input))
    all_words.append(response)

In [337]:
# Save the collected entries (plain words and model replies), one per line.
with open("../files/polyga_clean_word_data_ai.txt", 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_words))