In [2]:
import pandas as pd
from tqdm import tqdm

def process_dataframe(df, output_file):
    # Open the output file in write mode and create a file object to write to.
    with open(output_file, 'w', encoding='utf-8') as f:
        # Write the YAML header to the output file.
        f.write('name: jenfixer-en\n')
        f.write('parent: default\n')
        f.write('matches:\n')
        
        # Iterate over each row in the input DataFrame.
        for i, row in df.iterrows():
            # Extract values from the row and remove leading/trailing whitespace.
            trigger = row[0].strip()
            replace = row[1].strip()
            word = str(row[2]).lower()
            propagate_case = str(row[3]).lower()
            test_val = row[4]
            
            # Write the appropriate YAML output to the output file based on the value of test_val.
            if test_val:
                f.write(f"- trigger: \"{trigger}\"\n")
                f.write(f"  replace: \"{replace}\"\n")
                f.write(f"  word: {word}\n")
                f.write(f"  propagate_case: {propagate_case}\n")
            else:
                f.write(f"- triggers: {trigger}\n")
                f.write(f"  replace: \"{replace}\"\n")
                f.write(f"  word: {word}\n")
                f.write(f"  propagate_case: {propagate_case}\n")

def assert_triggers(df):
    """
    This function takes a pandas dataframe as input and returns the same dataframe after modifying the 'trigger' column. 
    It ensures that each entry in the 'trigger' column is a list of strings, with duplicates removed.
    """
    
    for i in df[df[df.columns[4]] == 0].index:
        # Select the current trigger value
        trigger = df.at[i, 'trigger']
        
        # Initialize the list with square brackets
        line = "[]"
        
        # Remove any single quotes
        trigger = trigger.replace("'", "")
        
        # Remove square brackets
        trigger = trigger.replace('[', '')
        trigger = trigger.replace(']', '')
        
        # Split the string on commas
        trigger = trigger.split(',')
        
        # Loop through each trigger string
        for k in range(len(trigger)):
            j = trigger[k]
            
            # Remove any white space
            j = j.strip()
            
            # Remove double quotes
            j = j.replace('"','')
            
            # Add the trigger string to the list
            if k == 0:
                line = line.replace(']','')
                line += f'"{j}"]'
            else:
                # Check if the trigger string is already in the list
                if j in line:
                    # Skip the duplicate trigger string
                    pass
                else:
                    line = line.replace(']','')
                    line += f', "{j}"]'
        
        # Update the trigger value in the dataframe
        df.at[i, 'trigger'] = line
        
    # Return the modified dataframe
    return df

def yml_to_df(path):
    # Read the contents of the file at the given path and split it into lines
    data = open(path).read().split('\n')

    # Create an empty DataFrame with columns 'trigger', 'replace', 'propagate_case', 'word', and 'trigger=1'
    df = pd.DataFrame(columns=['trigger','replace', 'propagate_case', 'word', 'trigger=1'])

    # Loop through each line in the data
    for i in tqdm(range(len(data))):
        # If the line starts with '- triggers'
        if '- triggers' in data[i]:
            # Extract the 'trigger' value from the line
            trigger = data[i].replace('- triggers: ', '')
            # Extract the 'replace' value from the next line
            replace = data[i + 1].replace('  replace: ', '').replace('"','')
            # Extract the 'word' value from the line after that
            word = data[i + 2].replace('  word: ','')
            # Extract the 'propagate_case' value from the line after that
            prop = data[i + 3].replace('  propagate_case: ','')
            # Set the value of 'more' to 0
            more = 0

            # Combine the extracted values into a list
            line = [trigger, replace, prop, word, more]
            # Append the list as a new row to the DataFrame
            df.loc[len(df)] = line 
        # If the line starts with '- trigger'
        elif '- trigger' in data[i]:
            # Extract the 'trigger' value from the line
            trigger = data[i].replace('- trigger: ', '').replace('"','')
            # Extract the 'replace' value from the next line
            replace = data[i + 1].replace('  replace: ', '').replace('"','')
            # Extract the 'word' value from the line after that
            word = data[i + 2].replace('  word: ','')
            # Extract the 'propagate_case' value from the line after that
            prop = data[i + 3].replace('  propagate_case: ','')
            # Set the value of 'more' to 1
            more = 1

            # Combine the extracted values into a list
            line = [trigger, replace, prop, word, more]
            # Append the list as a new row to the DataFrame
            df.loc[len(df)] = line 
        
    # Return the resulting DataFrame
    return df


In [3]:
# path = r'C:\Users\jbay\AppData\Roaming\espanso\match\packages\typofixer-en\package.yml'
# df = yml_to_df(path)

path = r'C:\Users\jbay\AppData\Roaming\espanso\match\packages\jenfixer-en\package.yml'
df = yml_to_df(path)

100%|██████████| 3688/3688 [00:01<00:00, 2347.46it/s]


In [6]:
df

Unnamed: 0,trigger,replace,propagate_case,word,trigger=1
0,'indentical','identical',true,true,1
1,'indepedence','independence',true,true,1
2,'independendet','independent',true,true,1
3,'indespensable','indispensable',true,true,1
4,'indespensible','indispensable',true,true,1
...,...,...,...,...,...
919,'ytou','you',true,true,1
920,mial,mail,true,true,1
921,trhough,through,true,true,1
922,aswell,as well,true,true,1


In [14]:
for t in df['replace']:
    if "'" in t:
        df.loc[df['replace'] == t, "replace"] = t.replace("'", "")

t

'technical'

In [15]:
df

Unnamed: 0,trigger,replace,propagate_case,word,trigger=1
0,indentical,identical,true,true,1
1,indepedence,independence,true,true,1
2,independendet,independent,true,true,1
3,indespensable,indispensable,true,true,1
4,indespensible,indispensable,true,true,1
...,...,...,...,...,...
919,ytou,you,true,true,1
920,mial,mail,true,true,1
921,trhough,through,true,true,1
922,aswell,as well,true,true,1


In [16]:
# purpose is to assert that no trigger replace pair exists twice
# for i in range(len(new_df)):
#     word    = new_df.at[i, 'replace'].replace("'",'')
#     trigger = new_df.at[i, 'trigger'].replace("'",'')
#     if word in df['replace'].values and trigger in df['trigger'].values:
#         print(word)
#         #new_df.drop(i, axis=0, inplace=True)

In [18]:
# Importing the new words data from the CSV file
new_words = pd.read_csv('to_add.csv', delimiter=";")

# Loop through all unique trigger words in the new words data that have 'english' as the language
for word in new_words[new_words[new_words.columns[2]] == 'english'].trigger.unique():
    # Check if the current trigger word is already in the main dataframe
    if word in df['trigger'].values:
        # If it is, move on to the next trigger word
        pass
    else:
        # If it is not, add a new row to the main dataframe with the trigger word, its corresponding replace word, 
        # 'true' for propagate_case, 'true' for word, and 1 for trigger=1
        df.loc[len(df)] = [word,
                           new_words[new_words[new_words.columns[0]] == word]['replace'].values[0],
                           'true',
                           'true',
                           1]
        

df.sort_values('replace', inplace=True)

# Call the process_dataframe function with the main dataframe and the path as arguments
process_dataframe(df, path)


In [19]:
df

Unnamed: 0,trigger,replace,propagate_case,word,trigger=1
207,ninties,1990s,true,true,1
65,Januray,January,true,true,1
172,Montnana,Montana,true,true,1
175,Morisette,Morissette,true,true,1
194,Naploeon,Napoleon,true,true,1
...,...,...,...,...,...
916,yeras,years,true,true,1
914,yeilding,yielding,true,true,1
919,ytou,you,true,true,1
918,youseff,yousef,true,true,1


In [162]:
for u in df[(df['trigger=1'] > 1) & (~df['trigger'].str.contains('\['))]['replace'].unique():
    line = '['

    for each in df[df['replace']==u].trigger.unique():
        line +=f'"{each}", '
        

    line += ']'

    line = line.replace(', ]', ']')
    
    iterator = 0

    for i in df[df['replace'] == u].index.unique():
        if iterator == 0:
            df.at[i, 'trigger'] = line
            df.at[i, 'trigger=1'] = 0
        else:
            df.drop(i,
                    axis = 0,
                    inplace=True)
            
        iterator += 1

    df[df['replace'] == u]


In [165]:
current_triggers = []

for i in df.trigger:
    if '[' in i:
        test = i.replace('[', '')
        test = test.replace(']', '')
        test = test.replace('"','')
        for word in test.split(','):
            current_triggers.append(word)
    else:
        current_triggers.append(i)

new_words = {"orchistration": "orchestration",
             "sepnt": "spent",
             "htis": "this",
             }

new_df = pd.read_csv(r'C:\Users\jbay\AppData\Roaming\espanso\match\scripts\to_add.csv', delimiter=";")
for each in new_df[new_df.language == 'english'].index:
    trigger = new_df.at[each, 'trigger']
    print(trigger)
    if trigger in new_words.keys():
        pass
        #print(trigger)
    else:
        new_words[trigger] = new_df.at[each, 'replace']
        
new_words

htis
eachother
Thnak
undersootd
any thing
Chiciago
aswell
ahold
amonut
arent
bteween
abandonned
aberation
abilityes
abilties
abilty
abondon
abbout
abotu
abouta
aboutit
aboutthe
abscence
abondoned
abondoning
abondons
aborigene
accesories
accidant
abortificant
abreviate
abreviated
abreviation
abritrary
absail
absailing
absense
absolutly
absorbsion
absorbtion
abudance
abundacies
abundancies
abundunt
abutts
acadamy
acadmic
accademic
accademy
acccused
accelleration
acceptence
acceptible
accessable
acident
accidentaly
accidently
acclimitization
accomadate
accomadated
accomadates
accomadating
accomadation
accomadations
accomdate
accomodate
accomodated
accomodates
accomodating
accomodation
accomodations
accompanyed
accordeon
accordian
accoring
accoustic
accquainted
accrediation
accredidation
accross
accussed
acedemic
acheive
acheived
acheivement
acheivements
acheives
acheiving
acheivment
acheivments
achievment
achievments
achivement
achivements
acknowldeged
acknowledgeing
acommodate
acomplish


{'orchistration': 'orchestration',
 'sepnt': 'spent',
 'htis': 'this',
 'eachother': 'each other',
 'Thnak': 'thank',
 'undersootd': 'understood',
 'any thing': 'anything',
 'Chiciago': 'Chicago',
 'aswell': 'as well',
 'ahold': 'a hold',
 'amonut': 'amount',
 'arent': "aren't",
 'bteween': 'between',
 'abandonned': 'abandoned',
 'aberation': 'aberration',
 'abilityes': 'abilities',
 'abilties': 'abilities',
 'abilty': 'ability',
 'abondon': 'abandon',
 'abbout': 'about',
 'abotu': 'about',
 'abouta': 'about a',
 'aboutit': 'about it',
 'aboutthe': 'about the',
 'abscence': 'absence',
 'abondoned': 'abandoned',
 'abondoning': 'abandoning',
 'abondons': 'abandons',
 'aborigene': 'aborigine',
 'accesories': 'accessories',
 'accidant': 'accident',
 'abortificant': 'abortifacient',
 'abreviate': 'abbreviate',
 'abreviated': 'abbreviated',
 'abreviation': 'abbreviation',
 'abritrary': 'arbitrary',
 'absail': 'abseil',
 'absailing': 'abseiling',
 'absense': 'absence',
 'absolutly': 'absolute

In [166]:
for each in new_words.keys():
    if each in current_triggers:
        pass
        #print(f'avoided {each}')
    else:
        if new_words[each] in df['replace'].unique():
            old_index = df[df['replace'] == new_words[each]].index.max()

            old_trigger = df.at[old_index, 'trigger']
            old_replace = df.at[old_index, 'replace']
            old_trigger,old_replace

            if '[' in old_trigger:
                old_trigger = old_trigger.replace(']', '')
                old_trigger += f', "{each}"]'
                new_trigger = old_trigger
            else:
                new_trigger = f'["{old_trigger}","{each}"]'
            
            df.at[old_index, 'trigger=1'] = 0
            df.at[old_index, 'trigger'] = new_trigger
        else:
            df.loc[len(df)] = [each, new_words[each], 'true', 'true', 1]
        
df.sort_values('replace', inplace=True, key=lambda x: x.str.lower() if x.dtype == 'object' else x)
df = df.reset_index()
df.drop(['index'], axis=1, inplace=True)
df = assert_triggers(df)
## updating the file
process_dataframe(df, "package.yml")
df

Unnamed: 0,trigger,replace,propagate_case,word,trigger=1
0,ahold,a hold,true,true,1
1,alot,a lot,true,true,1
2,avengence,a vengeance,true,true,1
3,"[""abondon"", ""adbandon""]",abandon,true,true,0
4,"[""abondoned"", ""abandonned""]",abandoned,true,true,0
...,...,...,...,...,...
5124,"[""rwite"", ""wriet""]",write,true,true,0
5125,writter,writer,true,true,1
5126,writed,writhed,true,true,1
5127,Sionist,Zionist,true,true,1
