In [59]:
import pandas as pd
import numpy as np
from utils import data
import openai
import os
from absl import app, flags, logging
import time
import warnings
from collections import Counter
from statistics import mean

warnings.filterwarnings('ignore')
language = 'itl'
openai.api_key = os.getenv("OPENAI_API_KEY_2")

train_path = f"data/{language}/{language}.train"
dev_path = f"data/{language}/{language}.dev"
test_path = f"data/{language}/{language}.test"
tags_path = "data/sigmorphon_tags.csv"

In [60]:
tags = pd.read_csv(tags_path)
# tags = tags.drop_duplicates()
dictionary = tags.set_index('Label').T.to_dict('list')

In [61]:
data=pd.read_csv(train_path, sep='\t', header=None, 
                 names=["input", "output", "tags"])


TAGS_DICTIONARY_TRAIN = {k: list(map(tuple, g[['input','output']].values)) for k, g in data.groupby('tags')}
TAGS_DICTIONARY_TRAIN['ADJ;FOC;SG']


[('kɬәŋqsxi', 'kɬәŋqsxiˀin'), ('kčixlqɬ', 'kčixlqɬˀin')]

In [62]:
with open("prompts/inflection_train_base.txt") as handle:
    base_text = handle.read()
with open("prompts/inflection_test_base.txt") as handle:
    test_text = handle.read()

In [63]:
test_data=pd.read_csv(test_path, sep='\t', header=None, 
                 names=["input", "output", "tags"])
test_data.head()

Unnamed: 0,input,output,tags
0,alvale,kʼalvaleknaˀn,V.CVB;NFIN
1,alvolt,tʼalvoltaɬsxen,V;NO1S+BE2P;FIN;IND;FUT
2,aŋaqe,aŋaqe,N;NOM;SG
3,aŋja,aŋjataznen,V;NO3S+AB3S;ITER;FIN;IND;PRS
4,aŋɬo,kʼaŋɬosx,V;NO2P+AB3S;FIN;IMP


In [67]:
TEMPS = [0.1, 0.2, 0.3, 0.4, 0.5]


In [68]:
all_tags = []
for i in TAGS_DICTIONARY_TRAIN:
    all_tags.append(i)
    
all_tags = list(set(all_tags))
len(all_tags)

259

In [69]:
unique_tags = list(set("".join(all_tags).split(";")))
len(unique_tags)

113

In [70]:
unique_tags[:6]

['2', 'CONDV', 'INS', 'OPTV', 'ABL', 'NO3S+BE2P']

In [71]:
TAGS_DICTIONARY_TRAIN["ADJ;FOC;SG"]

[('kɬәŋqsxi', 'kɬәŋqsxiˀin'), ('kčixlqɬ', 'kčixlqɬˀin')]

In [72]:
#GROUPED_TAGS:
GROUPED_TAGS = data.groupby("tags")

In [73]:
def get_human_readable(text):
    tags_datum = text.split(';')
    human = []
    for i in tags_datum:
        try:
            human.append(dictionary[i.lower()][0])
        except Exception:
            human.append(i)
    return ";".join(human)
get_human_readable('ADJ;LOC;SG')

'Adjective;LOC;Singular'

In [74]:
import random
def get_prompt(test_row, shuffled: bool, randomize: bool):
    prompt = []
    try:
        train_list = get_samples_with_same_tags(test_row['tags'])
    except Exception:
        train_list = get_samples_with_random_tags()
    
    if shuffled:
        random.shuffle(train_list)
    if randomize:
        train_list = get_samples_with_random_tags()
    

    for index, row in enumerate(train_list):
        try:
            tags = row[2]
        except Exception:
            tags = test_row['tags']
        prompt.append(base_text.format(language=language, inp=''.join(list(row[0])),
                              tags=get_human_readable(tags),
                              output=row[1]))
    
    
    prompt.append(test_text.format(language=language, inp=''.join(list(test_row['input'])), tags=get_human_readable(test_row['tags'])))

    
    return "\n".join(prompt)

In [84]:
# TEST_LOOP
f = open("itl_language.txt", "a")
f.write(f"Each prompt will be tested on these temps: {TEMPS}\n")
OUTPUTS = []
for index, row in test_data[:100].iterrows():
    
    potential_outputs = []
    tag = row['tags']
    print(tag)
    input_ = row['input']
    gold_output = row['output']
    prompt = get_prompt(row, False, False)
    print(prompt)
    print('\n\n')
    
    f.write(f"\nThis is the normal_prompt\n:{prompt}")
    #TEMPS
    for temp in TEMPS:
        try: 
            potential_outputs.append(get_open_ai(temp, prompt))
        
        except Exception:
            time.sleep(60)
            potential_outputs.append(get_open_ai(temp, prompt))
    
    
    # Shuffle
    prompt = get_prompt(row, True, False)
    f.write(f"This is the shuffled prompt\n:{prompt}")
    print(f'this is the suffled prompt: {prompt}')
    print('\n\n')
    for temp in TEMPS:
        try: 
            potential_outputs.append(get_open_ai(temp, prompt))
        
        except Exception:
            time.sleep(60)
            potential_outputs.append(get_open_ai(temp, prompt))
    
    
    #Try Completely Random prompts:
    prompt_random = get_prompt(row, False, True)
    f.write(f'this is the random prompt\n: {prompt_random}')
    
    print(f'this is the random prompt\n: {prompt_random}')
    print('\n\n')
    for temp in TEMPS:
        try: 
            potential_outputs.append(get_open_ai(temp, prompt_random))
        
        except Exception:
            time.sleep(60)
            potential_outputs.append(get_open_ai(temp, prompt))
    
    predicted = get_vote_result(potential_outputs)
    print(f'This is the Gold: {gold_output}\nThis is the predicted: {predicted}\nThese are all the potential outputs: {potential_outputs}')
    f.write(f'This is the Gold: {gold_output}\nThis is the predicted: {predicted}\nThese are all the potential outputs: {potential_outputs}')
    OUTPUTS.append((predicted, gold_output))
f.close()

V.CVB;NFIN
Q: Inflect the itl word 'eɬčku' with the morphological tags of Converb;Nonfinite
A: keɬčkuˀin
Q: Inflect the itl word 'әnta' with the morphological tags of Converb;Nonfinite
A: kәntaknen
Q: Inflect the itl word 'fi' with the morphological tags of Converb;Nonfinite
A: kfiknen
Q: Inflect the itl word 'ɬŋemnele' with the morphological tags of Converb;Nonfinite
A: kɬŋemneleˀan
Q: Inflect the itl word 'stil' with the morphological tags of Converb;Nonfinite
A: kstilˀin
Q: Inflect the itl word 'ənkmama' with the morphological tags of Converb;Nonfinite
A: kənkmamaknan
Q: Inflect the itl word 'fčet' with the morphological tags of Converb;Nonfinite
A: fčetkas
Q: Inflect the itl word 'kʼol' with the morphological tags of Converb;Nonfinite
A: kkʼolknen
Q: Inflect the itl word 'seŋ' with the morphological tags of Converb;Nonfinite
A: seŋkas
Q: Inflect the itl word 'čŋi' with the morphological tags of Converb;Nonfinite
A: kčŋiˀin
Q: Inflect the itl word 'alvale' with the morphological tag

In [87]:
#(predicted, gold)
results = []
for i in OUTPUTS:
    if i[0] == i[1]:
        results.append(1)
    else:
        results.append(0)
mean(results)

0.35

In [90]:
OUTPUTS

[('kalvalean', 'kʼalvaleknaˀn'),
 ('alvoltaɬsxen', 'tʼalvoltaɬsxen'),
 ('aŋaqe', 'aŋaqe'),
 ('aŋjateznen', 'aŋjataznen'),
 ('qʼaŋɬosx', 'kʼaŋɬosx'),
 ('katalin', 'atalka'),
 ('kaxtqzuknen', 'kʼaxtqzoˀan'),
 ('azzank', 'azzank'),
 ('babuskank', 'babuskank'),
 ('kәnčajeˀin', 'čajkit'),
 ('čajnikta', 'čajnikta'),
 ('čʼanzoˀan', 'čʼanzol'),
 ('tčankzoqzokičen', 'tčankzoqzon'),
 ('čaqaˀɬqzaxen', 'čaqaˀɬqsuˀin'),
 ('čaqolenk', 'čaqolenk'),
 ('xčavaxč', 'qčavaqzoxčeˀn'),
 ('mčʼekičen', 'mčʼekičen'),
 ('nčʼeqas', 'mәnčʼekas'),
 ('qčʼexč', 'qčʼexč'),
 ('kčeˀɬqzuknen', 'kčeˀɬqsuˀin'),
 ('čʼelen', 'čʼelisčiŋnen'),
 ('čʼelqzuznen', 'čʼelqzuznen'),
 ('kčʼeˀɬqzuknen', 'kčʼeˀɬqzuˀin'),
 ('kčʼemxlaɬčen', 'čʼemxlaqzoveˀn'),
 ('čʼet', 'čʼet'),
 ('čiskipen', 'čiskipnen'),
 ('kčilknen', 'kčilknen'),
 ('әnčʼiɬinen', 'enčʼiɬivnen'),
 ('kčʼiɬiknen', 'kčʼiɬiknen'),
 ('čʼiɬilatesč', 'øčʼiɬilatesč'),
 ('čiˀŋiɬknen', 'čiˀŋiɬknen'),
 ('čistotka', 'čistotka'),
 ('tčkixen', 'tčkivaxen'),
 ('čʼmilktezneˀn', 'čʼmilka

In [82]:
def get_human_readable(text):
    tags_datum = text.split(';')
    human = []
    for i in tags_datum:
        try:
            human.append(dictionary[i.lower()][0])
        except Exception:
            human.append(i)
    return ";".join(human)
print(f'Converted ADJ;LOC;SG to: {get_human_readable("ADJ;LOC;SG")}')


def get_samples_with_same_tags(tags: str) -> list[tuple]:
    same_tags = TAGS_DICTIONARY_TRAIN[tags]
    if len(same_tags) > 10:
        return random.sample(same_tags, k=10)
    else:
        list_of_samples = []
        l = data.sample(n=10-len(same_tags)).values.tolist()
        for i in l:
            list_of_samples.append(i)
        h = random.sample(same_tags, k=len(same_tags))
        for i in h:
            list_of_samples.append(i)
        return list_of_samples

def get_samples_with_random_tags() -> str:
    list_of_samples = []
    l = data.sample(n=10).values.tolist()
    for i in l:
        list_of_samples.append(i)
    return list_of_samples

def get_vote_result(results: list[str]) -> str:
    data = Counter(results)
    return max(results, key=data.get)

def get_open_ai(temp: float, prompt: str) -> str:
    response = openai.Completion.create(
                                engine="code-davinci-002",
                                prompt=prompt,
                                temperature=temp,
                                max_tokens=100,
                                top_p=1,
                                frequency_penalty=0,
                                presence_penalty=0,
                                stop=["Q"])
    return response["choices"][0]["text"].strip()

Converted ADJ;LOC;SG to: Adjective;LOC;Singular
