In [40]:
import pandas as pd
import numpy as np
from utils import data
import openai
import os
from absl import app, flags, logging
import time
import warnings
warnings.filterwarnings('ignore')
language = 'itl'
openai.api_key = os.getenv("OPENAI_API_KEY_2")

train_path = f"data/{language}/{language}.train"
dev_path = f"data/{language}/{language}.dev"
test_path = f"data/{language}/{language}.test"
tags_path = "data/sigmorphon_tags.csv"

In [41]:
tags = pd.read_csv(tags_path)
# tags = tags.drop_duplicates()
dictionary = tags.set_index('Label').T.to_dict('list')

In [42]:
data=pd.read_csv(train_path, sep='\t', header=None, 
                 names=["input", "output", "tags"])
data.loc[data['tags'] == 'N']


tags_dictionary_train = {k: list(map(tuple, g[['input','output']].values)) for k, g in data.groupby('tags')}

In [43]:
with open("prompts/inflection_train_base.txt") as handle:
    base_text = handle.read()
with open("prompts/inflection_test_base.txt") as handle:
    test_text = handle.read()

In [44]:
test_data=pd.read_csv(test_path, sep='\t', header=None, 
                 names=["input", "output", "tags"])
tags_dictionary_test = {k: list(map(tuple, g[['input','output']].values)) for k, g in test_data.groupby('tags')}

In [45]:
all_tags = []
for i in tags_dictionary_train:
    all_tags.append(i)
for i in tags_dictionary_test:
    all_tags.append(i)
all_tags = list(set(all_tags))
len(all_tags)

300

In [46]:
unique_tags = list(set("".join(all_tags).split(";")))
len(unique_tags)

131

In [47]:
unique_tags[:6]

['PRS', 'FUTN', '1V.MSDR', 'NO3S+BE3S', 'LOCV', 'NO3S+BE2P']

In [48]:
tags_dictionary_train["ADJ;FOC;SG"]

[('kɬәŋqsxi', 'kɬәŋqsxiˀin'), ('kčixlqɬ', 'kčixlqɬˀin')]

In [49]:
#GROUPED_TAGS:
GROUPED_TAGS = data.groupby("tags")

In [50]:
def get_human_readable(text):
    tags_datum = text.split(';')
    human = []
    for i in tags_datum:
        try:
            human.append(dictionary[i.lower()][0])
        except Exception:
            human.append(i)
    return ";".join(human)
get_human_readable('ADJ;LOC;SG')

'Adjective;LOC;Singular'

In [51]:
import random
def get_prompt(tag, train, test):
    prompt = []
    try:
        train_list = train[tag]
    except Exception:
        return 'cant index'
    
    if len(train_list) >= 10:
        train_list = random.sample(train_list, 10)
        for index, row in enumerate(train_list):
            
            prompt.append(base_text.format(language=language, inp=''.join(list(row[0])),
                              tags=get_human_readable(tag),
                              output=row[1]))
            
    if len(train_list) < 10:
        train_list = random.sample(train_list, len(train_list))
        for index, row in enumerate(train_list):
            prompt.append(base_text.format(language=language, inp=''.join(list(row[0])),
                              tags=get_human_readable(tag),
                              output=row[1]))
    
        train_list = GROUPED_TAGS.sample(n=1)[:10-len(train_list)]
        train_list = train_list.values.tolist()
        for index, row in enumerate(train_list):
            prompt.append(base_text.format(language=language, inp=''.join(list(row[0])),
                              tags=get_human_readable(row[2]),
                              output=row[1]))
    
        
    prompt.append(test_text.format(language=language, inp=''.join(list(test[0])), tags=get_human_readable(tag)))
    
    return "\n".join(prompt)

In [52]:
cant_index = 0
results = []
results_text = []
for index, tag in enumerate(tags_dictionary_test):
    for i in range(len(tags_dictionary_test[tag])):
        prompt = get_prompt(tag, tags_dictionary_train, tags_dictionary_test[tag][i])
        print(prompt)
        if prompt == "cant index":
            cant_index+=1
            continue
        try:
            response = openai.Completion.create(
                                engine="code-davinci-002",
                                prompt=prompt,
                                temperature=0.7,
                                max_tokens=100,
                                top_p=1,
                                frequency_penalty=0,
                                presence_penalty=0,
                                stop=["Q"]
                            )
            gold = tags_dictionary_test[tag][i][1]

            current_outputs = response["choices"][0]["text"]
            print(f"This is the current output: {current_outputs}\nThis is the gold: {gold}")
            print('\n')
            
            if gold == current_outputs:
                results.append(1)
            else:
                results.append(0)
            results_text.append((current_outputs, gold))
                
        except Exception as e:
            time.sleep(60)
            response = openai.Completion.create(
                                engine="code-davinci-002",
                                prompt=prompt,
                                temperature=0.5,
                                max_tokens=100,
                                top_p=1,
                                frequency_penalty=0,
                                presence_penalty=0,
                                stop=["Q"]
                            )
            gold = tags_dictionary_test[tag][i][1]

            current_outputs = response["choices"][0]["text"]
            print(f"This is the current output: {current_outputs}\nThis is the gold: {gold}")
            print('\n')
            
            if gold == current_outputs:
                results.append(1)
            else:
                results.append(0)
            
            results_text.append((current_outputs, gold))

        
        
print(f'cant index number: {cant_index}')

cant index
cant index
Q: Inflect the itl word 'kɬčl' with the morphological tags of Adjective;Plural
A: kɬčleˀn
Q: Inflect the itl word 'čʼevezlaχ' with the morphological tags of Adjective;Plural
A: čʼevezlaχaˀn
Q: Inflect the itl word 'ktve' with the morphological tags of Adjective;Plural
A: ktvelaχeˀn
Q: Inflect the itl word 'čʼačʼ' with the morphological tags of Adjective;Plural
A: čʼačʼalaχaˀn
Q: Inflect the itl word 'iˀɬuq' with the morphological tags of Adjective;Plural
A: iˀɬuqeˀn
Q: Inflect the itl word 'ekʼni' with the morphological tags of Adjective;Plural
A: ekʼniˀn
Q: Inflect the itl word 'kčixlqɬ' with the morphological tags of Adjective;Focus;Singular
A: kčixlqɬˀin
Q: Inflect the itl word 'čʼevezlaχ' with the morphological tags of Adjective;Plural
A: čʼevezlaχaˀn
Q: Inflect the itl word 'ul’u' with the morphological tags of Adjective;Singular
A: ul’ul’aχ
Q: Inflect the itl word 'iχɬ' with the morphological tags of Noun
A: iχɬčˀin
Q: Inflect the itl word 'kstʼaŋa' with the

In [54]:
results_tuple = []
results_ = []
for i in results_text:
    results_tuple.append((i[0].strip(), i[1]))
    if i[0].strip() == i[1]:
        results_.append(1)
    else:
        results_.append(0)


In [55]:
results_ = np.array(results_)
results_.mean()

0.32802547770700635