### Setting **OPENAI** keys

In [None]:
import os

import openai
from openai import OpenAI

# Read the key from the environment instead of storing it in the notebook,
# so it cannot be committed or shared together with the file.
API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    # Fail here rather than at the first API call further down.
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

client = OpenAI(api_key=API_KEY)

In [None]:
from src.task.sofc import SOFCData
from src.task.matscholar import MatScholarData
from transformers import AutoTokenizer

# Dataset to augment: one of "sofc_slot", "sofc", "matscholar".
mode = "matscholar"

if mode == 'sofc_slot':
    data = SOFCData(is_slot=True)
elif mode == 'sofc':
    data = SOFCData(is_slot=False)
elif mode == 'matscholar':
    data = MatScholarData()
else:
    # Without this branch an unknown mode leaves `data` undefined (or stale
    # from an earlier run) and the failure only shows up in a later cell.
    raise ValueError(
        f"Unknown mode {mode!r}; expected 'sofc_slot', 'sofc' or 'matscholar'."
    )

# Number of training sentences available for augmentation.
len(data.train['tokens'])

4406

In [None]:
from random import randint

import random

# Fixed seed so that re-running the notebook masks the same positions.
MASK_SEED = 42
_mask_rng = random.Random(MASK_SEED)

# For every training sentence, replace one token with '[MASK]' and remember
# which position was masked. Entries stay aligned with data.train by index.
augmented = {'tokens': [], 'index': []}
for sentence, labels in zip(data.train['tokens'], data.train['labels']):
    if len(sentence) == 0:
        raise ValueError("Cannot mask a token in an empty sentence.")

    positions = range(len(sentence))
    # Label id 0 is treated as "not an entity"; prefer masking an entity token
    # whenever the sentence contains one, otherwise mask any token.
    entity_positions = [pos for pos in positions if labels[pos] != 0]
    i = _mask_rng.choice(entity_positions if entity_positions else positions)

    newsent = sentence[:i] + ['[MASK]'] + sentence[i+1:]

    augmented['tokens'].append(newsent)
    augmented['index'].append(i)


In `label_decode`, we attach a short description to each entity label.
The prompt needs this description to tell the model what kind of entity the masked word is.

In [None]:
# Per-dataset (label suffix, description) pairs. Matching uses str.endswith and
# stops at the first hit, so order matters: "open_circuit_voltage" must be
# tried before "voltage".
_LABEL_DESCRIPTIONS = {
    'sofc_slot': (
        ('anode_material', "anode in fuel cell."),
        ('cathode_material', "cathode in fuel cell."),
        ('conductivity', "value or unit for electrical conductivity."),
        ('current_density', "value or unit for electrical current density."),
        ('degradation_rate', "value or unit for degradation rate in fuel cell."),
        ('device', "fuel cell itself."),
        ('electrolyte_material', "electrolyte in fuel cell."),
        ('fuel_used', "fuel or oxidant species."),
        ('interlayer_material', "film between layer in fuel cell."),
        ('open_circuit_voltage', "value or unit for OCV(open circuit voltage)."),
        ('power_density', "value of electrical power density."),
        ('resistance', "value or unit for electrical resistance."),
        ('support_material', "electrocatalyst support in fuel cell."),
        # Fixed typo: was "value for unit for thickness."
        ('thickness', "value or unit for thickness."),
        ('time_of_operation', "value or unit for duration of reaction."),
        ('voltage', "value or unit for voltage."),
        ('working_temperature', "value or unit for temperature in reaction."),
    ),
    'sofc': (
        ('DEVICE', "fuel cell itself."),
        # Fixed copy-paste error: this used to return "cathode in fuel cell.".
        ('EXPERIMENT', "word indicating an experiment."),
        ('MATERIAL', "materials."),
        ('VALUE', "value or unit."),
    ),
    'matscholar': (
        ('APL', "applications."),
        ('CMT', "characterization methods."),
        ('DSC', "descriptors."),
        ('MAT', "materials."),
        ('PRO', "properties."),
        ('SMT', "synthesis methods."),
        ('SPL', "symmetry phase labels."),
    ),
}

if mode not in _LABEL_DESCRIPTIONS:
    raise ValueError(
        f"Unknown mode {mode!r}; expected one of {sorted(_LABEL_DESCRIPTIONS)}."
    )

# Bound when this cell runs, so the decoder keeps describing the dataset that
# was selected here even if `mode` is reassigned later in the notebook.
_suffix_descriptions = _LABEL_DESCRIPTIONS[mode]


def label_decode(label):
    """Return a short description of the entity type that `label` names.

    Args:
        label: a tag string such as 'O' or a prefixed tag like 'B-MAT'; only
            the suffix is used to pick the description.

    Returns:
        The description string for the dataset selected by `mode`, or None
        when the label matches no known suffix.
    """
    if label == 'O':
        return "general word(non-entity)."
    for suffix, description in _suffix_descriptions:
        if label.endswith(suffix):
            return description
    return None



In [None]:
# Map each label id to the description that is inserted into the prompt.
id2label = {i: label_decode(l) for l, i in data.label2id.items()}

# Build one query per masked sentence. `augmented` and `data.train` are aligned
# by position, so they can be walked together.
queries = []
for masked_tokens, masked_idx, source_tokens, source_labels in zip(
    augmented['tokens'],
    augmented['index'],
    data.train['tokens'],
    data.train['labels'],
):
    original_tok = source_tokens[masked_idx]
    connected_label = source_labels[masked_idx]

    item = {}
    # User message: the sentence with the '[MASK]' placeholder.
    item['line'] = " ".join(masked_tokens)
    # System message: the task, the expected entity type and the output format.
    item['help'] = (
        "Fill the [MASK] word from given sentence with top-6 prediction.\n"
        + f"The prediction should be a kind of {id2label[connected_label]}\n"
        + "Do not say anything like  \"Here is the extracted\", just output the json of the format {\"prediction\": list of word prediction}"
    )
    item['tokens'] = masked_tokens
    item['labels'] = source_labels
    item['added_idx'] = masked_idx
    item['original'] = original_tok
    queries.append(item)

In [None]:
def get_completion(prompt, system: str = None, model="gpt-3.5-turbo", temperature=1):
    """Send one chat request through the module-level `client` and return the raw response.

    Args:
        prompt: content of the user message.
        system: optional content of the system message; skipped when empty or None.
        model: name of the chat model to query.
        temperature: sampling temperature forwarded to the API.

    Returns:
        The response object returned by `client.chat.completions.create`.
    """
    messages = []
    if system:
        # The system message goes first so that it frames the user message.
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response

In [None]:
import json
import re



# Replies are sampled (get_completion defaults to temperature=1), so a reply
# that cannot be parsed is worth asking for again a few times.
MAX_ATTEMPTS = 3


def _parse_prediction(text):
    """Extract the prediction list from a raw model reply.

    Only the outermost `{...}` span is parsed, so code fences or prose around
    the JSON object are ignored and words inside the predictions are left
    untouched. Both the "prediction" and the "predictions" key are accepted.

    Raises:
        ValueError: no JSON object is present or it cannot be decoded.
        KeyError: the object has neither of the expected keys.
    """
    flattened = text.replace("\n", "")
    found = re.search(r"\{.*\}", flattened)
    if found is None:
        raise ValueError(f"No JSON object found in model reply: {text!r}")
    payload = json.loads(found.group(0))
    if "prediction" in payload:
        return payload["prediction"]
    return payload["predictions"]


for i, item in enumerate(queries):
    if 'prediction' in item:
        # Already answered in an earlier (interrupted) run of this cell.
        continue

    last_error = None
    for _ in range(MAX_ATTEMPTS):
        response = get_completion(item['line'], item['help'])
        txt = response.choices[0].message.content or ""
        try:
            item['prediction'] = _parse_prediction(txt)
            break
        except (ValueError, KeyError, TypeError) as err:
            last_error = err
    else:
        raise RuntimeError(
            f"No parsable prediction for query {i} after {MAX_ATTEMPTS} attempts."
        ) from last_error


In [None]:
import os

output_path = "matsch_GPT35_base/augmented.json"
# Create the output folder first: a missing folder would otherwise make the
# write fail after all the API calls have already been made.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    json.dump(queries, f)

### Convert the augmented queries into train.txt

In [None]:
import json
from src.task.sofc import SOFCData
from src.task.matscholar import MatScholarData

# Dataset whose label vocabulary is used to decode label ids in this section.
mode = 'sofc'

if mode == 'sofc_slot':
    label2id = SOFCData(is_slot=True).label2id
elif mode == 'sofc':
    label2id = SOFCData(is_slot=False).label2id
elif mode == 'matscholar':
    label2id = MatScholarData().label2id
else:
    # Without this branch an unknown mode leaves `label2id` undefined (or
    # stale from an earlier run).
    raise ValueError(
        f"Unknown mode {mode!r}; expected 'sofc_slot', 'sofc' or 'matscholar'."
    )

# Invert the vocabulary: label id -> label name.
id2label = {v: k for k, v in label2id.items()}

# NOTE(review): with mode = 'sofc' this reads from the "Slot_GPT35_base"
# folder, while the cell that writes generated.txt uses "./Sofc_GPT35_base" --
# confirm that the folder matches the selected mode.
with open("Slot_GPT35_base/augmented.json", 'r') as f:
    queries = json.load(f)

In [None]:
# Sanity check: number of loaded queries and number of distinct label ids.
len(queries), len(id2label)

(611, 9)

In [None]:
# Display the label id -> label name mapping built above.
id2label

{0: 'O',
 1: 'B-DEVICE',
 2: 'I-DEVICE',
 3: 'B-EXPERIMENT',
 4: 'I-EXPERIMENT',
 5: 'B-MATERIAL',
 6: 'I-MATERIAL',
 7: 'B-VALUE',
 8: 'I-VALUE'}

In [None]:
import random
rounds = 2            # augmented sentences written per query (at most)
MAX_CANDIDATES = 5    # only the first few usable predictions are considered
SAMPLE_SEED = 42      # fixed seed so that a re-run writes the same file
_sample_rng = random.Random(SAMPLE_SEED)


def _candidate_replacements(predictions, original, limit):
    """Return up to `limit` usable replacement strings, in prediction order.

    Numbers are converted to text (the model may emit JSON numbers for value
    entities). Anything that is not text or a number, anything that is empty,
    and anything equal to the original token is dropped.
    """
    if isinstance(predictions, str):
        # A bare string would otherwise be iterated character by character.
        predictions = [predictions]
    usable = []
    for pred in predictions:
        if isinstance(pred, bool) or not isinstance(pred, (str, int, float)):
            continue
        text = str(pred)
        if text == original or not text.split():
            continue
        usable.append(text)
    return usable[:limit]


def _substitute(tokens, labels, added_idx, replacement, id2label):
    """Replace the token at `added_idx` with the words of `replacement`.

    Returns the new token list and the matching list of label names.
    """
    new_words = replacement.split()
    base_label = labels[added_idx]
    # Every new word inherits the label of the replaced token. When that id is
    # odd, the words after the first get id + 1 instead; this relies on the id
    # layout in which each odd B- id is followed by its I- id (as in the
    # id2label mapping displayed above for mode 'sofc').
    new_label_ids = [
        base_label + 1 if pos and base_label % 2 else base_label
        for pos in range(len(new_words))
    ]
    tokens_new = tokens[:added_idx] + new_words + tokens[added_idx+1:]
    labels_new = labels[:added_idx] + new_label_ids + labels[added_idx+1:]
    return tokens_new, [id2label[label_id] for label_id in labels_new]


with open("./Sofc_GPT35_base/generated.txt", 'w') as f:

    for item in queries:
        tokens = item['tokens']
        labels = item['labels']
        assert len(tokens) == len(labels)

        candidates = _candidate_replacements(
            item['prediction'], item['original'], MAX_CANDIDATES
        )
        chosen = _sample_rng.sample(candidates, min(len(candidates), rounds))

        for replacement in chosen:
            tokens_new, labels_new = _substitute(
                tokens, labels, item['added_idx'], replacement, id2label
            )

            # One "token label" pair per line, blank line after each sentence.
            for tok, lab in zip(tokens_new, labels_new):
                f.write(f"{tok} {lab}\n")
            f.write("\n")


In [None]:
def _read_text(path):
    """Return the full text content of the file at `path`."""
    with open(path, 'r') as handle:
        return handle.read()


# Read both inputs before opening the output, so a missing input cannot leave
# a truncated train.txt behind.
original_text = _read_text('./data/task/matscholar/train.txt')
generated_text = _read_text('./matsch_GPT35_base/generated.txt')

# generated.txt separates sentences with a blank line. Make sure the original
# data ends with one too, otherwise its last sentence and the first generated
# sentence would run together.
# NOTE(review): assumes the original train.txt uses the same one-token-per-line,
# blank-line-separated layout -- confirm.
if original_text and not original_text.endswith("\n\n"):
    original_text += "\n" if original_text.endswith("\n") else "\n\n"

with open('./matsch_GPT35_base/train.txt', 'w') as f:
    f.write(original_text)
    f.write(generated_text)
