In [1]:
import openai
import pandas as pd
import numpy as np
import os
import json
import random
from dotenv import load_dotenv
import re

In [2]:
# Load variables from the ".env" file into the process environment; the
# OPENAI_ORG / OPENAI_API_KEY values are read from os.environ in the next cell.
load_dotenv(".env")

True

In [3]:
# Configure the OpenAI client from environment variables. A variable that is
# not set yields None here instead of raising.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_ORG")

In [4]:
# Chat model used for the annotation requests; other models can be tried as well.
ANNOTATION_MODEL = "gpt-4o"

# Language codes to annotate: request() skips every file whose parsed language
# code is not in this list.
language_list = [
    "HKG", "THA", "JPN", "KOR",
    "MYS", "CHN", "ENS", "PAK",
]

In [5]:
def parse_filename(filename):
    """Extract the language, number and chapter fields from a sample file name.

    The name is matched from its first character against the layout
    ``SD_<language>_<digits>_..._<number>_<chapter>``; anything after the
    chapter field (such as a file extension) is ignored.

    Args:
        filename: Bare file name, e.g. ``SD_CHN_01_XXX_INT_xx_001_B1_2.txt``.

    Returns:
        A ``(language, number, chapter)`` tuple of strings, or
        ``(None, None, None)`` when the name does not follow the layout.
    """
    found = re.match(r"SD_(\w+)_\d+_.*_(\d+)_([\w+]+)", filename)
    if found is None:
        return None, None, None
    language, number, chapter = found.groups()
    return language, number, chapter

def read_instruction(instruction_path):
    """Return the complete contents of the UTF-8 text file at ``instruction_path``."""
    with open(instruction_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def replace_placeholder_with_text(original_string, text_string):
    """Return ``original_string`` with every ``{text}`` placeholder replaced by ``text_string``."""
    # Splitting on the placeholder and re-joining with the replacement
    # substitutes all occurrences in one pass.
    segments = original_string.split("{text}")
    return text_string.join(segments)

def format_prompt(system_instruction, instruction, input):
    """Build the chat message list for one sample.

    Args:
        system_instruction: Text used as the content of the system message.
        instruction: Prompt template; its ``{text}`` placeholder is filled
            with ``input`` to form the user message.
        input: Sample text inserted into the template.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the
        system message followed by the user message.
    """
    user_content = replace_placeholder_with_text(instruction, input)
    return [
        dict(role="system", content=system_instruction),
        dict(role="user", content=user_content),
    ]

def fix_json_trailing_commas(json_string):
    """Remove trailing commas that precede a closing ``]`` or ``}``.

    A comma followed only by optional whitespace and a closing bracket/brace
    is dropped (the whitespace goes with it), which turns ``[1, 2, ]`` into
    ``[1, 2]``. Double-quoted string literals are matched first and returned
    unchanged, so text such as ``"x, }"`` inside a value is not altered.

    Args:
        json_string: JSON-like text that may contain trailing commas.

    Returns:
        The text with trailing commas outside string literals removed.
    """
    # Alternative 1 consumes a whole string literal (including escaped
    # quotes); alternative 2 is a trailing comma and captures the closer.
    token = re.compile(r'"(?:\\.|[^"\\])*"|,\s*([\]}])', re.DOTALL)

    def _keep_strings(match):
        closer = match.group(1)
        # group(1) is None when the string-literal alternative matched.
        return match.group(0) if closer is None else closer

    return token.sub(_keep_strings, json_string)

def _extract_json_text(raw_answer):
    """Convert the model's raw reply into a JSON array string for ``json.loads``."""
    text = raw_answer.strip()
    # Remove a Markdown code fence only when one is actually present; slicing
    # a fixed number of characters would damage replies that have no fence.
    if text.startswith("```"):
        text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()
        # Drop the "json" language label as a prefix. (str.lstrip('json')
        # would strip any run of the characters j/s/o/n instead.)
        if text[:4].lower() == "json":
            text = text[4:].lstrip()
    # Objects emitted one after another need commas between them to form a list.
    text = text.replace('}\n{', '},\n{')
    # NOTE(review): a reply that is already a JSON array becomes a nested
    # list here, exactly as before — confirm what downstream readers expect.
    text = "[" + text + "]"
    # Python-style booleans are not valid JSON literals.
    text = re.sub(r'\bTrue\b', 'true', text)
    text = re.sub(r'\bFalse\b', 'false', text)
    return fix_json_trailing_commas(text)


def request(samples_path, output_path, system_instruction, prompt_type):
    """Annotate every eligible sample in ``samples_path`` and save the results as JSON.

    Each file name is printed, then the file is processed only if its parsed
    language code is in ``language_list``, its name ends with ``.txt`` and no
    ``<name>.json`` exists yet in ``output_path``. The sample text is inserted
    into ``prompt_type``, sent to ``ANNOTATION_MODEL`` with temperature 0, and
    the reply is normalised to a JSON array and written with 4-space indent.
    A reply that has no content or cannot be parsed is reported on stdout and
    the file is skipped, so it is retried on the next run.

    Args:
        samples_path: Directory containing the sample ``.txt`` files.
        output_path: Existing directory that receives the ``.json`` files.
        system_instruction: Content of the system message.
        prompt_type: Prompt template containing the ``{text}`` placeholder.

    Raises:
        Errors from the OpenAI client and from file access are not caught
        here and propagate to the caller.
    """
    for file_name in sorted(os.listdir(samples_path)):
        print(file_name)
        language, _, _ = parse_filename(file_name)
        if language not in language_list or not file_name.endswith('.txt'):
            continue

        json_name = os.path.splitext(file_name)[0] + '.json'
        output_json_path = os.path.join(output_path, json_name)
        if os.path.exists(output_json_path):
            # Already annotated on a previous run.
            continue

        # Read and close the sample before the (slow) network call.
        with open(os.path.join(samples_path, file_name), 'r', encoding='utf-8') as file:
            sample = file.read()

        messages = format_prompt(system_instruction, prompt_type, sample)
        response = openai.chat.completions.create(
            model=ANNOTATION_MODEL,
            messages=messages,
            temperature=0,
        )
        ans_model = response.choices[0].message.content
        print('===================')
        if ans_model is None:
            # Skip this file instead of failing with a TypeError that would
            # abort the remaining files of the folder.
            print("Error in json: the model returned no message content")
            continue

        ans_model = _extract_json_text(ans_model)
        try:
            data = json.loads(ans_model)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            print(ans_model)
            print(f"Error in json: {e}")
            continue
        print("Loading Successful")

        with open(output_json_path, 'w', encoding='utf-8') as annot_file:
            json.dump(data, annot_file, ensure_ascii=False, indent=4)


In [6]:
# Name of the data set to annotate; it selects the input folder and prefixes
# the output folders.
GENERATE_MODEL = "ICNALE"
output_dir = f"../annotations/{GENERATE_MODEL}_output"
input_dir =  f"../data/{GENERATE_MODEL}"
# makedirs(exist_ok=True) also creates missing parent folders and is a no-op
# when the folder already exists.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(input_dir, exist_ok=True)

system_instruction_path = "../lib/instructions/annotation_instructions/system_instruction.txt"
system_instruction = read_instruction(system_instruction_path)
assist_instruction_dir = "../lib/instructions/annotation_instructions/assist_instructions"

# Run every instruction file against every sample folder found in input_dir.
for instruction in sorted(os.listdir(assist_instruction_dir)):
    instruction_path = os.path.join(assist_instruction_dir, instruction)
    # Skip sub-directories and hidden entries (e.g. checkpoint folders), which
    # cannot be read as instruction text.
    if instruction.startswith(".") or not os.path.isfile(instruction_path):
        continue
    assist_instruction = read_instruction(instruction_path)
    # Remove only a trailing ".txt". str.strip(".txt") strips the characters
    # '.', 't' and 'x' from both ends and mangles names such as "text.txt".
    # Output folders created earlier under a mangled name will not be reused.
    instruction_name = instruction[:-4] if instruction.endswith(".txt") else instruction
    output_path = os.path.join(output_dir, f"{GENERATE_MODEL}_generation_{instruction_name}")
    os.makedirs(output_path, exist_ok=True)
    for item in sorted(os.listdir(input_dir)):
        sample_path = os.path.join(input_dir, item)
        # Only folders hold samples; checking first avoids creating an empty
        # output folder for a stray file.
        if not os.path.isdir(sample_path):
            continue
        annotation_path = os.path.join(output_path, item)
        os.makedirs(annotation_path, exist_ok=True)
        try:
            request(sample_path, annotation_path, system_instruction, assist_instruction)
        except Exception as e:
            # Best effort: report the failure and move on to the next folder.
            print(f"Error: {e}")

SD_CHN_01_XXX_INT_xx_001_B1_2.txt
SD_CHN_01_XXX_INT_xx_002_B1_1.txt
SD_CHN_01_XXX_INT_xx_003_B2_0.txt
SD_CHN_01_XXX_INT_xx_004_B1_2.txt
SD_CHN_01_XXX_INT_xx_005_B1_2.txt
SD_CHN_01_XXX_INT_xx_006_B2_0.txt
SD_CHN_01_XXX_INT_xx_007_A2.0.txt
SD_CHN_01_XXX_INT_xx_008_B2_0.txt
SD_CHN_01_XXX_INT_xx_009_B2_0.txt
SD_CHN_01_XXX_INT_xx_010_B1_2.txt
SD_CHN_01_XXX_INT_xx_011_B1_1.txt
SD_CHN_01_XXX_INT_xx_012_B2_0.txt
SD_CHN_01_XXX_INT_xx_013_B1_2.txt
SD_CHN_01_XXX_INT_xx_014_B1_2.txt
SD_CHN_01_XXX_INT_xx_015_B1_1.txt
SD_CHN_01_XXX_INT_xx_016_B2_0.txt
SD_CHN_01_XXX_INT_xx_017_B2_0.txt
SD_CHN_01_XXX_INT_xx_018_B1_1.txt
SD_CHN_01_XXX_INT_xx_019_B2_0.txt
SD_CHN_01_XXX_INT_xx_020_B1_2.txt
SD_CHN_01_XXX_INT_xx_021_B1_2.txt
SD_CHN_01_XXX_INT_xx_022_B2_0.txt
SD_CHN_01_XXX_INT_xx_023_B1_2.txt
SD_CHN_01_XXX_INT_xx_024_B2_0.txt
SD_CHN_01_XXX_INT_xx_025_B2_0.txt
SD_CHN_01_XXX_INT_xx_026_B1_2.txt
SD_CHN_01_XXX_INT_xx_027_B1_1.txt
SD_CHN_01_XXX_INT_xx_028_B1_2.txt
SD_CHN_01_XXX_INT_xx_029_B1_1.txt
SD_CHN_01_XXX_