In [1]:
import openai
import pandas as pd
import numpy as np
import os
import json
import random
from dotenv import load_dotenv

In [2]:
load_dotenv(".env")

True

In [22]:
openai.organization = "University of Melbourne" 
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [5]:
ANNOTATION_MODEL = "gpt-4o"
#can also try other models as well 

In [6]:
GENERATION_MODEL_LIST = {
    "DeepSeek V3": {"model": "deepseek/deepseek-chat", 
                    "api_key": os.getenv("DEEPSEEK_API_KEY")},
    "QWEN 72B":{"model": "qwen/qwen-2.5-72b-instruct", 
                "api_key": os.getenv("QWEN_API_KEY")},
    "Llama 70B": {"model": "meta-llama/llama-3.3-70b-instruct",
                 "api_key": os.getenv("LLAMA_API_KEY")},
    "Claude 3.5 Haiku": {"model": "anthropic/claude-3.5-haiku", 
                        "api_key": os.getenv("CLAUDE_API_KEY")},
}

In [20]:
def extract_speaker_lines(input_dir, output_dir):
    for data_dir in os.listdir(input_dir):
        data_path = os.path.join(input_dir, data_dir)
        out_path = os.path.join(output_dir, data_dir)
        if not os.path.exists(out_path):
            os.mkdir(out_path)
        for file_path in os.listdir(data_path):
            input_file_path = os.path.join(data_path, file_path)
            out_file_path = os.path.join(out_path, file_path)
            with open(input_file_path, 'r', encoding='utf-8') as file:
                conversation = file.read()
                lines = conversation.split("\n")
                extracted_lines = []
                for line in lines:
                    if "L2" in line or "Speaker B" in line:
                        dialogue = line.split("** ", 2)[-1].strip()
                        extracted_lines.append(dialogue)
                result = "\n".join(extracted_lines)
                with open(out_file_path, "w", encoding="utf-8") as ffile:
                    ffile.write(result)

def read_instruction(instruction_path):
    with open(instruction_path, 'r', encoding='utf-8') as file:
        instruction = file.read()
    return instruction

def replace_placeholder_with_text(original_string, text_string):
    return original_string.replace("{text}", text_string)

def format_prompt(system_instruction, instruction, input):
    #print(f"Text: {input}")
    #print(f"Instruction: {instruction}")
    message = replace_placeholder_with_text(instruction, input) 
    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": message},
    ]
    return messages

def request(samples_path, output_path, system_instruction, prompt_type):  # parameters
    for file_name in sorted(os.listdir(samples_path)):
        print(file_name)
        if file_name.endswith('.txt'):
            file_path = os.path.join(samples_path, file_name)
            json_name = os.path.splitext(file_name)[0] + '.json'
            output_json_path = os.path.join(output_path, json_name)
            if os.path.exists(output_json_path):
                continue
            
            with open(file_path, 'r', encoding='utf-8') as file:
                sample = file.read()
                messages = format_prompt(system_instruction, prompt_type, sample)
                response = openai.chat.completions.create(
                    model=ANNOTATION_MODEL,
                    messages=messages,
                    temperature=0,
                )
                ans_model = response.choices[0].message.content
                print(ans_model)
                print('===================')
                ans_model = ans_model[3:-3].strip()
                ans_model = ans_model.lstrip('json').strip()
                # try:
                #     ans_model = json.loads(ans_model)
                # except json.JSONDecodeError as e:
                #     print(f"JSONDecodeError: {e}")
                ans_model = json.loads(ans_model)
                if os.path.exists(output_json_path):
                    with open(output_json_path, 'r', encoding='utf-8') as out_file:
                        try:
                            data = json.load(out_file)
                        except json.JSONDecodeError:
                            data = []
                else:
                    data = []
                data.extend(ans_model)

                with open(output_json_path, 'w', encoding='utf-8') as annot_file:
                    json.dump(data, annot_file, ensure_ascii=False, indent=4)
    return


In [23]:
GENERATE_MODEL = "QWEN 72B"
input_dir = f"../data/{GENERATE_MODEL}_output"
output_dir = f"../annotations/{GENERATE_MODEL}_output"
samples_dir =  f"../data/{GENERATE_MODEL}_output_intermediate"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
if not os.path.exists(samples_dir):
    os.mkdir(samples_dir)
extract_speaker_lines(input_dir, samples_dir)

system_instruction_path = "../lib/instructions/annotation_instructions/system_instruction.txt"
system_instruction = read_instruction(system_instruction_path)
assist_instruction_dir = "../lib/instructions/annotation_instructions/assist_instructions"

print(system_instruction)

for instruction in os.listdir(assist_instruction_dir):
    instruction_path = os.path.join(assist_instruction_dir, instruction)
    assist_instruction = read_instruction(instruction_path)
    instruction_name = instruction.strip(".txt")
    output_path = os.path.join(output_dir, f"{GENERATE_MODEL}_generation_{instruction_name}")
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    for item in sorted(os.listdir(samples_dir)):
        sample_path = os.path.join(samples_dir, item)
        annotation_path = os.path.join(output_path, item)
        if not os.path.exists(annotation_path):
            os.mkdir(annotation_path)
        request(sample_path, annotation_path, system_instruction, assist_instruction)

'''
You are a language expert specializing in doing text annotation in the English second language. You will be instructed to make annotations to a given dialogue texts based on some linguistics aspects to evaluate grammatical features in L2 texts.

The given text samples are from the English dialogue of second language speakers of English.  

Make sure to keep the annotation format without any change in passage when giving the annotation output.

You should annotate each L2 utterance if it has targeted grammatical feature. 
If the utterance has multiple tokens/phrases that need to be annotated, you should list all the token/phrase annotations separately as follows:
{
    "type": "the targeted grammatical feature",
    "annotation_utterance": "the utterance of annotation target"
    "annotation_tokens": [
        {"token": "the token of annotation target",
        "grammar correctness": the annotated grammar feature is aligned with the native English speaker's grammar usage. This can a

AuthenticationError: Error code: 401 - {'error': {'message': 'OpenAI-Organization header should match organization for API key', 'type': 'invalid_request_error', 'param': None, 'code': 'mismatched_organization'}}