In [1]:
import openai
import pandas as pd
import numpy as np
import os
import json
import random
from dotenv import load_dotenv
import re

In [2]:
# Load variables from the local ".env" file; the cells below read the API
# keys and organization id through os.environ / os.getenv.
load_dotenv(".env")

True

In [3]:
# Configure the OpenAI client from the environment. Either value is None
# when the corresponding variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_ORG")

In [4]:
# Model name passed to the chat-completions call in `request` to produce the
# annotations. Other chat models can be substituted here.
ANNOTATION_MODEL = "gpt-4o"

In [5]:
# Generation models keyed by display name. Each entry maps to the model
# identifier and the API key read from the named environment variable
# (None when that variable is unset). Despite the name, this is a dict.
GENERATION_MODEL_LIST = {
    display_name: {"model": model_id, "api_key": os.getenv(key_variable)}
    for display_name, model_id, key_variable in (
        ("DeepSeek V3", "deepseek/deepseek-chat", "DEEPSEEK_API_KEY"),
        ("QWEN 72B", "qwen/qwen-2.5-72b-instruct", "QWEN_API_KEY"),
        ("Llama 70B", "meta-llama/llama-3.3-70b-instruct", "LLAMA_API_KEY"),
        ("Claude 3.5 Haiku", "anthropic/claude-3.5-haiku", "CLAUDE_API_KEY"),
        ("GPT 4o", "gpt-4o", "OPENAI_API_KEY"),
    )
}

In [6]:
def extract_speaker_lines(input_dir, output_dir):
    """Write the "L2" / "Speaker B" turns of every conversation file.

    Each sub-directory of ``input_dir`` is mirrored under ``output_dir``. For
    every file inside such a sub-directory, a file with the same name is
    written that contains one extracted utterance per line.

    Args:
        input_dir: Directory whose sub-directories hold the conversation files.
        output_dir: Root directory for the extracted files. It and any missing
            parent directories are created on demand.

    Entries of ``input_dir`` that are not directories, and entries of a
    sub-directory that are not regular files, are skipped instead of raising.
    """
    for data_dir in sorted(os.listdir(input_dir)):
        data_path = os.path.join(input_dir, data_dir)
        if not os.path.isdir(data_path):
            # Stray files next to the data folders (e.g. .DS_Store).
            continue
        out_path = os.path.join(output_dir, data_dir)
        # makedirs also creates missing parents and tolerates existing dirs.
        os.makedirs(out_path, exist_ok=True)
        for file_name in sorted(os.listdir(data_path)):
            input_file_path = os.path.join(data_path, file_name)
            if not os.path.isfile(input_file_path):
                continue
            out_file_path = os.path.join(out_path, file_name)
            with open(input_file_path, 'r', encoding='utf-8') as file:
                conversation = file.read()
            # Keep only the target speaker's lines. The text kept is the last
            # piece after splitting on "** " at most twice, which drops the
            # leading speaker label.
            extracted_lines = [
                line.split("** ", 2)[-1].strip()
                for line in conversation.split("\n")
                if "L2" in line or "Speaker B" in line
            ]
            with open(out_file_path, "w", encoding="utf-8") as out_file:
                out_file.write("\n".join(extracted_lines))

def read_instruction(instruction_path):
    """Return the entire UTF-8 text content of ``instruction_path``."""
    with open(instruction_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def replace_placeholder_with_text(original_string, text_string):
    """Substitute every literal ``{text}`` in ``original_string`` with ``text_string``."""
    placeholder = "{text}"
    filled = original_string.replace(placeholder, text_string)
    return filled

def format_prompt(system_instruction, instruction, input):
    """Build the chat message list for one annotation request.

    The ``{text}`` placeholder in ``instruction`` is filled with ``input`` to
    form the user message; ``system_instruction`` becomes the system message.
    Returns a two-element list: the system message followed by the user message.
    """
    user_content = replace_placeholder_with_text(instruction, input)
    system_message = {"role": "system", "content": system_instruction}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]

def fix_json_trailing_commas(json_string):
    """Remove trailing commas that precede a closing ``]`` or ``}``.

    A comma followed by optional whitespace and a closing bracket/brace is
    dropped together with that whitespace. Double-quoted string literals are
    matched first and copied through unchanged, so text such as ``"wait, ]"``
    inside a value is not altered.

    Args:
        json_string: JSON-like text that may contain trailing commas.

    Returns:
        The text with structural trailing commas removed.
    """
    def _keep_string_or_drop_comma(match):
        string_literal = match.group(1)
        if string_literal is not None:
            # Inside a string literal: emit it untouched.
            return string_literal
        # Structural trailing comma: keep only the closing bracket/brace.
        return match.group(2)

    # Alternative 1 consumes a whole string literal (honouring backslash
    # escapes); alternative 2 is the trailing-comma pattern.
    pattern = r'("(?:\\.|[^"\\])*")|,\s*([\]}])'
    return re.sub(pattern, _keep_string_or_drop_comma, json_string, flags=re.DOTALL)

def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if present.

    Handles both bare fences and fences carrying a ``json`` language tag.
    Text that is not fenced is returned stripped but otherwise unchanged.
    """
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.lstrip()
        # Remove the language tag as a prefix; str.lstrip('json') would
        # instead strip any run of the characters j, s, o, n.
        if text.startswith("json"):
            text = text[len("json"):]
    return text.strip()


def request(samples_path, output_path, system_instruction, prompt_type):
    """Annotate every ``.txt`` sample in a directory and save the results as JSON.

    For each ``.txt`` file in ``samples_path`` (processed in sorted order) that
    has no matching ``.json`` file in ``output_path`` yet, the sample is sent
    to ``ANNOTATION_MODEL`` and the reply is cleaned up, wrapped in a JSON
    list, parsed and written to ``<output_path>/<sample name>.json``.

    Args:
        samples_path: Directory containing the ``.txt`` samples.
        output_path: Existing directory that receives the ``.json`` files.
        system_instruction: Content of the system message.
        prompt_type: User-message template; its ``{text}`` placeholder is
            replaced by the sample text.

    Samples whose reply is empty or cannot be parsed as JSON are reported and
    skipped; no file is written for them, so a later run retries them.
    """
    for file_name in sorted(os.listdir(samples_path)):
        print(file_name)
        if not file_name.endswith('.txt'):
            continue
        json_name = os.path.splitext(file_name)[0] + '.json'
        output_json_path = os.path.join(output_path, json_name)
        if os.path.exists(output_json_path):
            # Already annotated on a previous run.
            continue

        file_path = os.path.join(samples_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            sample = file.read()

        # The sample file is closed before the (slow) network call.
        messages = format_prompt(system_instruction, prompt_type, sample)
        response = openai.chat.completions.create(
            model=ANNOTATION_MODEL,
            messages=messages,
            temperature=0,
        )
        raw_answer = response.choices[0].message.content
        print('===================')
        if not raw_answer:
            print("Error in json: empty response")
            continue

        ans_model = _strip_code_fence(raw_answer)
        # Back-to-back objects are joined with commas and wrapped in a list.
        ans_model = ans_model.replace('}\n{', '},\n{')
        ans_model = "[" + ans_model + "]"
        # Python-style booleans -> JSON booleans.
        # NOTE(review): this also rewrites True/False inside string values.
        ans_model = re.sub(r'\bTrue\b', 'true', ans_model)
        ans_model = re.sub(r'\bFalse\b', 'false', ans_model)
        ans_model = fix_json_trailing_commas(ans_model)
        try:
            data = json.loads(ans_model)
        except json.JSONDecodeError as e:
            print(f"Error in json: {e}")
            continue
        print("Loading Successful")
        with open(output_json_path, 'w', encoding='utf-8') as annot_file:
            json.dump(data, annot_file, ensure_ascii=False, indent=4)


In [7]:
# Which generation model's conversations to annotate.
GENERATE_MODEL = "QWEN 72B"
input_dir = f"../data/{GENERATE_MODEL}_output"
output_dir = f"../annotations/{GENERATE_MODEL}_output"
samples_dir = f"../data/{GENERATE_MODEL}_output_intermediate"
# makedirs creates missing parent folders too and is a no-op when the
# directory already exists, so the cell can be re-run safely.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(samples_dir, exist_ok=True)
extract_speaker_lines(input_dir, samples_dir)

system_instruction_path = "../lib/instructions/annotation_instructions/system_instruction.txt"
system_instruction = read_instruction(system_instruction_path)
assist_instruction_dir = "../lib/instructions/annotation_instructions/assist_instructions"

# Run every assist instruction over every folder of extracted samples.
for instruction in sorted(os.listdir(assist_instruction_dir)):
    instruction_path = os.path.join(assist_instruction_dir, instruction)
    if not os.path.isfile(instruction_path):
        # Skip sub-directories such as .ipynb_checkpoints.
        continue
    assist_instruction = read_instruction(instruction_path)
    # Remove only a trailing ".txt". str.strip(".txt") would strip any run of
    # the characters '.', 't', 'x' from BOTH ends (e.g. "tone.txt" -> "one").
    # NOTE: folders created earlier under a mangled name are not reused.
    if instruction.endswith(".txt"):
        instruction_name = instruction[:-len(".txt")]
    else:
        instruction_name = instruction
    output_path = os.path.join(output_dir, f"{GENERATE_MODEL}_generation_{instruction_name}")
    os.makedirs(output_path, exist_ok=True)
    for item in sorted(os.listdir(samples_dir)):
        sample_path = os.path.join(samples_dir, item)
        if not os.path.isdir(sample_path):
            continue
        annotation_path = os.path.join(output_path, item)
        os.makedirs(annotation_path, exist_ok=True)
        request(sample_path, annotation_path, system_instruction, assist_instruction)

Cultural Exchange and Traditions_0.txt
Cultural Exchange and Traditions_1.txt
Cultural Exchange and Traditions_2.txt
Cultural Exchange_0.txt
Cultural Exchange_1.txt
Cultural Exchange_2.txt
Daily Conversations_0.txt
Daily Conversations_1.txt
Daily Conversations_2.txt
Daily Life_0.txt
Daily Life_1.txt
Daily Life_2.txt
Education_0.txt
Education_1.txt
Education_2.txt
Educational Settings_0.txt
Educational Settings_1.txt
Educational Settings_2.txt
Entertainment_0.txt
Entertainment_1.txt
Entertainment_2.txt
Health and Wellness_0.txt
Health and Wellness_1.txt
Health and Wellness_2.txt
Personal Goals and Experiences_0.txt
Personal Goals and Experiences_1.txt
Personal Goals and Experiences_2.txt
Problem-Solving and Conflict Resolution_0.txt
Problem-Solving and Conflict Resolution_1.txt
Problem-Solving and Conflict Resolution_2.txt
Problem-Solving_0.txt
Problem-Solving_1.txt
Problem-Solving_2.txt
Shopping and Services_0.txt
Shopping and Services_1.txt
Shopping and Services_2.txt
Shopping_0.txt
S