In [111]:
import json
import os
import pandas as pd
from tqdm import tqdm
from openai import OpenAI, AzureOpenAI

In [112]:
client = AzureOpenAI(
            azure_endpoint="https://cullmsouthindia.openai.azure.com/",
            api_key="037155e1b16a432fa836637370eca0e3",
            api_version="2024-02-15-preview",
        )

In [113]:

def read_json_files(directory):
    # List to hold data from all JSON files
    all_data = []
    
    # Iterate over all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            
            with open(filepath, 'r') as file:
                data = json.load(file)
                all_data.append(data)
    
    return all_data

In [114]:
dataset=read_json_files('/Users/ramnarayanchoudhary/Desktop/UGRIP/updated_ugrip24-ling/ugrip24-ling/datasets/PuzzLing/data_public_data_dev')
dataset[0]

{'source_language': 'dyirbal',
 'target_language': 'english',
 'meta': 'ŋ = ng in hang. ɲ ≈ ni in onion; ȷ is a stop (as d) articulated in the same place in the mouth as ɲ.',
 'train': [['bayi yaɽa ŋunȷaymuŋa baŋgu gurugugu biŋgunman.',
   'Booze is making the man that is always being blamed tired.'],
  ['balan yabu bimabanȷalŋaymuŋa baŋgul yaɽaŋgu guliŋgu ŋunȷaɲu.',
   'The strong man is blaming the mother that is always following death adders.'],
  ['balan waymin bambun baŋgu ȷugaŋgu ȷamiman.',
   'Sugar is making the healthy mother-in-law fat.'],
  ['bala yila wura baŋgul bargandu biŋgundu guniɲu.',
   'The tired wallaby is searching for the little feather.'],
  ['balan malayigara baŋgu garandu biŋgunman.',
   'The smoke is making the scorpion tired.'],
  ['bala gurugu baŋgul ŋumaŋgu munduŋgu dimbaɲu.',
   'The offended father is carrying the booze.'],
  ['bayi midin baŋgun bimaŋgu malayigaraguninaymuŋagu banȷan.',
   'The death adder that is always searching for scorpions is follow

In [126]:
def question_data_to_puzzle_prompt(core_question_data):
    BASE_PROMPT = """Please translate the following statements in <LANG> and English to English and <LANG> respectively.

<QUESTIONS>

Instruction: Each translation pair should be a dictionary with "<LANG>" and "English" keys, and multiple pairs should be within a list called "answers". Ensure the response does not include ```json, ```plaintext, or the word "answer" as these cause issues. Only use the QUESTIONS section for response generation, without including any other question-answer information. The number of responses should match the number of QUESTIONS provided, and the sequence of responses should follow the input QUESTIONS correctly, with the source language always first in the response.
"""

    source_language = core_question_data["source_language"]
    train_data = core_question_data["train"]
    test_data = core_question_data["test"]
    
    data_pairs = []
    for pair in train_data:
        data_pairs.append(f"\t\"{source_language}\": \"{pair[0]}\"\n\t\"english\": \"{pair[1]}\"")
    
    question_pairs = []
    for pair in test_data:
        if pair[2] == ">":  # Translation from source_language to English
            question_pairs.append(f"\t\"{source_language}\": \"{pair[0]}\"")
        elif pair[2] == "<":  # Translation from English to source_language
            question_pairs.append(f"\t\"english\": \"{pair[1]}\"")

    return BASE_PROMPT.replace("<LANG>", source_language).replace("<DATA>", ',\n\n'.join(data_pairs)).replace("<QUESTIONS>", ',\n'.join(question_pairs))


In [116]:
for i, data in enumerate(dataset): # please be carefull about the use of the for loop 
    print(f"Puzzle prompt for dataset entry {i}:\n")
    print(question_data_to_puzzle_prompt(data))
    print("\n" + "="*80 + "\n")
    


Puzzle prompt for dataset entry 0:

Please translate the following statements in dyirbal and English to English and dyirbal respectively.

	"dyirbal": "balan ɲalŋga baŋgul ŋumaŋgu guniymuŋagu bambunman.",
	"dyirbal": "bala diban bilmbalmuŋa baŋgun biɲȷiriɲȷu guniɲu.",
	"dyirbal": "bayi bargan baŋgul yaɽaŋgu gubimbuluŋunȷanaymuŋagu banȷan.",
	"english": "The little wallaby is looking at the dragonfly.",
	"english": "The aunt that is always being followed is bending the feather.",
	"english": "The sleeping possum is ignoring the loud noise.",
	"english": "The caterpillar is searching for the man that is always carrying stones."

Instruction: Each translation pair should be a dictionary with "dyirbal" and "English" keys, and multiple pairs should be within a list called "answers". Ensure the response does not include ```json, ```plaintext, or the word "answer" as these cause issues. Only use the QUESTIONS section for response generation, without including any other question-answer informa

In [127]:
def chat(prompt, model):
    # Ensure the API key is retrieved from the environment variable
    # AZURE_OPENAI_API_KEY = "037155e1b16a432fa836637370eca0e3"
    # api_key = "037155e1b16a432fa836637370eca0e3"
    # if api_key is None:
    #     raise ValueError("AZURE_OPENAI_API_KEY environment variable is not set")
    client = AzureOpenAI(
            azure_endpoint="https://cullmsouthindia.openai.azure.com/",
            api_key="037155e1b16a432fa836637370eca0e3",
            api_version="2024-02-15-preview",
        )
    
    message_text = [{"role": "system", "content": "You are a helpful assistant."}, 
                    {"role": "user", "content": prompt}
                    ]
    completion = client.chat.completions.create(
        model=model,  # model = "deployment_name"
        messages=message_text,
        temperature=0,
        max_tokens=4096,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return completion

In [128]:
def fetch_solution_text(core_question_data):
    prompt = question_data_to_puzzle_prompt(core_question_data)
    model = "gpt4"

    try:
        
        response = chat(prompt, model)
        result = response.choices[0].message.content.strip()
        return result

    except Exception as e:
        print(f"Error: {e}")
        return ""

In [129]:
solution_texts = []

for data in dataset:
    raw_text = fetch_solution_text(data)
    solution_texts.append(raw_text)

for i, text in enumerate(solution_texts):
    print(f"Solution text for dataset {i+1}:")
    print(text)

Solution text for dataset 1:
[
    {
        "dyirbal": "balan ɲalŋga baŋgul ŋumaŋgu guniymuŋagu bambunman.",
        "English": "The man is hitting the dog with a stick."
    },
    {
        "dyirbal": "bala diban bilmbalmuŋa baŋgun biɲȷiriɲȷu guniɲu.",
        "English": "The woman is giving the child some water."
    },
    {
        "dyirbal": "bayi bargan baŋgul yaɽaŋgu gubimbuluŋunȷanaymuŋagu banȷan.",
        "English": "The man is standing on the mountain."
    },
    {
        "English": "The little wallaby is looking at the dragonfly.",
        "dyirbal": "bayi yabuŋgu baŋgul yaraŋgu gudaŋaymuŋagu bambun."
    },
    {
        "English": "The aunt that is always being followed is bending the feather.",
        "dyirbal": "bala diban duguŋguŋgu baŋgul yabuŋgu gudabanyjilŋunȷanaymuŋagu bambun."
    },
    {
        "English": "The sleeping possum is ignoring the loud noise.",
        "dyirbal": "bayi yabuŋgu baŋgul yaraŋgu gudabanyjilŋunȷanaymuŋagu bambun."
    },
    {
      

In [130]:
dataset_ref=read_json_files('/Users/ramnarayanchoudhary/Desktop/UGRIP/updated_ugrip24-ling/ugrip24-ling/datasets/PuzzLing/data_public_reference_data_dev')


In [131]:
def extract_content(entry):
    # Use regex to extract the content part of the string
    match = re.search(r':\s*"([^"]*)"', entry)
    if match:
        # Return the extracted content
        return match.group(1)
    else:
        return "No content found"

In [132]:
import re

def extract_source_language(entry):
    match = re.match(r'^\s*"(\w+)":', entry)
    if match:
        # Return the source language (convert to lower case for consistency)
        return match.group(1).lower()
    else:
        return "No source language found"

In [133]:
def extract_content_by_language(entry, t_language):
    # Normalize the case of the keys to avoid case mismatch issues
    t_language = t_language.lower()
    entry = {k.lower(): v for k, v in entry.items()}
    
  
    if t_language in entry:
        return entry[t_language]
    else:
        return "Content not found"

In [134]:

res = []
idx=-1

# Iterate through the dataset_ref
for  d in dataset_ref:
    
    
    source_language = d["source_language"]
    print("i_s",source_language)
    target_language = d["target_language"]
    test_data = d["test"]
    solution_list = json.loads(solution_texts[idx+1])
    idx=idx+1
    print("s_l",solution_list)

    question_pairs = []
    solution_pairs = []
    for pair in test_data:
        if pair[2] == ">":  # Translation from source_language to English
            question_pairs.append(f"\"{source_language}\": \"{pair[0]}\"")
        elif pair[2] == "<":  # Translation from English to source_language
            question_pairs.append(f"\"english\": \"{pair[1]}\"")
    
    for pair in test_data:
        if pair[2] == ">":  # Translation from source_language to English
            solution_pairs.append(pair[1])
        elif pair[2] == "<":  # Translation from English to source_language
            solution_pairs.append(pair[0])
    print("q_p", question_pairs)
    print("s_p",solution_pairs)
    # Iterate through the question_pairs and solution_list
    for q, s in zip(question_pairs, solution_list):
        print("this is q which is  going inside",q)
        q_content = extract_content(q)
        print("q_ins:",q_content)
        s_language = extract_source_language(q)
        print("s_la_in",s_language)
        if s_language == source_language:
            t_language = target_language
        else:
            t_language = source_language
        print("t_la_in",t_language)
        A_s = solution_pairs[question_pairs.index(q)]  # Actual solution from the test dataset
        print("the Actual solution :",A_s )
        print("s inside the block",s )
        # print("this is try",s.get(t_language))
        G_s=extract_content_by_language(s,t_language)
        print("the gpt Generated solution :",G_s )

        # Create a row for the DataFrame
        row = {
            "source_lang": s_language,
            "Question": q_content,
            "target_lang": t_language,
            "Actual_answer": A_s,
            "Generated_answer": G_s
        }
        res.append(row)

# Create a DataFrame from the result list
res_df = pd.DataFrame(res)

# Display the DataFrame
print(res_df)


# there is little problem in madak solution pair 

i_s dyirbal
s_l [{'dyirbal': 'balan ɲalŋga baŋgul ŋumaŋgu guniymuŋagu bambunman.', 'English': 'The man is hitting the dog with a stick.'}, {'dyirbal': 'bala diban bilmbalmuŋa baŋgun biɲȷiriɲȷu guniɲu.', 'English': 'The woman is giving the child some water.'}, {'dyirbal': 'bayi bargan baŋgul yaɽaŋgu gubimbuluŋunȷanaymuŋagu banȷan.', 'English': 'The man is standing on the mountain.'}, {'English': 'The little wallaby is looking at the dragonfly.', 'dyirbal': 'bayi yabuŋgu baŋgul yaraŋgu gudaŋaymuŋagu bambun.'}, {'English': 'The aunt that is always being followed is bending the feather.', 'dyirbal': 'bala diban duguŋguŋgu baŋgul yabuŋgu gudabanyjilŋunȷanaymuŋagu bambun.'}, {'English': 'The sleeping possum is ignoring the loud noise.', 'dyirbal': 'bayi yabuŋgu baŋgul yaraŋgu gudabanyjilŋunȷanaymuŋagu bambun.'}, {'English': 'The caterpillar is searching for the man that is always carrying stones.', 'dyirbal': 'bayi bagan baŋgul yaraŋgu gudaŋaymuŋagu bambun.'}]
q_p ['"dyirbal": "balan ɲalŋga 

In [135]:
res_df.head()

Unnamed: 0,source_lang,Question,target_lang,Actual_answer,Generated_answer
0,dyirbal,balan ɲalŋga baŋgul ŋumaŋgu guniymuŋagu bambun...,english,The father that is always being searched for i...,The man is hitting the dog with a stick.
1,dyirbal,bala diban bilmbalmuŋa baŋgun biɲȷiriɲȷu guniɲu.,english,The lizard is searching for the stone that is ...,The woman is giving the child some water.
2,dyirbal,bayi bargan baŋgul yaɽaŋgu gubimbuluŋunȷanaymu...,english,The man that is always blaming doctors is foll...,The man is standing on the mountain.
3,english,The little wallaby is looking at the dragonfly.,dyirbal,bayi yiriɲȷila baŋgul bargandu wuraŋgu buɽan.,bayi yabuŋgu baŋgul yaraŋgu gudaŋaymuŋagu bambun.
4,english,The aunt that is always being followed is bend...,dyirbal,bala yila baŋgun mugunanȷagu banȷalmuŋagu waɽu...,bala diban duguŋguŋgu baŋgul yabuŋgu gudabanyj...


In [136]:
res_df["score_exact_match"] = [g.strip()==a.strip() for g, a in zip(res_df["Generated_answer"],res_df["Actual_answer"])]

In [137]:
res_df.head

<bound method NDFrame.head of    source_lang                                           Question target_lang  \
0      dyirbal  balan ɲalŋga baŋgul ŋumaŋgu guniymuŋagu bambun...     english   
1      dyirbal   bala diban bilmbalmuŋa baŋgun biɲȷiriɲȷu guniɲu.     english   
2      dyirbal  bayi bargan baŋgul yaɽaŋgu gubimbuluŋunȷanaymu...     english   
3      english    The little wallaby is looking at the dragonfly.     dyirbal   
4      english  The aunt that is always being followed is bend...     dyirbal   
..         ...                                                ...         ...   
84     luiseño                           ʂuŋa:liʂuto:wqhu:ʔunikat     english   
85     english                   Is (the/a) teacher (the/a) liar?     luiseño   
86     english                (The/A) teacher sees (the/a) woman.     luiseño   
87     english                  This girl does not see my father.     luiseño   
88     english                                       Who is good?     luiseño  

In [140]:
len(res_df)

89

In [138]:
print(res_df["score_exact_match"].mean())

0.11235955056179775


In [139]:
res_df.to_csv("puzzling_baseline_eval_v3_gpt4_1.csv",index=False)