In [121]:
# Build the pool of context sentences used as priors for generation:
# the k longest sentences whose character count stays below the cap.
import pandas as pd

k = 50
max_chars = 600  # exclusive upper bound on sentence length

df = pd.read_csv("../train_hard.csv")[['sentence']]
df = df.assign(length=df['sentence'].str.len())

# Longest sentences first, then discard everything at or above the cap.
df_sorted = df.sort_values(by='length', ascending=False)
df_sorted = df_sorted.loc[df_sorted['length'] < max_chars]

df_k = df_sorted['sentence'].head(k)
df_k.to_csv(f'top_{k}.csv', index=False)

In [122]:
# Prompt templates.
# Bracketed tokens such as [context] are literal placeholders inside the
# template text; they are meant to be substituted with str.replace before the
# prompt is sent to the model.

# Template asking the model to write one paragraph that implicitly illustrates
# a concept. Placeholders: [context] and [instruction].
generation_prompt = '''You are an expert economist tasked with generating a concise and insightful paragraph demonstrating one of three key economic concepts based on a given economic context.
Your output must implicitly explain one of the following economic concepts without explicitly naming it.
Your output should contain your generated paragraph without anything else.

# Concepts:
1. Thinking at the Margin: The concept of making decisions by evaluating the additional (marginal) benefits and costs of small changes.
2. Counterfactual: The idea of constructing hypothetical scenarios to analyze what could have happened under different conditions, often used for causal analysis.
3. General Equilibrium: A framework that examines the simultaneous interaction of multiple markets and agents in an economy, focusing on the interconnectedness of economic systems.

# Context:
[context]

# Instruction:
[instruction]

# Output:'''

# One-line instruction template. Placeholder: [concept] (the concept name).
instruction_prompt = "Generate a paragraph of [concept]."

# Template asking the model to name the concept a paragraph illustrates, or to
# answer None. Placeholder: [paragraph].
# NOTE(review): no visible cell fills [paragraph] or sends this prompt --
# presumably it is used by a later verification step; confirm.
check_prompt = '''You are an expert economist tasked with identifying which one of the three concepts the following paragraph implicitly explaining.
The definition of the concepts are explained as follow. You should output the name of the concept only without anything additional. If the paragraph does not related to any of the concepts, output None.

# Concepts:
1. Thinking at the Margin: The concept of making decisions by evaluating the additional (marginal) benefits and costs of small changes.
2. Counterfactual: The idea of constructing hypothetical scenarios to analyze what could have happened under different conditions, often used for causal analysis.
3. General Equilibrium: A framework that examines the simultaneous interaction of multiple markets and agents in an economy, focusing on the interconnectedness of economic systems.

# Paragraph:
[paragraph]

# Output:'''

def save_to_csv(output_dir, outputs):
    """Merge per-row output dicts into an existing CSV file, in place.

    Args:
        output_dir: Path of the CSV *file* to read and overwrite (despite the
            parameter name, this is not a directory).
        outputs: Sequence of dicts. The i-th dict is written into row ``i``
            of the CSV; each key names a column and each value is the cell
            content. Columns that do not exist yet are created.

    The file is rewritten without the index column.
    """
    df = pd.read_csv(output_dir)
    for i, output_dict in enumerate(outputs):
        for key, value in output_dict.items():
            if key not in df.columns:
                df[key] = None
            elif df[key].dtype != object:
                # A column left over from an earlier run may have been read
                # back with a numeric dtype (e.g. all-empty -> float64).
                # Writing text into it is deprecated in recent pandas, so
                # widen it to object before assigning.
                df[key] = df[key].astype(object)
            df.at[i, key] = value
    df.to_csv(output_dir, index=False)


In [123]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI,AzureOpenAI

def inference(prompt, client, model):
    """Generate one paragraph per economic concept for a single prompt.

    Args:
        prompt: Prompt text containing the literal placeholder
            ``[instruction]``, which is replaced by a concept-specific
            instruction built from ``instruction_prompt``.
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name forwarded to the completion call.

    Returns:
        dict mapping each concept name ('Thinking at the Margin',
        'Counterfactual', 'General Equilibrium') to the generated text, or to
        ``None`` when the request for that concept raised an exception.
    """
    concepts = ['Thinking at the Margin', 'Counterfactual', 'General Equilibrium']
    output_dict = {}
    for concept in concepts:
        try:
            instruction = instruction_prompt.replace("[concept]", concept)
            # Substitute the whole bracketed placeholder. Replacing the bare
            # word 'instruction' would leave the brackets in the prompt and
            # would also rewrite that word wherever it occurs in the context.
            content = prompt.replace('[instruction]', instruction)
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": content}],
                max_tokens=512,
                temperature=0
            )
            output_dict[concept] = response.choices[0].message.content
        except Exception as e:
            # Best effort: report the failure and continue with the next concept.
            print("!!!!! Error when processing prompt !!!!!")
            print(prompt)
            print(concept)
            print(f"{type(e).__name__}: {e}")
            output_dict[concept] = None
    return output_dict

def batch_inference(batch_data, client, model, max_workers=5):
    """Run ``inference`` over every prompt in ``batch_data`` using a thread pool.

    Args:
        batch_data: Sized sequence of prompts.
        client: Client object forwarded to ``inference``.
        model: Model name forwarded to ``inference``.
        max_workers: Number of worker threads in the pool.

    Returns:
        list with one ``inference`` result per prompt, in input order.
    """
    def run_one(prompt):
        return inference(prompt, client, model)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # pool.map yields results in submission order; tqdm only reports progress.
        progress = tqdm(pool.map(run_one, batch_data), total=len(batch_data))
        return [output for output in progress]

In [124]:
# Run generation over the selected context sentences and write the results
# back into the same CSV.
import os

# use appropriate dataset for inference
data_path = f"top_{k}.csv"

print("----- Load Dataset -----")
df = pd.read_csv(data_path)
batch_data = [generation_prompt.replace("[context]", context) for context in df['sentence']]

print("----- Inference Azure GPT-4O-Mini -----")
# The API key is read from the environment so that it never lives in the
# notebook. Any key that was previously committed here should be treated as
# leaked and rotated.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2023-06-01-preview",
    azure_endpoint="https://openai-oe.openai.azure.com/",
)
outputs = batch_inference(batch_data, client, "gpt-4o-mini", 4)
save_to_csv(data_path, outputs)

----- Load Dataset -----
----- Inference Azure GPT-4O-Mini -----


100%|██████████| 50/50 [01:11<00:00,  1.43s/it]


In [125]:
# Flatten the per-context outputs into one (sentence, label) row per generated
# paragraph: the concept name becomes the label, the generated text the sentence.
labelled = [
    (paragraph, concept)
    for output_dict in outputs
    for concept, paragraph in output_dict.items()
]
output_dataset = {
    "sentence": [paragraph for paragraph, concept in labelled],
    "label": [concept for paragraph, concept in labelled],
}

df = pd.DataFrame(output_dataset)
df.to_csv(f"econ-concepts-{k}.csv", index=False)