In [None]:
!pip install openai

In [None]:
import json
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
import re

In [None]:
import os

from openai import OpenAI

# Read the API key from the environment instead of embedding it in the notebook, so the
# secret never ends up in version control or in shared copies of this file.
# Raises KeyError when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# GPT Batch Generation

In [None]:
# Load the input records from the JSON file sitting next to the notebook.
df = pd.read_json(path_or_buf="sensitivity.json")

In [None]:
# Give every row a string identifier derived from its index label; it is used to build the
# request custom_id and later to join the generations back onto the rows.
df['id'] = ['FOLIO_train_' + str(row_label) for row_label in df.index]

In [None]:
# Prompt template sent as the single user message of every request.
# "[[TEXT]]" is a placeholder that create_jsonl_for_chat replaces with the row's 'premisesNL' text.
# Despite the "few-shot" name, the template contains an instruction only and no worked examples.
# NOTE(review): the prompt asks for 3 samples but does not prescribe a numbering format, while the
# parsing cells further down split on "1. " / "2." / "3." — confirm the model reliably numbers its output.
fs_prompt = """Given a natural language sentence, your task is to convert the sentence into first-order logic statements using the following operators '∧','∨','¬','→','↔','∀','∃','=','⊕'. The output is a single first-order statement representing the sentence with no additional tasks. Generate 3 different samples of output.
Text: [[TEXT]]
Output: """

In [None]:
def create_jsonl_for_chat(df, output_file, prompt_template=None, model="gpt-4o", max_tokens=256):
    """Write one Batch API chat-completion request per row of ``df`` as JSON Lines.

    Args:
        df: DataFrame with an 'id' column and a string 'premisesNL' column.
        output_file: Path of the ``.jsonl`` file to create (overwritten if present).
        prompt_template: Prompt containing the ``[[TEXT]]`` placeholder. Defaults to the
            notebook-level ``fs_prompt`` when omitted.
        model: Model name placed in every request body.
        max_tokens: Completion token limit placed in every request body.

    Each line has the shape ``{"custom_id", "method", "url", "body"}`` where ``custom_id`` is
    ``"request-<id>"`` so results can be matched back to their rows.
    """
    if prompt_template is None:
        # Resolved at call time so the default follows whatever fs_prompt currently holds.
        prompt_template = fs_prompt

    # Explicit UTF-8 keeps the file identical regardless of the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as file:
        for row_id, premise in zip(df['id'], df['premisesNL']):
            full_prompt = prompt_template.replace("[[TEXT]]", premise.strip())
            json_line = {
                "custom_id": f"request-{row_id}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [{"role": "user", "content": full_prompt}],
                    "max_tokens": max_tokens,
                },
            }
            file.write(json.dumps(json_line) + '\n')

# Materialise the batch request file for every row of the loaded frame
create_jsonl_for_chat(df, output_file='sensitivity.jsonl')


In [None]:
# Upload the request file for batch processing. The context manager guarantees the local
# file handle is closed once the upload call returns (or raises).
with open("sensitivity.jsonl", "rb") as jsonl_fh:
    batch_input_file = client.files.create(
        file=jsonl_fh,
        purpose="batch",
    )
batch_input_file_id = batch_input_file.id

In [None]:
# Submit the uploaded file as a batch job. Keep the returned object in a variable so its id
# stays available after the cell output is cleared; the bare expression below still displays it.
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "FOL generations with 3 generations at a time"
    },
)
batch

In [None]:
# Fetch the current state of a batch job from the API.
# NOTE(review): "batch_id" looks like a placeholder — presumably it has to be replaced with the
# id of the batch submitted earlier; confirm before running.
client.batches.retrieve("batch_id")

In [None]:
# Download the contents of a file stored with the API.
# NOTE(review): "output_file" looks like a placeholder — presumably the output file id of the
# finished batch; confirm before running.
content = client.files.content("output_file")

In [None]:
# Pull the downloaded payload into memory, then persist it verbatim as the local results file.
content_bytes = content.read()
with open("output.jsonl", mode="wb") as out_file:
    out_file.write(content_bytes)

In [None]:
gpt_op = {}      # custom_id -> raw completion text of the first choice
gpt_failed = []  # custom_ids of records that carried no usable completion

# Decode explicitly as UTF-8: the completions are expected to contain non-ASCII logic
# symbols, and the platform default encoding is not guaranteed to handle them.
with open('output.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank / trailing lines
        data = json.loads(line)
        body = (data.get('response') or {}).get('body') or {}
        choices = body.get('choices') or []
        if not choices:
            # Record the failure instead of aborting the whole parse on one bad record.
            gpt_failed.append(data.get('custom_id'))
            continue
        gpt_op[data['custom_id']] = choices[0]['message']['content']

In [None]:
# Two-column frame: one row per parsed response, request id alongside its raw completion.
df_logic = pd.DataFrame({
    'id': list(gpt_op.keys()),
    'logic_program': list(gpt_op.values()),
})

In [None]:
# custom_id was written as "request-<id>". Strip only that leading prefix (rather than splitting
# on every '-') so ids that themselves contain a hyphen are recovered intact for the merge.
df_logic['id'] = (
    df_logic['id']
    .str.replace(r'^request-', '', regex=True)
    .str.strip()
)

In [None]:
# Attach each generation to its source row; rows without a matching id on both sides are dropped.
df_gpt = df.merge(df_logic, on='id')

In [None]:
def split_logic(fol):
    """Split a numbered completion ("1. ... 2. ... 3. ...") into its three samples.

    Args:
        fol: Raw completion text. Non-string values (e.g. None) are treated as empty.

    Returns:
        A list of exactly three stripped strings, in marker order. A sample whose marker
        is absent from the text is returned as "" instead of a mis-sliced fragment.
    """
    patterns = ["1. ", "2. ", "3. "]
    if not isinstance(fol, str):
        return [""] * len(patterns)

    # Locate each marker, searching forward from the previous one so that "2. " is only
    # accepted after "1. " (and "3. " after "2. ").
    starts = []
    cursor = 0
    for pattern in patterns:
        idx = fol.find(pattern, cursor)
        starts.append(idx)
        if idx != -1:
            cursor = idx + len(pattern)

    parts = []
    for i, (pattern, start) in enumerate(zip(patterns, starts)):
        if start == -1:
            # str.find signals "not found" with -1; never use that as a slice bound.
            parts.append("")
            continue
        # The sample runs up to the next marker that was actually found, else to the end.
        end = next((s for s in starts[i + 1:] if s != -1), len(fol))
        parts.append(fol[start + len(pattern):end].strip())
    return parts

In [None]:
# Parse every completion once with split_logic so that all three columns come from a single
# parsing rule instead of three slightly different ad hoc splits.
parsed_samples = df_gpt['logic_program'].apply(split_logic)
for position, column in enumerate(['sample1', 'sample2', 'sample3']):
    df_gpt[column] = parsed_samples.map(lambda parts, position=position: parts[position])

In [None]:
# Persist the merged rows and their parsed samples as a pretty-printed list of JSON records.
df_gpt.to_json(
    path_or_buf="gpt_fol_samples.json",
    orient="records",
    indent=4,
)