In [6]:
import openai
import json
import time
import os
import logging

logging.basicConfig(level=logging.INFO)
from dotenv import load_dotenv

# Load the API key from a local .env file (if one exists) into the environment.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# os.getenv returns None when the variable is absent. Flag that here so the
# problem is visible before the batch loop starts issuing requests.
if not api_key:
    logging.warning("OPENAI_API_KEY is not set; OpenAI API calls will fail until it is configured.")

# Configure OpenAI API client
openai.api_key = api_key

In [7]:
# Load the un-annotated sample records that will be sent to the model.
# The encoding is given explicitly: without it open() falls back to the
# platform locale, which can mis-decode non-ASCII text on some systems.
with open('Samples/sample_siirtokarjalaiset_NOT_annotated.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# JSON Lines file that the batch loop appends the parsed model answers to.
output_file_path = 'apiResponse/all_responses_200_sample.json'

In [8]:

# Send the input records to the chat model in small batches, keep every raw
# API response on disk, and append the parsed per-person answers to a JSON
# Lines file (one JSON object per line).

RAW_RESPONSE_DIR = 'apiResponse/raw_api_responses'
output_file_path = 'apiResponse/all_responses_200_sample.json'

# Create the output directories once, up front. exist_ok=True replaces the
# check-then-create pattern and the per-batch existence test.
os.makedirs('apiResponse', exist_ok=True)
os.makedirs(RAW_RESPONSE_DIR, exist_ok=True)

max_retries = 5
batch_size = 3

RATE_LIMIT_WAIT_SECONDS = 60   # back-off after a rate-limit error
ERROR_WAIT_SECONDS = 10        # back-off after any other API error
RETRY_PAUSE_SECONDS = 2        # extra pause added before every retry
REQUEST_TIMEOUT_SECONDS = 30   # per-request timeout

# Loop-invariant part of the user prompt; the JSON-encoded batch is appended.
# NOTE(review): the adjacent literals are joined without newlines, so the
# response template reaches the model as a single run-on line. The text is
# kept unchanged so that results stay comparable with earlier runs.
PROMPT_INSTRUCTIONS = (
    "I need you to scrape data from this text. List only name, index number, hobbies and social organisations. "
    "Notice to list spouse's hobbies and social orgs separately. Do not list jobs, or war time occupations. "
    "Do not suggest to make an algorithm. If no social orgs detected respond: - "
    "Do not say anything but the asked information. "
    "Response in format: "
    "--"
    "PersonID: "
    "PersonName: "
    "PersonHobbies: "
    "PersonSocialOrgs: "
    "SpouseID:"
    "SpouseName:"
    "SpouseHobbies: "
    "SpouseSocialOrgs: "
)

# NOTE: results are appended to output_file_path and the loop always starts at
# index 0, so re-running this cell adds duplicate records to an existing file.
for i in range(0, len(data), batch_size):
    batch = data[i:i + batch_size]
    batch_number = i // batch_size
    data_str = json.dumps(batch)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": PROMPT_INSTRUCTIONS + data_str},
    ]

    # API call with retries. Only the network call sits inside the try block,
    # so the except clause handles nothing but API failures.
    response = None
    for attempt in range(1, max_retries + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
                temperature=0.8,
                # In the pre-1.0 openai SDK `timeout` only bounds the wait
                # while the model warms up; `request_timeout` is what limits
                # the HTTP request itself, so both are set.
                timeout=REQUEST_TIMEOUT_SECONDS,
                request_timeout=REQUEST_TIMEOUT_SECONDS,
            )
            break
        except openai.error.OpenAIError as e:
            logging.error(f"OpenAI API error in batch starting at index {i}: {str(e)}")
            if attempt == max_retries:
                # No further attempt will be made, so waiting would be pointless.
                break
            # Prefer the exception type; keep the message check as a fallback.
            rate_limited = (
                isinstance(e, openai.error.RateLimitError)
                or "Rate limit exceeded" in str(e)
            )
            if rate_limited:
                logging.info(f"Rate limit exceeded, waiting for {RATE_LIMIT_WAIT_SECONDS} seconds...")
                time.sleep(RATE_LIMIT_WAIT_SECONDS)
            else:
                logging.info(f"Encountered an error, waiting for {ERROR_WAIT_SECONDS} seconds...")
                time.sleep(ERROR_WAIT_SECONDS)
            time.sleep(RETRY_PAUSE_SECONDS)

    if response is None:
        # Make the dropped batch visible instead of skipping it silently.
        logging.error(
            f"Giving up on batch {batch_number} (starting at index {i}) "
            f"after {max_retries} failed attempts; no output written for it."
        )
        continue

    # Save raw response
    raw_response_path = f'{RAW_RESPONSE_DIR}/raw_response_{batch_number}.json'
    with open(raw_response_path, 'w', encoding='utf-8') as raw_file:
        json.dump(response, raw_file, ensure_ascii=False, indent=4)

    # Splitting on '--' to separate individual responses; the text before the
    # first '--' is discarded.
    responses = response['choices'][0]['message']['content'].split('--')[1:]

    # person_index is derived from the segment position, so a segment count
    # that differs from the batch size means the indexes may be misaligned.
    if len(responses) != len(batch):
        logging.warning(
            f"Batch {batch_number}: expected {len(batch)} responses but got "
            f"{len(responses)}; person_index values may be misaligned."
        )

    # Storing responses along with batch and individual indexes
    structured_responses = [
        {
            "batch_number": batch_number,
            "person_index": i + j,
            "api_response": api_response.strip(),  # Removing leading/trailing whitespaces
        }
        for j, api_response in enumerate(responses)
    ]

    # Save/Append the batch response to the file
    with open(output_file_path, 'a', encoding='utf-8') as file:
        for item in structured_responses:
            json.dump(item, file, ensure_ascii=False)
            file.write('\n')



KeyboardInterrupt: 

In [42]:
# Convert JSON Lines to standard JSON array format
# For eval script use this format
#
# The records are parsed and re-serialised rather than string-joined, so that
# blank lines are ignored, malformed lines raise immediately, and running the
# cell a second time on an already-converted file leaves it a valid array.
with open(output_file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

try:
    parsed = json.loads(raw_text)
except json.JSONDecodeError:
    # Multi-line JSON Lines content is not a single JSON document.
    parsed = None

if isinstance(parsed, list):
    # The file already holds a JSON array (the cell was run before).
    records = parsed
else:
    # Split on '\n' only: str.splitlines() would also break on Unicode line
    # separators that may appear unescaped inside the JSON strings.
    records = [json.loads(line) for line in raw_text.split('\n') if line.strip()]

with open('apiResponse/all_responses_200_sample.json', 'w', encoding='utf-8') as file:
    json.dump(records, file, ensure_ascii=False, indent=4)


In [46]:
# Read the stored API responses so they can be reshaped into a more
# human-readable JSON structure.
with open('apiResponse/all_responses.json', mode='r', encoding='utf-8') as responses_file:
    data = json.load(responses_file)

def format_api_response(data):
    """Turn each record's raw ``api_response`` text into a dictionary.

    Every non-blank line of the text is handled as follows:

    * ``"Key: value"`` is split on the first colon; key and value are stripped.
      A value containing commas becomes a list of stripped parts, any other
      value stays a string.
    * A line without a colon becomes a key (stripped) mapped to an empty list.

    Blank or whitespace-only lines are skipped so they do not produce empty
    keys in the result.

    Args:
        data: Iterable of dicts, each with an ``"api_response"`` string.

    Returns:
        A new list of shallow copies of the input records whose
        ``"api_response"`` is the parsed dictionary. The input is not modified.

    Raises:
        KeyError: If a record has no ``"api_response"`` entry.
    """
    formatted_data = []
    for item in data:
        new_item = item.copy()
        formatted_response = {}
        for line in item["api_response"].split("\n"):
            if not line.strip():
                # Blank separator lines carry no information.
                continue
            if ":" in line:
                # Split on the first colon only, so values may contain colons.
                key, value = line.split(":", 1)
                key = key.strip()
                value = value.strip()
                if "," in value:
                    formatted_response[key] = [v.strip() for v in value.split(",")]
                else:
                    formatted_response[key] = value
            else:
                # Strip for consistency with the keys of colon-separated lines.
                formatted_response[line.strip()] = []
        new_item["api_response"] = formatted_response
        formatted_data.append(new_item)
    return formatted_data

# Parse every stored response, then serialise the result as indented JSON
# that keeps non-ASCII characters readable.
formatted_data = format_api_response(data)
formatted_json = json.dumps(
    formatted_data,
    indent=4,
    ensure_ascii=False,
)

# Persist the formatted JSON text next to the other API outputs.
with open('apiResponse/formatted_responses.json', mode='w', encoding='utf-8') as formatted_file:
    formatted_file.write(formatted_json)


In [None]:
##TESTS

In [32]:
## index testing

import json

def test_ascending_indexes_and_batches(file_path):
    # Load data from JSON file
    with open(file_path, 'r') as f:
        data = json.load(f)

    expected_batch = 0
    expected_index = 0  # expecting the first index to be 0
    persons_in_current_batch = 0  # counter for number of persons in the current batch
    
    discrepancies = []  # List to hold messages about discrepancies found

    for item in data:
        current_batch = item['batch_number']
        current_index = item['person_index']

        # Check if current batch and index are as expected
        if current_batch != expected_batch or current_index != expected_index:
            discrepancies.append(f"Unexpected batch/index found: ({current_batch}, {current_index}), expected: ({expected_batch}, {expected_index})")
        
        # Increment expected index and persons in batch counters
        expected_index += 1
        persons_in_current_batch += 1
        
        # If three persons in the current batch have been processed,
        # increment expected batch number and reset persons counter
        if persons_in_current_batch == 3:
            expected_batch += 1
            persons_in_current_batch = 0
    
    # If discrepancies were found, return them, otherwise return a success message
    if discrepancies:
        return False, discrepancies
    else:
        return True, "Indexes and batches are in order"

# Run the ordering check against the sample file and report the outcome.
file_path = 'apiResponse/wrong_index_test.json'
is_valid, message = test_ascending_indexes_and_batches(file_path)

if is_valid:
    # On success `message` is a single confirmation string.
    print(message)
else:
    # On failure `message` is a list with one entry per discrepancy.
    print("Discrepancies found:")
    for discrepancy in message:
        print(discrepancy)


Discrepancies found:
Unexpected batch/index found: (4, 1), expected: (0, 1)
Unexpected batch/index found: (2, 5), expected: (2, 7)
Unexpected batch/index found: (5, 8), expected: (2, 8)
Unexpected batch/index found: (3, 153), expected: (3, 10)
Unexpected batch/index found: (4, 23), expected: (4, 13)


In [38]:
##Right format test
import json

# Field labels that every 'api_response' is expected to contain
keywords = [
    "PersonID",
    "PersonName",
    "PersonHobbies",
    "PersonSocialOrgs",
    "SpouseID",
    "SpouseName",
    "SpouseHobbies",
    "SpouseSocialOrgs"
]

# Read data from JSON file. The encoding is explicit so that decoding does not
# depend on the platform locale; the notebook writes its JSON files as UTF-8.
with open('apiResponse/wrong_elements_test.json', "r", encoding="utf-8") as file:
    persons = json.load(file)

# Check each person's 'api_response' for keywords
for person in persons:
    api_response = person["api_response"]

    # `in` is a containment test: a keyword counts as present if it occurs
    # anywhere in the response.
    missing_keywords = [keyword for keyword in keywords if keyword not in api_response]

    # Report error if any keyword is missing
    if missing_keywords:
        print(f"Error in index {person['person_index']}: Missing keywords: {', '.join(missing_keywords)}")


Error in index 2: Missing keywords: PersonName
Error in index 5: Missing keywords: PersonSocialOrgs
Error in index 9: Missing keywords: PersonID
Error in index 12: Missing keywords: PersonSocialOrgs
Error in index 13: Missing keywords: PersonID
