In [153]:
#Imports and API Key and constants
import openai
import json
import time
import os
import logging

logging.basicConfig(level=logging.INFO)
from dotenv import load_dotenv

# Load API Key from .env file
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# Configure OpenAI API client
openai.api_key = api_key


# Constants
max_retries = 5
batch_size = 3
SKIP_FIRST_PERSON = False
output_file_path = 'apiResponse/all_responses_200_sample.json'

# Load data and run the main function
with open('data/sample_siirtokarjalaiset.json', 'r') as f:
    data = json.load(f)




In [154]:
prompt_prefix = """I need you to scrape data from this text. 
            These are interview's from Karelian people written in finnish. 
            I will give you a batch of three stories.
            List me names, IDs, hobbies and social organisations.
            Notice to list spouse's hobbies and social orgs separately. 
            Keep in mind that husband is usually listed first in the story. Even when he is not primary person. 
            Do not list jobs, or war time occupations.
            Do not suggest to make an algorithm. Do not list the sourcetext.
            Do not answer anything but the asked information. 
            Always response in the following format for each story: 

            --
            
            PersonName: 
            PersonID:
            PersonHobbies: 
            PersonSocialOrgs: 
        
            SpouseName:
            SpouseID:
            SpouseHobbies:
            SpouseSocialOrgs:
            """


In [155]:
SYSTEM_MESSAGE = {"role": "system", "content": "You are a helpful assistant."}

def create_user_message(data_str):
    return {"role": "user", "content": prompt_prefix + data_str}

In [156]:
def make_api_call(batch_num, messages):
    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
                temperature=0.8,
                timeout=30
            )

            raw_response_path = f'apiResponse/raw_api_responses/raw_response_{batch_num}.json'
            with open(raw_response_path, 'w', encoding='utf-8') as raw_file:
                json.dump(response, raw_file, ensure_ascii=False, indent=4)

            return response

        except openai.error.OpenAIError as e:
            logging.error(f"OpenAI API error in batch {batch_num}: {str(e)}")
            if "Rate limit exceeded" in str(e):
                logging.info("Rate limit exceeded, waiting for 60 seconds...")
                time.sleep(60)
            else:
                logging.info("Encountered an error, waiting for 10 seconds...")
                time.sleep(10)
            time.sleep(2)
    return None

In [157]:
#REGEX and processing functions
import re

def extract_person_blocks(response_content):
    person_pattern = r'PersonName'
    person_starts = [match.start() for match in re.finditer(person_pattern, response_content)]
    person_blocks = []
    for i, start in enumerate(person_starts):
        end = person_starts[i + 1] if i + 1 < len(person_starts) else None
        person_blocks.append(response_content[start:end].strip())
    return person_blocks

def extract_person_data(person_block, expected_keys):
    extracted_data = {}
    for key in expected_keys:
        pattern = key + r': ([^\n]*)'
        match = re.search(pattern, person_block)
        if match:
            extracted_data[key] = match.group(1).strip()
        else:
            extracted_data[key] = ""  # Assigning an empty string for missing data
    return extracted_data



In [158]:
#Formatting structured response from api data
def prepare_structured_response(batch_num, start_index, person_blocks, expected_keys, output_file_path):
    if SKIP_FIRST_PERSON:
        person_blocks = person_blocks[1:]


    all_responses = []

    for j, person_block in enumerate(person_blocks, start=1):
        person_data = extract_person_data(person_block, expected_keys)

        structured_data = []
        for key, value in person_data.items():
            structured_data.append(f"{key}: {value}")
        api_response_string = "\n".join(structured_data)

        structured_response = {
            "batch_number": batch_num,
            "person_index": start_index + j,
            "api_response": api_response_string
        }

        with open(output_file_path, 'a', encoding='utf-8') as file:
            json.dump(structured_response, file, ensure_ascii=False)
            file.write('\n')

        all_responses.append(structured_response)

    return all_responses


In [159]:
#prepare batches
def prepare_batch_data(batch_num, data, batch_size):
    i = (batch_num - 1) * batch_size  # Start index for this batch

    if len(data) - i < batch_size:
        batch = data[i:]
    else:
        batch = data[i:i + batch_size]

    batch_content = []
    batch_indexes = []

    for person in batch:
        if isinstance(person, dict) and "index" in person:
            batch_content.append('Person : \n' + json.dumps(person))
            batch_indexes.append(str(person["index"]))
        else:
            batch_content.append('Person data missing or malformed : \n' + json.dumps(person))

    return '\n'.join(batch_content), batch_indexes


In [160]:
#MAIN API ALGORITHM

# Ensure directories exist
if not os.path.exists('apiResponse'):
    os.makedirs('apiResponse')
if not os.path.exists('apiRequests'):
    os.makedirs('apiRequests')
if not os.path.exists('apiResponse/raw_api_responses'):
    os.makedirs('apiResponse/raw_api_responses')


def run_batches(batch_numbers, data, batch_size, output_file_path):
    print(f"Entering run_batches with batches: {batch_numbers}")
    all_responses = []

    for batch_num in batch_numbers:
        data_str, batch_indexes = prepare_batch_data(batch_num, data, batch_size)

        messages = [
            SYSTEM_MESSAGE,
            create_user_message(data_str)
        ]

        log_message = f"Making API call for batch number {batch_num} with indexes: {', '.join(batch_indexes)}\nData:\n{data_str}\n"
        with open('apiRequests/api_requests_log.txt', 'a', encoding='utf-8') as log_file:
            log_file.write(log_message)

        response = make_api_call(batch_num, messages)
        if response:
            person_blocks = extract_person_blocks(response['choices'][0]['message']['content'])  
            expected_keys = ["PersonID", "PersonName", "PersonHobbies", "PersonSocialOrgs",
                             "SpouseID", "SpouseName", "SpouseHobbies", "SpouseSocialOrgs"]
            batch_responses = prepare_structured_response(batch_num, (batch_num - 1) * batch_size, person_blocks, expected_keys, output_file_path)
            all_responses.extend(batch_responses)

        time.sleep(0.5)

    return all_responses


def run_all_batches(data, batch_size):
    total_batches = len(data) // batch_size + (1 if len(data) % batch_size else 0)
    return run_batches(range(1, total_batches + 1), data, batch_size, output_file_path)


In [161]:
#!!Running the api algorithm !!

responses = run_all_batches(data, batch_size)

Entering run_batches with batches: range(1, 68)


KeyboardInterrupt: 

In [112]:
#API CALL OLD VERSION

with open('data/sample_siirtokarjalaiset.json', 'r') as f:
    data = json.load(f)
    
output_file_path = 'apiResponse/all_responses_200_sample.json'

# Ensure the apiResponse directory exists
if not os.path.exists('apiResponse'): 
    os.makedirs('apiResponse')

max_retries = 5
batch_size = 3

def run_batches(batch_numbers, data, batch_size, output_file_path):
    print(f"Entering run_batches with batches: {batch_numbers}")
    all_responses = []
    

    for batch_num in batch_numbers:
        i = (batch_num - 1) * batch_size  # Start index for this batch
        
        # Check if this is the last batch and the remaining data is less than the batch size
        if len(data) - i < batch_size:
            batch = data[i:]
        else:
            batch = data[i:i + batch_size]

        batch_content = []
        batch_indexes = []

        for person in batch:
            if isinstance(person, dict) and "index" in person:
                batch_content.append('Person : \n' + json.dumps(person))
                batch_indexes.append(str(person["index"]))
            else:
                # Handle or log the inconsistency here
                batch_content.append('Person data missing or malformed : \n' + json.dumps(person))

        data_str = '\n'.join(batch_content)

        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt_prefix + data_str}
        ]

        # Create a message string to be logged       
        log_message = f"Making API call for batch number {batch_num} with indexes: {', '.join(batch_indexes)}\nData:\n{data_str}\n"


        # Ensure the 'apiRequests' directory exists
        if not os.path.exists('apiRequests'):
            os.makedirs('apiRequests')

        # Save the message to 'apiRequests/api_requests_log.txt'
        with open('apiRequests/api_requests_log.txt', 'a', encoding='utf-8') as log_file:
            log_file.write(log_message)

        # API call
        for attempt in range(max_retries):
            print(attempt)
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=messages,
                    temperature=0.8,
                    timeout=30  # timeout in seconds
                )

                # Ensure the 'apiResponse/raw_api_responses' directory exists
                if not os.path.exists('apiResponse/raw_api_responses'):
                    os.makedirs('apiResponse/raw_api_responses')

                # Save raw response
                raw_response_path = f'apiResponse/raw_api_responses/raw_response_{batch_num}.json'
                with open(raw_response_path, 'w', encoding='utf-8') as raw_file:
                    json.dump(response, raw_file, ensure_ascii=False, indent=4)

                # Extracting blocks of persons from the API response
                person_blocks = extract_person_blocks(response['choices'][0]['message']['content'])
                
                # Define the keys we want to extract
                expected_keys = ["PersonID", "PersonName", "PersonHobbies", "PersonSocialOrgs", 
                                "SpouseID", "SpouseName", "SpouseHobbies", "SpouseSocialOrgs"]

                # Extracting data for each block
                for j, person_block in enumerate(person_blocks, start=1):  #For skipping the first person block insert person_blocks[1:]
                    person_data = extract_person_data(person_block, expected_keys)
                    
                    # Create the string representation for api_response
                    structured_data = []
                    for key, value in person_data.items():
                        structured_data.append(f"{key}: {value}")
                    api_response_string = "\n".join(structured_data)
                    
                    structured_response = {
                        "batch_number": batch_num,
                        "person_index": i + j,
                        "api_response": api_response_string  # Modified to be a single string
                    }
                    all_responses.append(structured_response)
                    
                    # Save structured response immediately

                    with open(output_file_path, 'a', encoding='utf-8') as file:
                        json.dump(structured_response, file, ensure_ascii=False)
                        file.write('\n')

                break  # Exit retry loop upon successful API call


            except openai.error.OpenAIError as e:
                logging.error(f"OpenAI API error in batch {batch_num}: {str(e)}")
                if "Rate limit exceeded" in str(e):
                    logging.info("Rate limit exceeded, waiting for 60 seconds...")
                    time.sleep(60)  # Waiting for a longer time if rate limit is exceeded
                else:
                    logging.info("Encountered an error, waiting for 10 seconds...")
                    time.sleep(10)
            #sleep more between retried API calls
            time.sleep(2)
        # sleep between API calls
        time.sleep(0.5)
            
    return all_responses

# The main command to run the API for all batches
def run_all_batches(data, batch_size, output_file_path='apiResponse/all_responses_200_sample.json'):
    total_batches = len(data) // batch_size + (1 if len(data) % batch_size else 0)
    all_responses = run_batches(range(total_batches), data, batch_size, output_file_path)
    return all_responses

In [162]:
# Convert JSON Lines to standard JSON array format
#For eval script use this format 
with open(output_file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()
    
with open('apiResponse/all_responses_200_sample.json', 'w', encoding='utf-8') as file:
    file.write('[' + ','.join(lines) + ']')


In [9]:
#This is formatting the raw api response into more human readable json
with open('apiResponse/all_responses_200_sample.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def format_api_response(data):
    formatted_data = []
    for item in data:
        new_item = item.copy()
        response = item["api_response"].split("\n")
        formatted_response = {}
        for line in response:
            # Handle key and value being in the same part (no space after ":")
            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip()
                value = value.strip()
                if "," in value:
                    formatted_response[key] = [v.strip() for v in value.split(",")]
                else:
                    formatted_response[key] = value
            else:
                formatted_response[line] = []
        new_item["api_response"] = formatted_response
        formatted_data.append(new_item)
    return formatted_data

formatted_data = format_api_response(data)
formatted_json = json.dumps(formatted_data, ensure_ascii=False, indent=4)

# Example: writing formatted JSON back to a file
with open('apiResponse/formatted_responses.json', 'w', encoding='utf-8') as file:
    file.write(formatted_json)


In [163]:
#index and batch testing 

failed_batches = []

def test_batches_and_indexes(file_path, batch_size=3, expected_batches=68):
    global failed_batches

    with open(file_path, 'r') as f:
        data = json.load(f)

    errors = []

    # Initialize expected values
    expected_batch = 1
    expected_index = 1

    for item in data:
        current_batch = item['batch_number']
        current_index = item['person_index']

        if current_batch > expected_batch:
            errors.append(f"Expected batch {expected_batch}, found {current_batch}.")
            if expected_batch not in failed_batches:
                failed_batches.append(expected_batch)
            expected_batch = current_batch
            expected_index = current_index + 1

        elif current_index > expected_index:
            errors.append(f"Expected index {expected_index}, found {current_index} in batch {expected_batch}.")
            if expected_batch not in failed_batches:
                failed_batches.append(expected_batch)
            expected_index = current_index + 1

        else:
            expected_index += 1

        if (expected_index - 1) % batch_size == 0:
            expected_batch += 1
            expected_index = expected_batch * batch_size + 1

    # Check for missing batches
    while expected_batch <= expected_batches:
        if expected_batch not in failed_batches:
            failed_batches.append(expected_batch)
        expected_batch += 1

    for error in errors:
        print(error)

    return failed_batches

file_path = 'apiResponse/all_responses_200_sample.json'
failed_batches_from_index_test = test_batches_and_indexes(file_path, expected_batches=67)
print("Failed Batches from Index Test:", failed_batches_from_index_test)


Failed Batches from Index Test: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]


In [164]:
#Testing to find broken format
def test_format(file_path):
    failed_batches = []

    with open(file_path, 'r') as f:
        data = json.load(f)

    # List of keywords to check in 'api_response'
    keywords = [
        "PersonName", 
        "PersonHobbies", 
        "PersonSocialOrgs", 
        "SpouseName", 
        "SpouseHobbies", 
        "SpouseSocialOrgs"
    ]

    for person in data:
        api_response = person["api_response"]
        current_batch = person['batch_number']

        # Check if any keyword is in the response
        if not any(keyword in api_response for keyword in keywords):
            if current_batch not in failed_batches:
                failed_batches.append(current_batch)
                print(f"Error: None of the keywords found for index {person['person_index']} in batch {current_batch}")

    return failed_batches

file_path = "apiResponse/all_responses_200_sample.json"  # Replace with your actual file path
failed_batches_from_format_test = test_format(file_path)
print("Failed Batches from Format Test:", failed_batches_from_format_test)


Failed Batches from Format Test: []


In [165]:
#testing to find missing entries // Optional
def test_format2(file_path):
    global failed_batches

    with open(file_path, 'r') as f:
        data = json.load(f)

    errors = []

    # List of keywords to check in 'api_response'
    keywords = [
        "PersonName", 
        "PersonHobbies", 
        "PersonSocialOrgs", 
        "SpouseName", 
        "SpouseHobbies", 
        "SpouseSocialOrgs"
    ]

    for person in data:
        api_response = person["api_response"]
        missing_keywords = []
        current_batch = person['batch_number']

        for keyword in keywords:
            if keyword not in api_response:
                missing_keywords.append(keyword)

        if missing_keywords:
            errors.append(f"Error in index {person['person_index']} in batch {current_batch}: Missing keywords: {', '.join(missing_keywords)}")
            if current_batch not in failed_batches:
                failed_batches.append(current_batch)

    for error in errors:
        print(error)
    return failed_batches

failed_batches_from_format_test2 = test_format2(file_path)
print("Failed Batches from Format Test:", failed_batches_from_format_test2)

Failed Batches from Format Test: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]


In [166]:
# Rerun failed batches
print(failed_batches_from_format_test2)
print(failed_batches_from_format_test)
reprocessed_responses = run_batches(failed_batches_from_index_test, data, batch_size, 'apiResponse/reprocessed_responses.json')

# If you want to save these reprocessed responses to a file, do so here
with open('apiResponse/reprocessed_responses.json', 'w') as f:
    json.dump(reprocessed_responses, f, indent=4)

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]
[]
Entering run_batches with batches: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]


KeyboardInterrupt: 

In [168]:
#Merge the rerun batches from reprocessed_responses.json

def merge_responses(original_file, reprocessed_file):
    # Load original and reprocessed data
    with open(original_file, 'r') as f:
        original_responses = json.load(f)
        
    with open(reprocessed_file, 'r') as f:
        reprocessed_responses = json.load(f)
        
    # Convert the original responses to a dictionary for easier searching and replacement
    original_dict = {resp['person_index']: resp for resp in original_responses}
    
    for repro_resp in reprocessed_responses:
        # Replace or insert the reprocessed response
        original_dict[repro_resp['person_index']] = repro_resp
        
    # Convert the dictionary back to a list, and sort by person_index
    updated_responses = sorted(original_dict.values(), key=lambda x: x['person_index'])
    
    # Write the updated list back to the original file
    with open(original_file, 'w') as f:
        json.dump(updated_responses, f, indent=4)

# Run the function
merge_responses('apiResponse/all_responses_200_sample.json', 'apiResponse/reprocessed_responses.json')


In [None]:
#use incase the merge_responses is interrupted
# Convert JSON Lines to standard JSON array format
#For eval script use this format 
with open(output_file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()
    
with open('apiResponse/reprocessed_responses.json', 'w', encoding='utf-8') as file:
    file.write('[' + ','.join(lines) + ']')


In [173]:
failed_batches = []
test_batches_and_indexes(file_path, batch_size=3)
failed_batches_from_index_test = test_format(file_path)
failed_batches_from_format_test = test_format(file_path)
print("Failed Batches from Format Test:")
print("Failed Batches from index test:")
print(failed_batches)

Failed Batches from Format Test:
Failed Batches from index test
[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68]
