

*   Load input data.
*   Preprocess data and save filtered passages.
*   Generate questions for filtered passages.
*   Convert the line-delimited question output into a valid JSON array.
*   Send a notification upon completion.


In [None]:
import json
import os
import re
import time
import uuid
from openai import AzureOpenAI
import smtplib
from email.message import EmailMessage

# Publish the Azure OpenAI connection settings as environment variables.
# The angle-bracket values are placeholders; substitute real credentials
# before running, and avoid committing them to version control.
os.environ.update({
    "OPENAI_API_TYPE": "azure",
    "AZURE_OPENAI_ENDPOINT": "<your_azure_endpoint>",
    "AZURE_OPENAI_KEY": "<your_azure_api_key>",
    "OPENAI_API_VERSION": "2024-02-15-preview",
})

# Shared client used by generate_question(); it is built from the settings
# that were just written to the environment.
client = AzureOpenAI(
    api_version=os.environ.get("OPENAI_API_VERSION"),
    api_key=os.environ.get("AZURE_OPENAI_KEY"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
)

print("Azure OpenAI client initialized successfully.")

def is_obligation(text):
    """Return True when *text* passes the obligation filter.

    A passage is rejected (and the reason printed) when it has fewer than
    seven whitespace-separated words, or when it contains any of the
    phrases listed below. Everything else is accepted.
    """
    excluded_phrases = (
        "Where a Chapter, Part or Section of these Rules applies to",
        "These Rules apply to every",
        "For the purposes of these Rules",
        "This chapter applies to",
        "These Rules do not apply to",
        "The Rules in this Rulebook",
    )

    word_count = len(text.split())
    if word_count < 7:
        print(f"Text rejected for being too short: {text}")
        return False

    for candidate in excluded_phrases:
        if candidate in text:
            print(f"Text rejected for containing non-obligation phrase: {text}")
            return False

    return True

def preprocess_data_and_save(data, output_file_path):
    """Filter *data* down to obligation passages and save them as JSON.

    Args:
        data: Iterable of dict items; each item's 'Passage' value is the
            text checked by ``is_obligation``. Items whose 'Passage' is
            missing, ``None`` or empty are treated as empty text and are
            therefore rejected by the filter.
        output_file_path: Destination path of the JSON file. Its parent
            directory is created when it does not exist yet.

    The kept items are written as a single JSON array (indent=4), and a
    summary line with the item count and elapsed time is printed.
    """
    start_time = time.time()
    filtered_data = [
        item for item in data
        # `or ''` guards against an explicit null passage, which would
        # otherwise raise AttributeError on .strip().
        if is_obligation((item.get('Passage') or '').strip())
    ]

    # Opening the file fails with FileNotFoundError if the folder is missing.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(filtered_data, file, indent=4)
    print(f"Filtered obligations saved to {output_file_path}. Total items: {len(filtered_data)}. Time taken: {time.time() - start_time:.2f} seconds")

def generate_question(context, max_attempts=2, retry_delay=20):
    """Ask the chat model for questions about *context* and return them.

    Args:
        context: Passage text sent to the model as the user message.
        max_attempts: How many times the request is tried before giving
            up. Previously the function slept after an error but never
            retried, so the passage was silently dropped.
        retry_delay: Seconds to sleep after each failed attempt.

    Returns:
        A list of question strings. Only response lines ending in '?' are
        kept, with a leading "N. " list number removed. The list is empty
        when the model returns no such lines or every attempt fails.

    Errors are reported with ``print`` and never propagate to the caller.
    """
    messages = [
        {"role": "system", "content": "Your task is to generate realistic and applied questions that pertain to the provided regulatory or compliance material. Ensure that the context implicitly contains the answer to the question."},
        {"role": "user", "content": context}
    ]

    for _ in range(max_attempts):
        try:
            chat_completion = client.chat.completions.create(
                messages=messages,
                # NOTE(review): with Azure this is presumably the deployment
                # name rather than a model family name - confirm.
                model="gpt-4-turbo-1106",
                temperature=0.7,
                max_tokens=800,
            )
            # The message content can be None; treat that as an empty reply
            # instead of an error that triggers a pointless sleep.
            content = chat_completion.choices[0].message.content or ""
            questions = []
            for raw_line in content.strip().split('\n'):
                # Strip first so an indented "1. ..." line still loses its
                # list number.
                line = raw_line.strip()
                if line.endswith('?'):
                    questions.append(re.sub(r'^\d+\.\s+', '', line).strip())
            return questions
        except Exception as e:
            # Best-effort boundary: report, back off, then retry if allowed.
            print(f"An error occurred: {e}")
            time.sleep(retry_delay)
    return []

def process_data_and_generate_questions(input_file_path, output_file_path):
    """Generate questions for every passage and append them to a file.

    Args:
        input_file_path: JSON file holding a list of dict items, each with
            a 'Passage' entry.
        output_file_path: File that receives one JSON object per line. Each
            object is a copy of the source item plus a fresh 'QuestionID'
            (UUID4 string) and the generated 'Question'. Records are
            appended, so existing content is preserved.

    The output file and its parent directory are always created, even when
    no question is produced, so later steps can rely on the file existing.
    Items without passage text are skipped.
    """
    start_time = time.time()
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Processing {len(data)} items...")
    # Open once instead of once per item; flushing after each item keeps
    # the incremental-save property if the run is interrupted.
    with open(output_file_path, 'a', encoding='utf-8') as file:
        for index, item in enumerate(data):
            print(f"Processing item {index + 1}/{len(data)}...")
            passage = (item.get('Passage') or '').strip()
            if not passage:
                print(f"Skipping item {index + 1}: no passage text.")
                continue
            for question in generate_question(passage):
                item_with_question = item.copy()
                item_with_question['QuestionID'] = str(uuid.uuid4())
                item_with_question['Question'] = question
                json.dump(item_with_question, file)
                file.write("\n")
            file.flush()
    print(f"Questions saved to {output_file_path}. Total time: {time.time() - start_time:.2f} seconds")

def correct_json_file(file_path):
    """Rewrite a file of concatenated JSON values as one JSON array, in place.

    Args:
        file_path: Path of the UTF-8 file to normalise.

    Behaviour:
        * A file that already holds a single JSON array is left untouched.
        * Otherwise the content is read as a sequence of JSON values
          separated by optional whitespace (for example one object per
          line) and saved as an array with indent=4.
        * A file with exactly one object becomes a one-element array; it
          used to be left as a bare object because it is valid JSON.
        * An empty or whitespace-only file becomes ``[]``.
        * Top-level arrays inside the sequence are flattened into the
          result, so records appended after an earlier conversion are
          merged with the existing array.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        json.JSONDecodeError: If the content is not a sequence of valid
            JSON values; the file is not modified in that case.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_text = file.read()

    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, list):
        return  # Already a JSON array; nothing to correct.

    decoder = json.JSONDecoder()
    records = []
    position = 0
    end = len(raw_text)
    while True:
        # raw_decode() does not skip leading whitespace itself.
        while position < end and raw_text[position].isspace():
            position += 1
        if position >= end:
            break
        value, position = decoder.raw_decode(raw_text, position)
        if isinstance(value, list):
            records.extend(value)
        else:
            records.append(value)

    corrected_json = json.dumps(records, indent=4)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(corrected_json)
    print(f"Corrected and saved file: {file_path}")
def send_email(subject, message):
    """Send a plain-text notification email over SMTP with STARTTLS.

    Args:
        subject: Text for the Subject header.
        message: Plain-text body of the email.

    The sender, recipient, server and credentials are placeholders that
    must be replaced before use. Any failure while connecting or sending
    is printed instead of being raised.
    """
    from_address = "Your Name <your_email@example.com>"
    to_address = "Recipient Name <recipient_email@example.com>"

    # Body first, then the headers.
    notification = EmailMessage()
    notification.set_content(message)
    for header, value in (
        ('Subject', subject),
        ('From', from_address),
        ('To', to_address),
    ):
        notification[header] = value

    try:
        with smtplib.SMTP("<your_smtp_server>", 587) as smtp:
            smtp.starttls()
            smtp.login("<your_smtp_user>", "<your_smtp_password>")
            smtp.send_message(notification)
        print("Email sent successfully!")
    except Exception as error:
        print(f"Failed to send email: {error}")

def main():
    """Run the whole pipeline for each configured document.

    For every name: load the input JSON, save the filtered passages,
    generate questions from the filtered file, normalise the question
    file, then send a completion email.
    """
    document_names = ["Document1", "Document2"]  # Replace with actual filenames
    for document in document_names:
        source_path = f"InputFolder/{document}.json"
        filtered_path = f"FilteredFolder/{document}_filtered.json"
        questions_path = f"OutputFolder/{document}_questions.json"

        with open(source_path, 'r') as source:
            passages = json.load(source)

        preprocess_data_and_save(passages, filtered_path)
        process_data_and_generate_questions(filtered_path, questions_path)
        correct_json_file(questions_path)
        send_email(
            "Processing Complete",
            f"File {document} has been processed successfully.",
        )

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
