# In [None]:
import json
import random
import re
import concurrent.futures
from rich.progress import Progress

import evaluations

from vllm_gen import get_completion
from vllm_gen import get_completion_text
from vllm_gen import token_count

# Grammar
#
# Grammar-style description (rules written as `name ::= ...`) of the expected
# model output: a first free-text question line, followed by labelled lines
# grouped under the Instructions / Prompts / Requests headings, and then two
# further categories ("Detailed" and "Not directly related") that each repeat
# the full `questionsubgroup` layout.
#
# NOTE(review): `grammar` is not referenced anywhere else in the visible code;
# the generation call passes the pattern from construct_regex() instead.
# Unlike that pattern, this grammar has no trailing "Final comments:" part.
# NOTE(review): this is a regular (non-raw) triple-quoted string, so Python
# interprets the escapes (\n, \", \x0b, \u2028, ...) before any grammar parser
# would see them -- confirm that is intended if the grammar is ever used.
grammar = """start ::= question "Closed-ended question:" question "Semi-Structured question:" question "Leading question:" question "\n\nInstructions (Imperatives)\n\nShort instruction:" question "Scenario-based instruction:" question "Problem-based instruction:" question "\n\nPrompts\n\nShort prompt:" question "Scenario-based prompt:" question "Problem-based prompt:" question "\n\nRequests (Modal Constructions)\n\nFormal request:" question "Informal request:" question "Polite request:" question "Direct request:" question "\n\nCategory: Detailed\n\n\n" questionsubgroup "\n\nCategory: Not directly related\n\n\n" questionsubgroup

question ::= " " line "\n"

questionsubgroup ::= "Questions (Interrogatives)\n\nOpen-ended question:" question "Closed-ended question:" question "Semi-Structured question:" question "Leading question:" question "\n\nInstructions (Imperatives)\n\nShort instruction:" question "Scenario-based instruction:" question "Problem-based instruction:" question "\n\nPrompts\n\nShort prompt:" question "Scenario-based prompt:" question "Problem-based prompt:" question "\n\nRequests (Modal Constructions)\n\nFormal request:" question "Informal request:" question "Polite request:" question "Direct request:" question

line ::= [^\r\n\x0b\x0c\x85\u2028\u2029|\"0-9][^\r\n\x0b\x0c\x85\u2028\u2029|]+
"""

def construct_regex():
    """Build the regular expression describing the expected model output.

    The pattern covers three categories of labelled lines. The first category
    starts directly with the answer to the first question: its heading and
    "Open-ended question:" label are not part of the pattern (presumably they
    end the prompt -- verify against the prompt template). The "Detailed" and
    "Not directly related" categories each contain the full heading plus all
    fourteen labelled lines. The pattern ends with "\\n\\nFinal comments:".

    Parentheses in the section headings are escaped so that they are matched
    literally instead of being interpreted as regex groups.

    Returns:
        str: the regex source, later passed as the ``regex`` argument of the
        completion call in this script.
    """
    # A single answer: must not start with a line break, '|', '"' or a digit,
    # and must not contain line-break characters or '|' anywhere.
    line = "[^\r\n\x0b\x0c\x85\u2028\u2029|\"0-9][^\r\n\x0b\x0c\x85\u2028\u2029|]+"

    # An answer as it appears after a label: one space, the text, a newline.
    question = " " + line + "\n"

    def labelled(labels):
        # Each label is immediately followed by its answer line.
        return "".join(label + question for label in labels)

    # Everything that follows the answer to "Open-ended question:" within a
    # category. Shared by all three categories.
    remainder = (
        labelled([
            "Closed-ended question:",
            "Semi-Structured question:",
            "Leading question:",
        ])
        + "\n\nInstructions \\(Imperatives\\)\n\n"
        + labelled([
            "Short instruction:",
            "Scenario-based instruction:",
            "Problem-based instruction:",
        ])
        + "\n\nPrompts\n\n"
        + labelled([
            "Short prompt:",
            "Scenario-based prompt:",
            "Problem-based prompt:",
        ])
        + "\n\nRequests \\(Modal Constructions\\)\n\n"
        + labelled([
            "Formal request:",
            "Informal request:",
            "Polite request:",
            "Direct request:",
        ])
    )

    # A complete category body, including its opening heading and first label.
    questionsubgroup = (
        "Questions \\(Interrogatives\\)\n\nOpen-ended question:"
        + question
        + remainder
    )

    # First category (heading and first label omitted), then the two
    # explicitly named categories, then the terminating marker.
    return (
        question
        + remainder
        + "\n\nCategory: Detailed\n\n\n"
        + questionsubgroup
        + "\n\nCategory: Not directly related\n\n\n"
        + questionsubgroup
        + "\n\nFinal comments:"
    )

# Build the pattern that constrains the format of the model output
regex = construct_regex()

# Load the prompt template
prompt_file = "./generation_prompts/q-gen-llama3.txt"

with open(prompt_file, "r") as f:
    prompt = f.read()

# Load the pool of seed lines
question_seed_file = "./prompt_resources/question_seed.txt"

with open(question_seed_file, "r") as f:
    question_seed = f.read()

# Separate the seed into lines. Surrounding whitespace (including a stray
# "\r" from CRLF files) is stripped so the "?" check below is reliable, and
# blank lines are dropped so an empty entry can never take one of the sample
# slots (a file ending in a newline would otherwise contribute one).
question_seed_lines = [
    stripped
    for stripped in (raw.strip() for raw in question_seed.split("\n"))
    if stripped
]

# Separate the lines ending with a question mark from the lines that do not
question_seed_question_lines = [
    entry for entry in question_seed_lines if entry.endswith("?")
]
question_seed_non_question_lines = [
    entry for entry in question_seed_lines if not entry.endswith("?")
]

# Shuffle both groups independently so the sample differs between runs
random.shuffle(question_seed_question_lines)
random.shuffle(question_seed_non_question_lines)

# Take (at most) the first 10 lines from each group
question_seed_question_lines = question_seed_question_lines[:10]
question_seed_non_question_lines = question_seed_non_question_lines[:10]

# Combine the lines: questions first, then the non-question lines
question_seed_lines = question_seed_question_lines + question_seed_non_question_lines

# Join the lines back into a single block of text
question_seed = "\n".join(question_seed_lines)

# Fill in the template placeholders.
#
# str.replace is used instead of re.sub on purpose: re.sub treats its
# replacement argument as a template, so any backslash in the inserted text
# (e.g. "\1", "\g<...>", "\d") would be rewritten or raise re.error. The
# placeholders are fixed literal strings, so no regex is needed.

# Replace {{QUESTION_SEED}} with the sampled seed lines
prompt = prompt.replace("{{QUESTION_SEED}}", question_seed)

summary = "Aimed at BMW 3 and 5 Series owners, this comprehensive guide covers routine maintenance and repair from 1983 to 1991, including oil changes, fluid checks, and troubleshooting electrical problems. It also provides instructions for removing and refitting various parts, suspension and steering system components, and engine overhaul procedures. Additionally, it covers mandatory vehicle inspection checks for MOT, essential tools, common vehicle issues, and fault finding guides for brake and suspension issues. The guide also includes a glossary of technical terms and safety guidelines. Models covered include 316, 316i, 318i, 320i, 325i, Touring and Convertible, 518, 518i, 525i, 528i, 535i, and M535i."

# Replace {{DOCUMENT_SUMMARY}} with the summary
prompt = prompt.replace("{{DOCUMENT_SUMMARY}}", summary)

text_file = "./q-gen-test-doc.txt"

with open(text_file, "r") as f:
    document = f.read()

# Replace {{DOCUMENT}} with the document text
prompt = prompt.replace("{{DOCUMENT}}", document)

# Save the fully assembled prompt to a file.
# Note: this rebinds prompt_file from the template path to the output path.
prompt_file = "./prompt_resources/q-gen-prompt.txt"

with open(prompt_file, "w") as f:
    f.write(prompt)

# Token counting for the assembled prompt is currently disabled:
# prompt_tokens = token_count(prompt)
# print(f"Prompt tokens: {prompt_tokens}")

# Request a completion for the prompt, passing the format regex along with
# the sampling settings, then echo the raw text.
output = get_completion_text(
    prompt,
    regex=regex,
    max_tokens=2000,
    temperature=1.0,
    min_p=0.1,
)

print(output)

# Split into lines
lines = output.split("\n")

# Collect the generated questions: the first output line (which carries no
# label, because the pattern from construct_regex() starts directly with an
# answer line) plus every line that begins with one of the known labels.
questions = []

# Add the first line, stripped of leading and trailing whitespace.
# An empty first line (e.g. empty output) is not a question, so it is skipped.
first_line = lines[0].strip()
if first_line:
    questions.append(first_line)

# List of prefixes to look for
prefixes = ["Open-ended question:", "Closed-ended question:", "Semi-Structured question:", "Leading question:", "Short instruction:", "Scenario-based instruction:", "Problem-based instruction:", "Short prompt:", "Scenario-based prompt:", "Problem-based prompt:", "Formal request:", "Informal request:", "Polite request:", "Direct request:"]

for line in lines:
    for prefix in prefixes:
        if line.startswith(prefix):
            # Remove only the matched label. Splitting on ":" and taking
            # element 1 would cut the question at any colon it contains.
            questions.append(line[len(prefix):].strip())
            break
# Each question is checked with evaluations.pass_test_comprehensive(question),
# whose result is unpacked into two values; they are bundled with the question
# so the whole record can be serialised to JSON.
def process_question(question):
    """Evaluate one question and return the result as a dict.

    Args:
        question: the question text to evaluate.

    Returns:
        dict with the keys "question", "passed" and "evaluations_performed",
        where the last two are the values returned (in that order) by
        evaluations.pass_test_comprehensive(question).
    """
    outcome = evaluations.pass_test_comprehensive(question)
    passed, evaluations_performed = outcome
    return dict(
        question=question,
        passed=passed,
        evaluations_performed=evaluations_performed,
    )

questions_evaluations = []

# Evaluate the questions on a thread pool (up to 32 workers) while showing a
# progress bar. executor.map yields results in input order, so the collected
# evaluations line up with `questions`.
with Progress() as progress, concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    # One progress-bar task covering every question
    task = progress.add_task("[green]Processing questions...", total=len(questions))

    for evaluated in executor.map(process_question, questions):
        questions_evaluations.append(evaluated)
        # Advance the bar by one step per collected result
        progress.update(task, advance=1)


print(json.dumps(questions_evaluations, indent=4))