# Init

In [55]:
from openai import OpenAI
import os
from dotenv import load_dotenv
from utils import *
import re
import math

In [56]:
# Model identifier passed as `model` to every chat-completion request made
# through deepseek_generate_answer in the generation cells below.
model = 'deepseek-reasoner'

In [57]:
# Load settings via python-dotenv before reading DEEPSEEK_API_KEY below
# (presumably from a local .env file — confirm where the key is stored).
load_dotenv()
# OpenAI SDK client pointed at the DeepSeek base URL. os.getenv returns None
# when DEEPSEEK_API_KEY is not set, so a missing key is passed on as None here.
client = OpenAI(api_key=os.getenv('DEEPSEEK_API_KEY'), base_url="https://api.deepseek.com")

In [58]:
# models = client.models.list()
#
# for model in models:
#     print(model.id)

# Prompts

## Split text prompt

### Format

In [59]:
# Shape of the question object the model is asked to return. This is a
# pseudo-JSON sketch (unquoted keys, type names as values), not parseable JSON;
# it is substituted verbatim into the prompt templates via str.format.
split_text_format = '''{
    question: str,
    textPart: str,
    answers: [
        {
            answer: str,
            isCorrect: boolean,
        }
    ]
}'''

In [60]:
# Numbered list of question topics, inserted into gen_prompt_template_2 by
# get_gen_prompt. The same list is also written inline in split_text_prompt.
# NOTE(review): items 3 and 4 overlap (4 repeats 3 and adds thoughts/feelings).
topics = '''1. questions about the personalities of the characters
2. questions about the appearance or look of characters
3. what the characters did in certain situations
4. what the characters did in certain situations, what they thought or felt
5. the attitude of some characters to other characters
6. names of cities or locations where the events took place
7. what are the key characters'''

### Prompt

In [61]:
# Simple question-generation prompt with two positional `{}` placeholders,
# filled by get_split_text_prompt in this order:
#   1. the JSON answer format (split_text_format)
#   2. the book excerpt
# The topic list is written inline here rather than taken from `topics`.
split_text_prompt = '''Your task is to read a part of the book carefully and create only ONE question for the children who have read this part. 
Question should have 4 answer choices with 1 correct answer.

Here are some topics to create a question:
1. questions about the personalities of the characters
2. questions about the appearance or look of characters
3. what the characters did in certain situations
4. what the characters did in certain situations, what they thought or felt
5. the attitude of some characters to other characters
6. names of cities or locations where the events took place
7. what are the key characters

Try to make question and answer choices more similar to the text itself.
Avoid creating tricky questions.

The question must be in this json format:
```
{}
```

The textPart field of the question should contain a part of the text from which you took the question

Here is a part of a book:
{}

The question must be in the same language as the book
'''

In [62]:
# First version of the structured generation prompt. It has three positional
# `{}` placeholders, in this order:
#   1. the book excerpt
#   2. the topics list
#   3. the JSON answer format
# Not referenced by any function visible in this notebook: get_gen_prompt
# formats gen_prompt_template_2 instead.
gen_prompt_template = '''You are an experienced educational content creator specializing in reading comprehension exercises for children. Your task is to create a single multiple-choice question based on a given book excerpt.

Here's the book excerpt you'll be working with:

<book_excerpt>
{}
</book_excerpt>

Now, let's review the list of topics that may be relevant to the question you'll create:

<topics_list>
{}
</topics_list>

Your task is to carefully read this excerpt and create ONE question for children who have read this part. Follow these guidelines:

1. Create a question that is closely related to the content of the text.
2. Generate 4 answer choices, with only 1 correct answer.
3. Ensure that the question and answer choices are similar in language and style to the text itself.
4. Avoid creating tricky or intentionally misleading questions or answers.
5. The question must be in the same language as the book excerpt.

You must format your output as a JSON object. Here's the required format:

<json_format>
{}
</json_format>

Important: The "textPart" field of the question should contain the specific part of the text from which you derived the question.

Before providing your final answer, wrap your thought process inside question_development tags. Consider the following steps:

1. Write down 2-3 key quotes from the excerpt that could be potential question sources.
2. List out 3-4 potential topics from the provided topic list that relate to these quotes.
3. For each potential question:
   - Formulate a clear and concise question.
   - Write down the correct answer and three plausible but incorrect answers.
   - Evaluate the question based on clarity, relevance, and difficulty level.
4. Choose the best question based on your evaluation.
5. Select the relevant text part that corresponds to your chosen question.

After your thought process, provide the final JSON output with your question, answers, and relevant text part.'''

In [63]:
# Second version of the structured generation prompt; this is the template
# get_gen_prompt formats. It has three positional `{}` placeholders, in order:
#   1. the book excerpt
#   2. the topics list
#   3. the JSON answer format
# The model is told to reason inside <question_development> tags before the
# JSON object, so the response text is not pure JSON.
gen_prompt_template_2 = '''You are an experienced educational content creator specializing in reading comprehension exercises for children. Your task is to create a single, clear, and specific multiple-choice question based on a given book excerpt.

First, carefully read the following book excerpt:

<book_excerpt>
{}
</book_excerpt>

Now, review this list of topics that may be relevant to the question you'll create:

<topics_list>
{}
</topics_list>

Your goal is to create ONE question for children who have read this excerpt. Follow these guidelines:

1. The question must be closely related to the content of the text.
2. Generate 4 answer choices, with only 1 correct answer.
3. Use language and style similar to the text itself.
4. Avoid creating tricky or intentionally misleading questions or answers.
5. The question must be in the same language as the book excerpt.
6. Ensure the question is clear and specific, explicitly mentioning any relevant context from the excerpt.

Before creating your final question, wrap your reasoning process in <question_development> tags. Follow these steps:

1. Identify 2-3 key quotes from the excerpt that could be potential question sources. Write these quotes down verbatim.
2. List 3-4 relevant topics from the provided topic list that relate to these quotes.
3. For each potential question:
   - Formulate a clear and concise question, ensuring it includes specific context.
   - Write the correct answer and three plausible but incorrect answers.
   - Evaluate the question based on clarity, relevance, and difficulty level.
   - Consider the age-appropriateness of the question for children.
   - Assess how well the question aligns with the provided topics.
4. Choose the best question based on your evaluation.
5. Select the relevant text part that corresponds to your chosen question.
6. Explain why the chosen question is the best option, considering all factors evaluated.

After your question development process, provide the final output as a JSON object. Here's the required format:

<json_format>
{}
</json_format>

Remember, the "textPart" field should contain the specific part of the text from which you derived the question.'''

# Utils

## Get formatted split text prompt

In [64]:
def get_split_text_prompt(text_part: str):
    """Build the simple question prompt for one book excerpt.

    Fills the two positional placeholders of ``split_text_prompt``: the
    answer format (``split_text_format``) first, then ``text_part``.

    Args:
        text_part: The excerpt of the book the question should be about.

    Returns:
        The fully formatted prompt string.
    """
    filled_prompt = split_text_prompt.format(split_text_format, text_part)
    return filled_prompt

In [65]:
def get_gen_prompt(text_part: str):
    """Build the structured generation prompt for one book excerpt.

    Fills the three positional placeholders of ``gen_prompt_template_2`` in
    order: the excerpt, the ``topics`` list, and ``split_text_format``.

    Args:
        text_part: The excerpt of the book the question should be about.

    Returns:
        The fully formatted prompt string.
    """
    placeholder_values = (text_part, topics, split_text_format)
    return gen_prompt_template_2.format(*placeholder_values)

## Remove new lines

In [66]:
def remove_extra_newlines(text):
    """Collapse every run of consecutive newline characters into a single one.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each run of one or more ``\\n`` replaced by one ``\\n``.
    """
    newline_run = re.compile(r'\n+')
    return newline_run.sub('\n', text)

## Read PDF

In [67]:
def read_pdf(file_path: str):
    """Extract the text of every page of a PDF file.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``; the
    result of ``extract_text()`` for each page is concatenated in page order
    with no separator.

    This is best-effort: any exception (missing file, unreadable PDF, ...)
    is reported with ``print`` and ``None`` is returned instead of raising.

    NOTE(review): ``PyPDF2`` is not imported explicitly in this notebook;
    presumably it arrives through ``from utils import *`` — confirm.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The concatenated page text, or ``None`` if anything went wrong.
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            return "".join(page.extract_text() for page in pdf_reader.pages)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

## Read EPUB

In [68]:
def read_epub(file_path):
    """Extract the plain text of an EPUB book.

    Every item of the book whose type is ``ebooklib.ITEM_DOCUMENT`` has its
    body content parsed with BeautifulSoup's ``html.parser`` and reduced to
    text. The per-document texts are joined with a newline and runs of
    newlines are then collapsed by ``remove_extra_newlines``.

    NOTE(review): ``epub``, ``ebooklib`` and ``BeautifulSoup`` are not
    imported explicitly in this notebook; presumably they arrive through
    ``from utils import *`` — confirm.

    Args:
        file_path: Path to the EPUB file.

    Returns:
        The book text as a single string.
    """
    book = epub.read_epub(file_path)
    document_texts = [
        BeautifulSoup(item.get_body_content(), 'html.parser').get_text()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
    return remove_extra_newlines('\n'.join(document_texts))

## Split text

In [69]:
def split_text_into_parts(text: str, num_parts: int):
    """Split text into consecutive groups of paragraphs.

    The stripped text is split on newlines into paragraphs, and the
    paragraphs are grouped ``ceil(len(paragraphs) / num_parts)`` at a time.
    Because the group size is rounded up, the result can contain fewer than
    ``num_parts`` parts (never more). Empty text yields ``['']``.

    Args:
        text: The full text to split; paragraphs are newline-separated.
        num_parts: Desired (maximum) number of parts; must be at least 1.

    Returns:
        A list of parts, each with its paragraphs joined by a newline and
        surrounding whitespace stripped.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    # Fail early: zero would divide by zero below, and a negative value would
    # produce a negative group size and meaningless grouping.
    if num_parts < 1:
        raise ValueError(f"num_parts must be a positive integer, got {num_parts}")

    paragraphs = text.strip().split('\n')
    paragraphs_per_part = math.ceil(len(paragraphs) / num_parts)

    # Slice and join each group instead of growing a string paragraph by
    # paragraph.
    return [
        "\n".join(paragraphs[start:start + paragraphs_per_part]).strip()
        for start in range(0, len(paragraphs), paragraphs_per_part)
    ]

## Split text with overlap

In [70]:
def split_text_into_parts_with_overlap(text: str, num_parts: int):
    """Split text into sentence groups that overlap by one sentence.

    The stripped text is split into sentences at whitespace that follows a
    ``.`` or ``?`` (the look-behinds skip patterns such as ``e.g.`` and
    ``Mr.``). Sentences are grouped ``ceil(len(sentences) / num_parts)`` at a
    time, and each part is extended by one sentence on either side where one
    exists, so neighbouring parts share context.

    Parts that come out empty are dropped. This happens when ``num_parts``
    is larger than the number of sentence groups, or when the text is empty.
    The result can therefore contain fewer than ``num_parts`` parts.

    Args:
        text: The full text to split.
        num_parts: Desired (maximum) number of parts; must be at least 1.

    Returns:
        A list of non-empty parts, each with its sentences joined by a space.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError(f"num_parts must be a positive integer, got {num_parts}")

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text.strip())
    sentences_per_part = math.ceil(len(sentences) / num_parts)

    parts = []
    for i in range(num_parts):
        # Widen the window by one sentence on each side, clamped to the list.
        start_index = max(0, i * sentences_per_part - 1)
        end_index = min(len(sentences), (i + 1) * sentences_per_part + 1)

        part = " ".join(sentences[start_index:end_index]).strip()
        # Once the window runs past the last sentence the slice is empty;
        # skip it so callers never receive an empty excerpt.
        if part:
            parts.append(part)

    return parts

## DeepSeek generate

In [71]:
def deepseek_generate_answer(client: OpenAI, prompt: str, model: str):
    """Send one prompt to the chat-completions endpoint and return the reply.

    The request holds a fixed system message followed by ``prompt`` as the
    user message, and is made with ``stream=False``.

    Args:
        client: The OpenAI SDK client to send the request through.
        prompt: The user message content.
        model: The model identifier to request.

    Returns:
        The completion object returned by ``client.chat.completions.create``
        (the whole object, not just the message text).
    """
    conversation = [
        {"role": "system", "content": "You are a tests generator"},
        {"role": "user", "content": prompt},
    ]
    return client.chat.completions.create(
        model=model,
        messages=conversation,
        stream=False,
    )

# Generate answers

In [72]:
# Load the book and split it into at most 10 paragraph-based parts.
text = read_epub('../books/vovchok-marko-vedmid-sestrychka-melasia1976.epub')
parts = split_text_into_parts(text, 10)
# Running totals of the token usage reported by the API for this cell; they
# are printed by a later cell.
input_tokens = 0
output_tokens = 0
# One request per part, made sequentially.
for part in parts:
    prompt = get_gen_prompt(part)
    response = deepseek_generate_answer(client, prompt, model)
    input_tokens += response.usage.prompt_tokens
    output_tokens += response.usage.completion_tokens
    # Print the raw text of the first choice, then a separator between parts.
    print(response.choices[0].message.content)
    print("================================")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [25]:
print(input_tokens)
print(output_tokens)

10491
8213


In [26]:
# print(response)

In [27]:
# print(prompt)

In [28]:
# print(len(parts))

In [29]:
# Load the book and split it into at most 30 paragraph-based parts.
text = read_epub('../books/kobylianska-olha-iulianivna-Valse-melancolique-melankholiynyy-vals2457.epub')
parts = split_text_into_parts(text, 30)
# Reset the running token totals for this run; they are printed by the
# following cell.
input_tokens = 0
output_tokens = 0
# One request per part, made sequentially.
for part in parts:
    prompt = get_gen_prompt(part)
    response = deepseek_generate_answer(client, prompt, model)
    input_tokens += response.usage.prompt_tokens
    output_tokens += response.usage.completion_tokens
    # Print the raw text of the first choice, then a separator between parts.
    print(response.choices[0].message.content)
    print("================================")

<question_development>
1. **Key Quotes:**
   - "Не можу слухати меланхолійної музики. А вже найменше такої, що приваблює зразу душу ясними, до танцю визиваючими граціозними звуками, а відтак, зрікаючися їх незамітно, ллється лиш одною широкою струєю смутку!"
   - "Обнімала би тоді цілий світ, заявляючи далеко-широко, що музика грає!"
   - "Бути уже майже укінченою артисткою і працювала саме над одним образом, який хотіла продати і поїхати до Італії, щоб побачити тамошню штуку."

2. **Relevant Topics:**
   - 1. questions about the personalities of the characters
   - 3. what the characters did in certain situations
   - 4. what the characters did in certain situations, what they thought or felt
   - 5. the attitude of some characters to other characters

3. **Potential Questions:**

   a. **Question:** Як музика впливає на слухачку в тексті?
   
      - **Correct Answer:** Вона розпадається в чуття і не може опертися настроєві суму.
      - **Incorrect Answers:** 
        1. Вона стає д

In [30]:
print(input_tokens)
print(output_tokens)

42139
25196
