# Init

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel, SafetySetting
from pprint import pprint as pp
import json
import time
import re
import math
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import PyPDF2

In [56]:
vertexai.init(project='diplom-440610', location='us-central1')

In [None]:
gemini15pro = GenerativeModel("gemini-2.0-flash")

# Prompts

## Splitted text prompt

### Format

In [58]:
split_text_format = '''{
    question: str,
    textPart: str,
    answers: [
        {
            answer: str,
            isCorrect: boolean,
        }
    ]
}'''

### Prompt

In [59]:
split_text_prompt = '''Your task is to read a part of the book carefully and create only ONE question for the children who have read this part. 
Question should have 4 answer choices with 1 correct answer.

Here are some topics to create a question:
1. questions about the personalities of the characters
2. questions about the appearance of the characters
3. what the characters did in certain situations
4. what the characters did in certain situations, what they thought or felt
5. the attitude of some characters to other characters
6. names of cities or localitions where the events took place
7. what are the key characters

Try to make question and answer choices more similar to the text itself.
Avoid creating tricky questions.

The question must be in this json format:
```
{}
```

The textPart field of the question should contain a part of the text from which you took the question

Here is a part of a book:
{}

The question must be in the same language as the book
'''

# Utils

## Get formatted splitted text prompt

In [60]:
def get_split_text_prompt(text_part: str):
    return split_text_prompt.format(split_text_format, text_part)

## Remove new lines

In [61]:
def remove_extra_newlines(text):
    return re.sub(r'\n+', '\n', text)

## Read PDF

In [62]:
def read_pdf(file_path: str):
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text()
            return text
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

## Read EPUB

In [63]:
def read_epub(file_path):
    book = epub.read_epub(file_path)
    text_content = []

    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            soup = BeautifulSoup(item.get_body_content(), 'html.parser')
            text_content.append(soup.get_text())

    text = '\n'.join(text_content)
    return remove_extra_newlines(text)

## Split text

In [64]:
def split_text_into_parts(text: str, num_parts: int):
    paragraphs = re.split('\n', text.strip())
    
    paragraphs_per_part = math.ceil(len(paragraphs) / (num_parts))
    
    parts = []
    current_part = ""
    
    for i, paragraph in enumerate(paragraphs):
        current_part += paragraph + "\n"
        if (i + 1) % paragraphs_per_part == 0 or (i + 1) == len(paragraphs):
            parts.append(current_part.strip())
            current_part = ""
    
    return parts

## Split text with overlap

In [65]:
def split_text_into_parts_with_overlap(text: str, num_parts: int):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text.strip())
    
    sentences_per_part = math.ceil(len(sentences) / num_parts)
    
    parts = []
    for i in range(num_parts):
        start_index = max(0, i * sentences_per_part - 1)
        end_index = min(len(sentences), (i + 1) * sentences_per_part + 1)
        
        part = " ".join(sentences[start_index:end_index])
        parts.append(part.strip())
    
    return parts

## Gemini generate

In [66]:
generation_config = {
    "temperature": 0.0,
    "response_mime_type": "application/json"
}

safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
    ),
]


def gemini_generate_answer(model: GenerativeModel, prompt: str):
    response = model.generate_content(prompt,
                                      generation_config=generation_config,
                                      safety_settings=safety_settings)

    return response

# Generate answers

## Splitted text

In [67]:
text = read_epub('books/vovchok-marko-vedmid-sestrychka-melasia1976.epub')
parts = split_text_into_parts(text, 10)
for part in parts:
    prompt = get_split_text_prompt(part)
    response = gemini_generate_answer(gemini15pro, prompt)  
    print(response.text)
    print("================================")
    time.sleep(30)

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


{
"question": "Від кого прийшов лист Меласці?",
"textPart": "Від Падолиста:",
"answers": [
{
"answer": "Від матері",
"isCorrect": false
},
{
"answer": "Від Падолиста",
"isCorrect": true
},
{
"answer": "Від брата",
"isCorrect": false
},
{
"answer": "Від бабусі",
"isCorrect": false
}
]
}

{
    "question": "Як звали козака, якому належала пасіка?",
    "textPart": "Оце ж козак старий – звали його Загайний, – погородивши загородь, постановив вуллів чимало – було їх там наче снопків на добрій ниві; і вже не полічу, мабуть, скільки багато тепер у його їх придбано, – мабуть, багато та й пребагато, бо рої там роїлися виборно так, що тільки не подивуйте у лиху годину, виборно роїлися!",
    "answers": [
        {
            "answer": "Загайний",
            "isCorrect": true
        },
        {
            "answer": "Михайло",
            "isCorrect": false
        },
        {
            "answer": "Меласій",
            "isCorrect": false
        },
        {
            "answer": "Не згад

In [70]:
print(len(parts))

9


In [68]:
# text = read_epub('books/vovchok-marko-vedmid-sestrychka-melasia1976.epub')
# parts = split_text_into_parts(text, 10)
# for part in parts:
#     print(part)
#     print("================================")

In [69]:
# text = read_epub('books/vovchok-marko-vedmid-sestrychka-melasia1976.epub')
# parts = split_text_into_parts_with_overlap(text, 10)
# for part in parts:
#     print(part)
#     print("================================")