In [None]:
import os
import json
from dotenv import load_dotenv
from pprint import pprint as pp
# Load environment variables from a .env file, if present; OPENAI_API_KEY is
# read from the environment further down when the OpenAI client is created.
load_dotenv()

# Book export: a JSON document whose 'data' key holds the nested content tree.
# The encoding is given explicitly because the text is French (non-ASCII) and
# the platform default encoding is not guaranteed to be UTF-8.
with open('../data/extracted_books/1 - Le Dernier Voeu - Sapkowski, Andrzej.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


In [None]:
from pprint import pprint as pp
from langchain.text_splitter import RecursiveCharacterTextSplitter

def extract_content(data, result=None):
    """Collect every node's 'content' value in depth-first, pre-order.

    Args:
        data: Iterable of nodes. Each node must expose a 'content' entry and
            may expose a 'children' entry holding further nodes.
        result: Optional list to append to. A new list is created when None.

    Returns:
        The list the contents were appended to (``result`` itself when one
        was supplied).

    Raises:
        KeyError: If a node has no 'content' entry.
    """
    collected = [] if result is None else result

    # Explicit stack of iterators instead of recursion: the iterator on top
    # is the level currently being walked.
    levels = [iter(data)]
    while levels:
        for node in levels[-1]:
            collected.append(node['content'])
            if 'children' in node:
                # Descend now; the parent level resumes once this one is done.
                levels.append(iter(node['children']))
                break
        else:
            # Current level exhausted: go back to its parent.
            levels.pop()
    return collected

# Flatten the nested book tree into an ordered list of text fragments.
text_parts = extract_content(data['data'])

# Character-count based splitter (length_function=len): chunks of at most
# 5000 characters, no overlap, trying the coarsest separator first.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n\n", "\n\n", "\n", ".", ",", " ", ""],
    keep_separator=True,
    chunk_size=5000,
    chunk_overlap=0,
    length_function=len,
)

# Only fragments 7..64 are split; they are joined with the same triple
# newline that the splitter uses as its coarsest separator.
# NOTE(review): the 7:65 window presumably selects the body of the book;
# confirm against the extracted JSON.
split_sub_parts = text_splitter.split_text('\n\n\n'.join(text_parts[7:65]))

# Disabled debug dump of every chunk and its length, kept as a string literal.
"""
for i, sub_part in enumerate(split_sub_parts):
    print(sub_part)
    print(f'### {len(sub_part)} ###')
    print("\n##################################################\n##################################################\n##################################################\n")
"""


In [54]:
from openai import OpenAI
import json
import re

# OpenAI client; the key is read from the environment (None when unset).
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Accumulated across text chunks. Shape, as written by the loop below:
#   entity name -> {'alternative_names': [...],
#                   'extractions': [{'category': ..., 'fact': ...}, ...]}
knowledge_base = {}

# Prompt definitions. The encoding is explicit because the prompts are French
# text and the platform default encoding is not guaranteed to be UTF-8.
# NOTE(review): this rebinds the notebook-level name `data`, which earlier
# held the book JSON; re-running the splitting cell after this one will read
# the prompt file instead. The name is kept to avoid breaking later cells.
with open('../core/prompts/fr.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    entity_summarization_with_kb_prompt = data['entity_summarization_with_kb_prompt']
    entity_kb_template = data['entity_kb_template']

def parse_json_output(llm_response: str) -> list:
    """Extract and decode the JSON list embedded in an LLM response.

    The span from the first '[' to the last ']' of the response is decoded,
    so surrounding prose or Markdown code fences are ignored.

    Args:
        llm_response: Raw text returned by the model.

    Returns:
        The decoded JSON array as a Python list.

    Raises:
        ValueError: If the response contains no '[' ... ']' span.
        json.JSONDecodeError: If the span is not valid JSON (a subclass of
            ValueError).
    """
    # Greedy + DOTALL: match across newlines, from the first '[' to the last ']'.
    match = re.search(r'\[.*\]', llm_response, re.DOTALL)
    if match is None:
        raise ValueError("No JSON list object found in the LLM response")
    return json.loads(match.group(0))

def build_knowledge_base_for_text_part(text_part: str, knowledge_base: dict) -> str:
    """Render the knowledge-base entries whose entity name occurs in a text.

    Args:
        text_part: Text chunk about to be sent to the model.
        knowledge_base: Mapping of entity name to a dict with the keys
            'alternative_names' and 'extractions' (a list of dicts with
            'category' and 'fact').

    Returns:
        One rendered block per matching entity, separated by blank lines, or
        the literal string 'EMPTY' when no entity name occurs in the text.
        Rendering uses the module-level ``entity_kb_template['template']``.
    """
    # Plain substring test on the primary entity name only.
    relevant = {
        name: entry
        for name, entry in knowledge_base.items()
        if name in text_part
    }

    rendered_blocks = []
    for name, entry in relevant.items():
        aliases = entry['alternative_names']
        bullet_list = '\n'.join(
            f"- ({item['category']}) : {item['fact']}"
            for item in entry['extractions']
        )
        rendered_blocks.append(
            entity_kb_template['template'].format(
                entity_name=name,
                alternative_names=aliases,
                summary=bullet_list,
            )
        )

    return '\n\n'.join(rendered_blocks) if rendered_blocks else 'EMPTY'

# Run entity extraction over the first ten chunks only, feeding the facts
# gathered so far back into each subsequent prompt.
for sub_part in split_sub_parts[:10]:
    print('//////////////////// TEXT ////////////////////')
    print(sub_part)

    # Only the entries whose entity name occurs in this chunk are sent along.
    filtered_kb_str = build_knowledge_base_for_text_part(sub_part, knowledge_base)
    computed_prompt = entity_summarization_with_kb_prompt["prompt"].format(
        text_part=sub_part,
        knowledge_base=filtered_kb_str,
    )

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": computed_prompt}
        ]
    )

    json_output = parse_json_output(completion.choices[0].message.content)

    print('//////////////////// OUTPUT ////////////////////')
    pp(json_output)

    # Merge the extracted facts into the running knowledge base.
    for entity_fact in json_output:
        entity_name = entity_fact['entity']
        alternative_names = entity_fact['alternative_names']
        category = entity_fact['category']
        fact = entity_fact['summary']

        if entity_name not in knowledge_base:
            knowledge_base[entity_name] = {
                'alternative_names': alternative_names,
                'extractions': [{'category': category, 'fact': fact}]
            }
        else:
            entry = knowledge_base[entity_name]
            # De-duplicate while keeping first-seen order. A set would give a
            # hash-dependent order, making the knowledge-base text injected
            # into later prompts differ from run to run.
            entry['alternative_names'] = list(
                dict.fromkeys(entry['alternative_names'] + alternative_names)
            )
            entry['extractions'].append({'category': category, 'fact': fact})

    print('//////////////////// KB ////////////////////')
    pp(knowledge_base)
    print('\n\n\n')

//////////////////// TEXT ////////////////////
I
//////////////////// OUTPUT ////////////////////
[]
//////////////////// KB ////////////////////
{}




//////////////////// TEXT ////////////////////
Le sorceleur avait la lame sur la gorge.

Il trempait dans l'eau savonneuse, la tête renversée en arrière, appuyée sur le bord glissant de la cuve en bois. Il sentait dans sa bouche le goût amer du savon. La lame, émoussée à faire pitié, lui grattait douloureusement la pomme d'Adam, remontait sur son menton avec un léger bruit.

Le barbier, qui avait l'air d'un artiste convaincu de produire un chef-d'œuvre, repassa la lame une dernière fois, pour la touche finale, puis lui rafraîchit le visage avec un morceau de toile de lin imbibé d'une lotion qui devait être à base d'Angelica archangelica.

Geralt se mit debout, laissa un valet lui verser dessus un petit baquet d'eau pour le rincer, s'ébroua et sortit de la cuve en imprimant la trace de ses pieds mouillés sur le sol de brique.

— Voici u