# Tags analysis

In [13]:
import json

In [14]:
# Load the tag data for the book. The encoding is passed explicitly so that any
# non-ASCII text is decoded the same way regardless of the platform's locale.
with open("../data/tags/Sorceleur - L'Integrale - Andrzej Sapkowski.json", encoding="utf-8") as f:
    data = json.load(f)

To validate an entity group: if the best-represented class accounts for only a small percentage of an entity's tags, double-check the classification with an LLM.

In [15]:
# Entity groups (classes) found in the tag data, in alphabetical order.
classes = sorted(data['tags'])
print(classes)

# Invert the mapping {entity_group: {entity_name: stats}} into
# {entity_name: {entity_group: stats}}.
entities = {}
for group_name, group_tags in data['tags'].items():
    for name, stats in group_tags.items():
        entities.setdefault(name, {})[group_name] = {
            'count': stats['count'],
            'median': stats['median'],
        }

# For every entity name, build (name, total_count, [(group, count, share), ...])
# with the groups ordered by decreasing share of the entity's tags.
entity_name_entity_groups_association = []
for name, per_group in entities.items():
    total_tags = sum(stats['count'] for stats in per_group.values())
    shares = [
        (group_name, stats['count'], stats['count'] / total_tags)
        for group_name, stats in per_group.items()
    ]
    shares.sort(key=lambda share: share[2], reverse=True)
    entity_name_entity_groups_association.append((name, total_tags, shares))

# Most frequently tagged entities first.
entity_name_entity_groups_association.sort(key=lambda row: row[1], reverse=True)

for name, _, shares in entity_name_entity_groups_association:
    print(f'{name} : {" ".join(str(share) for share in shares)}')


['CONSUMER_GOOD', 'EVENT', 'LOCATION', 'ORGANIZATION', 'OTHER', 'PERSON', 'WORK_OF_ART']
Geralt : ('PERSON', 3652, 0.9991792065663475) ('WORK_OF_ART', 2, 0.0005471956224350205) ('LOCATION', 1, 0.00027359781121751026)
Ciri : ('PERSON', 2193, 0.9986338797814208) ('LOCATION', 1, 0.0004553734061930783) ('OTHER', 1, 0.0004553734061930783) ('WORK_OF_ART', 1, 0.0004553734061930783)
Jaskier : ('PERSON', 1465, 0.9938941655359566) ('CONSUMER_GOOD', 6, 0.004070556309362279) ('OTHER', 3, 0.0020352781546811396)
Yennefer : ('PERSON', 1139, 0.993025283347864) ('LOCATION', 7, 0.006102877070619006) ('OTHER', 1, 0.0008718395815170009)
Milva : ('PERSON', 477, 0.9173076923076923) ('LOCATION', 34, 0.06538461538461539) ('OTHER', 5, 0.009615384615384616) ('CONSUMER_GOOD', 2, 0.0038461538461538464) ('ORGANIZATION', 2, 0.0038461538461538464)
Nilfgaard : ('PERSON', 240, 0.6233766233766234) ('LOCATION', 134, 0.34805194805194806) ('ORGANIZATION', 11, 0.02857142857142857)
Triss : ('PERSON', 368, 0.997289972899729)

# Weaviate configuration

In [16]:
import weaviate
# Keyword arguments for weaviate.connect_to_custom(); the HTTP and gRPC
# endpoints use the same host, and both "secure" flags are turned off.
_weaviate_host = "192.168.1.103"
weaviate_config = dict(
    http_host=_weaviate_host,
    http_port=8080,
    http_secure=False,
    grpc_host=_weaviate_host,
    grpc_port=50051,
    grpc_secure=False,
)

In [76]:
# Connect to the configured instance and print the metadata it reports.
with weaviate.connect_to_custom(**weaviate_config) as client:
    server_meta = client.get_meta()
    print(server_meta)

{'hostname': 'http://[::]:8080', 'modules': {}, 'version': '1.24.0-rc.0'}


In [77]:
# Connect to the configured instance and print the collections it knows about.
with weaviate.connect_to_custom(**weaviate_config) as client:
    all_collections = client.collections.list_all()
    print(all_collections)

{'Book_parts': _CollectionConfigSimple(name='Book_parts', description=None, generative_config=None, properties=[_Property(name='book_id', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none'), _Property(name='parent_id', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none'), _Property(name='identifiers', description=None, data_type=<DataType.TEXT_ARRAY: 'text[]'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none'), _Property(name='play_orders', description=None, data_type=<DataType.NUMBER_ARRAY: 'number[]'>, index_filterable=True, index_searchable=False, nested_properties=None, tok

In [78]:
import weaviate.classes.config as wc

# Create the collections used by this notebook if they do not exist yet.
with weaviate.connect_to_custom(**weaviate_config) as client:
    # Properties of each collection, keyed by collection name.
    collection_properties = {
        'Book_metadata': [
            wc.Property(name='identifier', data_type=wc.DataType.TEXT),
            wc.Property(name='title', data_type=wc.DataType.TEXT),
            wc.Property(name='language', data_type=wc.DataType.TEXT),
            wc.Property(name='creator', data_type=wc.DataType.TEXT),
        ],
        'Book_parts': [
            wc.Property(name='book_id', data_type=wc.DataType.TEXT),
            wc.Property(name='parent_id', data_type=wc.DataType.TEXT),
            wc.Property(name='identifiers', data_type=wc.DataType.TEXT_ARRAY),
            wc.Property(name='play_orders', data_type=wc.DataType.NUMBER_ARRAY),
            wc.Property(name='labels', data_type=wc.DataType.TEXT_ARRAY),
            wc.Property(name='content_path', data_type=wc.DataType.TEXT),
            wc.Property(name='content_ids', data_type=wc.DataType.TEXT_ARRAY),
        ],
        'Chunks_250_50': [
            wc.Property(name='parent_id', data_type=wc.DataType.TEXT),
            wc.Property(name='chunk_number', data_type=wc.DataType.NUMBER),
        ],
    }

    # Ask the server for the existing collections once instead of once per
    # collection; the names checked below are distinct, so creating one
    # collection cannot change the answer for the others.
    existing_collections = client.collections.list_all()

    for collection_name, properties in collection_properties.items():
        if collection_name not in existing_collections:
            client.collections.create(
                name=collection_name,
                properties=properties,
            )

# Embedding

In [79]:
from dotenv import load_dotenv
from pprint import pp
from openai import OpenAI
import os
load_dotenv()

def embedd_text_chunks(text_chunks):
    """Embed text chunks with OpenAI's ``text-embedding-3-small`` model.

    Args:
        text_chunks: The text(s) to embed; passed unchanged as ``input`` to
            the embeddings endpoint.

    Returns:
        A list containing one embedding (as a plain ``list``) per entry of the
        response data, requested with 1536 dimensions. An empty list is
        returned when ``text_chunks`` is empty, without calling the API.

    Raises:
        KeyError: If a request is made and the ``OPENAI_API_KEY`` environment
            variable is not set.
    """
    # Nothing to embed: skip the network round trip rather than sending an
    # empty input to the API.
    if not text_chunks:
        return []

    client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
    response = client.embeddings.create(
        input=text_chunks,
        model="text-embedding-3-small",
        dimensions=1536,
    )

    return [list(elem.embedding) for elem in response.data]
    


# Chunking

In [80]:
import re

# Load the extracted book. The encoding is passed explicitly so that any
# non-ASCII text is decoded the same way regardless of the platform's locale.
with open('../data/extracted_books/1 - Le Dernier Voeu - Sapkowski, Andrzej.json', encoding='utf-8') as f:
    data = json.load(f)

# Collapse every run of three or more newlines in the first entry's content
# down to exactly three.
first_chapter = re.sub(r'(\n{3,})', r'\n\n\n', data['data'][0]['content'])


In [81]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters; lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 250
CHUNK_OVERLAP = 50
# Separators handed to the splitter, from the coarsest to the finest.
CHUNK_SEPARATORS = ["\n\n\n", "\n\n", "\n", ".", ",", " ", ""]

splitter = RecursiveCharacterTextSplitter(
    separators=CHUNK_SEPARATORS,
    keep_separator=False,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

chunks = splitter.split_text(first_chapter)

# Weaviate upload

In [83]:
from weaviate.util import generate_uuid5

# Split the loaded book into its metadata and its content entries.
metadata = data['metadata']
content = data['data']

# UUID derived from the book identifier; it is used both as the object's uuid
# and as its "identifier" property.
book_uuid = generate_uuid5(metadata['identifier'])

with weaviate.connect_to_custom(**weaviate_config) as client:
    book_metadata_collection = client.collections.get('Book_metadata')
    book_parts_collection = client.collections.get('Book_parts')
    # The name must match the one the collection is created under
    # ('Chunks_250_50'); 'Chunks_250_50_collection' is never created.
    chunks_250_50_collection = client.collections.get('Chunks_250_50')

    with book_metadata_collection.batch.dynamic() as batch:
        book_metadata_obj = {
            "identifier": book_uuid,
            "title": metadata['title'],
            "language": metadata['language'],
            "creator": metadata['creator']
        }
        batch.add_object(
            properties=book_metadata_obj,
            uuid=book_uuid,
        )

    # Report objects the batch could not import instead of failing silently.
    failed_objects = book_metadata_collection.batch.failed_objects
    if failed_objects:
        print(f"Failed to import {len(failed_objects)} objects")