# Split data into csv rows

In [1]:
import pandas as pd
import os
import torch
import numpy as np
from langchain.text_splitter import CharacterTextSplitter
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# remove with regex the brackets and its content from a phrase
def clean_text(text):
    # strip sentenece
    text = text.lower().strip()
    # remove tabs
    text = text.replace('\t', '')
    # remove new lines
    text = text.replace('\n', '')
    return text

In [3]:
# read txt file
def read_txt_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

In [4]:
text_splitter = CharacterTextSplitter(    
    separator = ".",
    chunk_size = 150,
    chunk_overlap  = 50,
    length_function = len,
)

In [5]:
# Os walk through the knowledge base folder
for root, dirs, files in os.walk("./KnowledgeBase/"):
    for file in files:
        # if it is txt
        if file.endswith(".txt"):
            filename = file.split('.txt')[0]
            item_data = read_txt_file(os.path.join(root, file))
            passages = text_splitter.create_documents([item_data]);
            passages = [clean_text(sentence.page_content) for sentence in passages]
            # create dataframe
            df = pd.DataFrame({'passages': passages})
            df.to_csv(f'../data/{filename}_passages.csv', index=False)

Created a chunk of size 166, which is longer than the specified 150
Created a chunk of size 160, which is longer than the specified 150
Created a chunk of size 207, which is longer than the specified 150
Created a chunk of size 197, which is longer than the specified 150
Created a chunk of size 223, which is longer than the specified 150
Created a chunk of size 284, which is longer than the specified 150
Created a chunk of size 219, which is longer than the specified 150
Created a chunk of size 204, which is longer than the specified 150
Created a chunk of size 255, which is longer than the specified 150
Created a chunk of size 228, which is longer than the specified 150
Created a chunk of size 227, which is longer than the specified 150
Created a chunk of size 301, which is longer than the specified 150
Created a chunk of size 211, which is longer than the specified 150
Created a chunk of size 238, which is longer than the specified 150
Created a chunk of size 349, which is longer tha

In [18]:
item_data = read_txt_file('./KnowledgeBase/South_Structures/necropolis.txt')

In [20]:
passages = text_splitter.create_documents([item_data]);
passages = [clean_text(sentence.page_content) for sentence in passages]

Created a chunk of size 168, which is longer than the specified 150
Created a chunk of size 166, which is longer than the specified 150
Created a chunk of size 208, which is longer than the specified 150
Created a chunk of size 179, which is longer than the specified 150
Created a chunk of size 189, which is longer than the specified 150


In [21]:
passages

['the ancient city had two necropolises outside the walls',
 'the main cemetery of the ancient city occupies a large area outside the southern part of the wall, while groups of tombs can also be seen outside the northwestern gate',
 'from the excavations of the years from one thousand ninty one to ninty six, to an area of approximately four hundred square meter sixty eight (68) burials were found',
 'out of the 68 burials that came to light, 62 were found arrested and only 6 unclaimed',
 "out of the total number of graves, 41 definitely belong to adults, 17 graves that were found very damaged seem to have belonged to adults as well, while only 8 of the total were children's and two of infants",
 'few skeletons are preserved in good condition, so it is difficult to identify the sex and age of the individuals',
 "of the total number of graves, 66 are box-shaped carved into the natural rock and there is only one inhumation in a sharp-bottomed amphora, although a child's burial in a beehiv

In [22]:
# create dataframe
df = pd.DataFrame({'passages': passages})

In [23]:
df.to_csv('./KnowledgeBase/South_Structures/necropolis_passages.csv', index=False)