# Split data into csv rows

In [57]:
import pandas as pd
import torch
import numpy as np
from langchain.text_splitter import CharacterTextSplitter
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Normalise a passage: lower-case it and tidy up its whitespace
def clean_text(text):
    """Return ``text`` lower-cased with all whitespace normalised.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, and leading/trailing whitespace is dropped. Tabs and
    newlines are turned into a space rather than deleted so that words
    sitting on either side of a line break are not glued together.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an empty or whitespace-only input yields ``''``.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, so joining with ' ' both strips and collapses in one step.
    return ' '.join(text.lower().split())

In [29]:
# Load a text file into memory
def read_txt_file(file_path):
    """Return the full contents of the file at ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [42]:
# Read the whole of building5.txt into a single string; this is the text that gets split below.
item_data = read_txt_file('./KnowledgeBase/Sanctuary_of_the_Middle_Plateau/building5.txt')

In [43]:
# Splitter that cuts on '.' and aims for 150-character chunks (measured with len)
# with a 50-character overlap. The recorded run shows chunks longer than 150 still
# being emitted, with a warning, so the size is a target rather than a hard cap here.
text_splitter = CharacterTextSplitter(separator=".", chunk_size=150, chunk_overlap=50, length_function=len)

In [44]:
# Chunk the raw text and run each chunk's text through clean_text
passages = [
    clean_text(document.page_content)
    for document in text_splitter.create_documents([item_data])
]

Created a chunk of size 361, which is longer than the specified 150
Created a chunk of size 453, which is longer than the specified 150
Created a chunk of size 166, which is longer than the specified 150
Created a chunk of size 274, which is longer than the specified 150
Created a chunk of size 181, which is longer than the specified 150
Created a chunk of size 257, which is longer than the specified 150
Created a chunk of size 340, which is longer than the specified 150


In [45]:
# Eyeball the passages after they have been through clean_text.
passages

['in addition to the buildings connected with the religious life of the city, an important public building that came to light is the building complex that was revealed on the northwest side of the middle plateau, on the plateau that develops lower than the archaic temple (building three) to the west, where it was probably located the market of hellenistic times',
 'it is a two-story building of the late classical period, measuring seventeen by ten meters. it presents at least two main architectural phases',
 'the oldest dates back to around the middle of the fourth century',
 'the building seems to have suffered some destruction and significant repair during the hellenistic period, probably in the second century',
 'a monumental staircase in the middle of the building led to the first floor, where the apparently official apartments were located: room a with the liturgical hearth, the b-c with paved floor (and partly shaped natural rock) probably used for meals, north west room g whose 

In [46]:
# One-column frame holding a passage per row, ready to be written out as CSV
df = pd.DataFrame(data={'passages': passages})

In [47]:
# Write the passages to CSV, one per row, without the index column.
# NOTE(review): this file is written under ./KnowledgeBase/..., but the embedding cell
# further down reads building*_passages.csv from ../data/ — confirm how the file gets there.
df.to_csv('./KnowledgeBase/Sanctuary_of_the_Middle_Plateau/building5_passages.csv', index=False)

In [49]:
# Fetch the tokenizer and encoder for the similarity model from the HuggingFace Hub;
# both must come from the same checkpoint, so its name is spelled out only once.
SIMILARITY_CHECKPOINT = 'sentence-transformers/all-MiniLM-L6-v2'
similarity_tokenizer = AutoTokenizer.from_pretrained(SIMILARITY_CHECKPOINT)
similarity_model = AutoModel.from_pretrained(SIMILARITY_CHECKPOINT)
print('INFO:     Loaded Similarity Model')

INFO:     Loaded Similarity Model


In [52]:
# Mean pooling: average token embeddings, counting only positions the attention mask keeps
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each input, weighted by the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (assumed shape ``(batch, seq_len, hidden)`` — TODO confirm).
        attention_mask: Mask that is given a trailing axis and expanded to the
            token-embedding shape; positions with 0 contribute nothing to the average.

    Returns:
        Tensor of masked sums over axis 1 divided by the per-input mask totals.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask over the embedding axis so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total

In [54]:
# Load the previously saved passage lists, one CSV per building.
# NOTE(review): these are read from ../data/, whereas the CSV written earlier in this
# notebook goes under ./KnowledgeBase/... — confirm the files are in sync.
building1_passages = pd.read_csv('../data/building1_passages.csv')['passages'].to_list()
building2_passages = pd.read_csv('../data/building2_passages.csv')['passages'].to_list()
building3_passages = pd.read_csv('../data/building3_passages.csv')['passages'].to_list()
building5_passages = pd.read_csv('../data/building5_passages.csv')['passages'].to_list()

# Position in this list is the index used for building_embeddings as well
# (0 -> building 1, 1 -> building 2, 2 -> building 3, 3 -> building 5).
building_passages = [building1_passages, building2_passages, building3_passages, building5_passages]

# Embed every building's passages; building_embeddings[k] lines up with building_passages[k].
building_embeddings = []
# The loop variable is deliberately NOT called `passages`: that name holds the freshly
# split passages from earlier in the notebook and must not be overwritten here.
for building_texts in building_passages:
    encoded_input = similarity_tokenizer(building_texts, padding=True, truncation=True, return_tensors='pt')
    # Inference only, so skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        model_output = similarity_model(**encoded_input)
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    building_embeddings.append(embeddings.detach().numpy())

In [55]:
# Inspect the embeddings for index 1, i.e. the second entry of building_passages
# (building2_passages); presumably one row per passage.
building_embeddings[1]

array([[ 0.11622969,  0.03409467,  0.05019226, ...,  0.05629298,
         0.11818867,  0.02524624],
       [-0.11978338,  0.40054923, -0.1928807 , ...,  0.07970957,
         0.32337907,  0.02805086],
       [ 0.38470966,  0.06254954,  0.06127506, ...,  0.5766263 ,
        -0.35689756, -0.12071333],
       ...,
       [-0.021469  ,  0.2774418 ,  0.11278592, ..., -0.01957736,
        -0.28363362,  0.17731604],
       [ 0.27422208,  0.22551264,  0.19516474, ...,  0.02150778,
        -0.09390647, -0.00687842],
       [-0.23764972,  0.19658478, -0.49915496, ...,  0.04694569,
         0.15331982, -0.00156602]], dtype=float32)

In [66]:
question = "To when it is dated?"
# Which entry of building_passages / building_embeddings to search. Kept in one place
# so the embeddings and the passage lookup cannot drift apart (1 is the second entry).
building_index = 1

# Embed the question the same way the passages were embedded.
tokenized_query = similarity_tokenizer(question, padding=True, truncation=True, return_tensors='pt')
# Inference only: run the forward pass without autograd, as the passage embedding does.
with torch.no_grad():
    embedded_query = similarity_model(**tokenized_query)
question_embedding = mean_pooling(embedded_query, tokenized_query['attention_mask'])
question_embedding = question_embedding.detach().numpy()

# similarities has one row (the single question) and one column per passage, so the
# flat argmax is also the index of the best-matching passage.
similarities = cosine_similarity(question_embedding, building_embeddings[building_index])
most_similar_passage_index = np.argmax(similarities)
building_passages[building_index][most_similar_passage_index]

'buildings one and two seem to be dated to the late classical to early hellenistic times, a period where there seems to be a plan to commemorate the upper city, with the construction of monumental buildings'

In [69]:
# Cosine score of the best-matching passage.
# NOTE(review): the recorded value (~0.15) is low for a cosine similarity, so the
# top passage may be only a weak match for the question — worth sanity-checking.
similarities.max()

0.14941174