# Test an LLM with text-to-speech output

In [1]:
import torch
import re
import numpy as np
import sounddevice as sd


from datasets import load_dataset
from rank_bm25 import BM25Okapi
from langchain.text_splitter import CharacterTextSplitter
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from llama_index import SimpleDirectoryReader
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Text normalization applied to every passage before indexing/embedding
def clean_text(text):
    """Normalize a piece of text.

    The text is lower-cased, leading/trailing whitespace is trimmed, and
    every tab and newline character is deleted.

    Note: tabs and newlines are removed outright (not replaced by a space),
    so words separated only by a line break end up joined together.
    """
    # Mapping a code point to None makes str.translate() drop that character.
    drop_tabs_and_newlines = {ord('\t'): None, ord('\n'): None}
    return text.lower().strip().translate(drop_tabs_and_newlines)

In [16]:
# Plain-text loader for knowledge-base files
def read_txt_file(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [17]:
# Load the raw text of the "building 1" knowledge-base entry.
# The path is relative, so it resolves against the notebook's working directory.
building_data = read_txt_file('./knowledge base/Sanctuary of the Middle Plateau/building 1.txt')

In [18]:
# Splitter that cuts on "." and targets 300-character chunks (measured with
# len) with a 150-character overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=300,
    chunk_overlap=150,
    length_function=len,
)

In [19]:
# Chunk the raw text and normalize each chunk in a single pass, so that
# building_passages only ever holds cleaned strings.
building_passages = [
    clean_text(document.page_content)
    for document in text_splitter.create_documents([building_data])
]

Created a chunk of size 301, which is longer than the specified 300
Created a chunk of size 349, which is longer than the specified 300
Created a chunk of size 551, which is longer than the specified 300


In [20]:
# Inspect the cleaned passages (this displays the whole list).
building_passages

['the southernmost one (building 1) is a cult building in which, based on the sculptures and inscriptions that emerged from both the interior of the building and the adjacent reservoir, the worship of the god asklepios and the goddess aphrodite was housed',
 'it is possible that the building also housed the worship of the samothracian gods, based on an inscribed stele found embedded in a newer building',
 'it is an oblong building with dimensions of seventeen point fourty × eleven point fifty meters, made of slate, which consists of two independent rooms opening to the east on a doric portico, from which the building was accessed',
 'it is erected on a strong rise in the west, with a maximum surviving height of two point eighty meters. the walls of the building were plastered with red mortar which survives its original placement in some places, both internally and externally',
 'the walls of the building were plastered with red mortar which survives its original placement in some place

In [9]:
# Load the sentence-embedding model from the HuggingFace Hub.
# The checkpoint id lives in one constant so the tokenizer and the model
# cannot accidentally be loaded from different checkpoints.
SIMILARITY_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
similarity_tokenizer = AutoTokenizer.from_pretrained(SIMILARITY_MODEL_NAME)
similarity_model = AutoModel.from_pretrained(SIMILARITY_MODEL_NAME)

In [10]:
# Mean pooling - average token embeddings, counting only unmasked positions
def mean_pooling(model_output, attention_mask):
    """Collapse per-token embeddings into one vector per sequence.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings.
        attention_mask: Mask tensor; after a trailing axis is added it is
            expanded to the shape of the token embeddings.

    Returns:
        The mask-weighted sum of the token embeddings along dimension 1,
        divided by the mask sum along that dimension (clamped to at least
        1e-9 so a fully masked sequence does not divide by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding axis so it can weight each token.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [21]:
# Embed every passage: tokenize the whole batch, run the encoder without
# gradient tracking, then mean-pool the token embeddings per passage.
encoded_input = similarity_tokenizer(
    building_passages,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
with torch.no_grad():
    model_output = similarity_model(**encoded_input)

# Pool and convert to a NumPy array so it can be passed to cosine_similarity
sentence_embeddings = (
    mean_pooling(model_output, encoded_input['attention_mask'])
    .detach()
    .numpy()
)

In [22]:
# Load the SpeechT5 text-to-speech stack from the HuggingFace Hub:
# the input processor, the text-to-speech model, and the HiFi-GAN vocoder.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
speech_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [23]:
# load xvector containing speaker's voice characteristics from a dataset
# speaker_voice is the row index of the chosen entry in the validation split;
# change it to pick a different entry.
speaker_voice = 2933
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# unsqueeze(0) adds a leading axis to the selected x-vector
# (presumably the batch dimension generate_speech expects — confirm).
speaker_embeddings = torch.tensor(embeddings_dataset[speaker_voice]["xvector"]).unsqueeze(0)

Found cached dataset cmu-arctic-xvectors (C:/Users/User/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f)


In [24]:
# generate_text = pipeline(model="databricks/dolly-v2-3b", torch_dtype=torch.bfloat16, trust_remote_code=True)

In [25]:
# Build the BM25 index over whitespace-tokenized passages.
# split() with no argument collapses runs of whitespace, so consecutive
# spaces in a passage no longer produce empty-string tokens that would
# inflate document lengths and pollute the term statistics.
tokenized_sentences = [text.split() for text in building_passages]
bm25 = BM25Okapi(tokenized_sentences)

In [52]:
# Minimum cosine similarity for the dense-retrieval answer to be trusted;
# below this the fallback "I do not know" message is spoken instead.
MIN_SIMILARITY = 0.5

query = "Did aphrodite was worshiped there?"

# --- Dense retrieval: embed the query and compare against every passage ---
query_inputs = similarity_tokenizer(query, padding=True, truncation=True, return_tensors='pt')
# Inference only: skip gradient tracking, as done for the passage embeddings.
with torch.no_grad():
    embedded_query = similarity_model(**query_inputs)
question_embedding = mean_pooling(embedded_query, query_inputs['attention_mask'])
question_embedding = question_embedding.detach().numpy()
scores = cosine_similarity([question_embedding[0]], sentence_embeddings)[0]
# sentence_embeddings holds only passages (the query is not among them), so
# every index, including 0, is a valid candidate.
max_pos = np.argmax(scores)
max_score = scores[max_pos]
candidate_answer = building_passages[max_pos]

# --- Sparse retrieval: BM25 over the same passages ---
# The passages were normalized with clean_text (lower-cased), so the query
# gets the same normalization; otherwise capitalized words never match.
tokenized_query = clean_text(query).split(" ")
answer_scores = bm25.get_scores(tokenized_query)
max_score_bm25 = answer_scores.max()
top_answer = bm25.get_top_n(tokenized_query, building_passages, n=1)[0]

print(f"Query: {query}")
print('------------- Similarity NN -------------')
print(f"Similarity score: {max_score*100:.2f}%")
print(f"Top answer: {candidate_answer}")
print('------------- Similarity BM25 -------------')
print(f"Similarity score: {max_score_bm25}")
print(f"Top answer: {top_answer}")

# Replace a low-confidence match with a fallback answer (applied after the
# diagnostics above, so the printout always shows the raw best match).
if max_score < MIN_SIMILARITY:
    candidate_answer = 'I am sorry, I do not know the answer to that question.'

Query: Did aphrodite was worshiped there?
------------- Similarity NN -------------
Similarity score: 58.49%
Top answer: in addition to the finds from the interior of the cistern, findings such as a votive inscription to aphrodite syria, built into the intermediate wall of building one in second use, a marble statue of aphrodite and the lower part of a marble female of a statue that had been found a short distance lower than buildings one and two, and probably belonged according to panagiotis themelis to the colossal statue of aphrodite in the anadyomeni type, which was crafted by the messenian sculptor damophon i make clear the co-cult of asclepius and aphrodite
------------- Similarity BM25 -------------
Similarity score: 2.23576365326113
Top answer: in addition to the finds from the interior of the cistern, findings such as a votive inscription to aphrodite syria, built into the intermediate wall of building one in second use, a marble statue of aphrodite and the lower part of a mar

In [53]:
# Convert the selected answer text into PyTorch tensors for the TTS model.
inputs = processor(text=candidate_answer, return_tensors="pt")

In [54]:
# Synthesize speech for the answer using the chosen speaker embedding and the vocoder.
speech = speech_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

In [55]:
# Play the synthesized audio through the default sound device.
# NOTE(review): 16000 Hz is assumed to match the vocoder's output rate — confirm.
sd.play(speech.numpy(), samplerate=16000)