In [5]:
import torch
import re
import pandas as pd
import numpy as np

from sys import getsizeof
from transformers import AutoTokenizer, AutoModel, pipeline
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from rank_bm25 import BM25Okapi
from langchain.text_splitter import CharacterTextSplitter
from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoder, DPRContextEncoderTokenizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# # Load similarity model
# similarity_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# similarity_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# print("Loaded similarity model!")

# # Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     # First element of model_output contains all token embeddings
#     token_embeddings = model_output[0]
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# # Read the extracted passages csv
# df = pd.read_csv('../actions/new_passages.csv')
# passages = df.passages.to_list()
# encoded_input = similarity_tokenizer(passages, padding=True, truncation=True, return_tensors='pt')
# with torch.no_grad():
#     model_output = similarity_model(**encoded_input)
# # Perform pooling
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
# sentence_embeddings = sentence_embeddings.detach().numpy()
# print("Loaded knowledge base!")


# # Load similarity model
# tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
# print("Loaded tokenizer!")
# model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
# # model.to("cuda")
# print("Loaded similarity model!")

# # Read the extracted passages csv
# df = pd.read_csv('../actions/new_passages.csv')
# passages = df.passages.to_list()
# input_ids = tokenizer(passages, return_tensors="pt")["input_ids"]
# input_ids = input_ids.to("cuda")
# sentence_embeddings = model(input_ids).pooler_output
# print("Loaded knowledge base!")

In [2]:
def remove_references(text):
    """Normalise a passage and strip citation and figure markers from it.

    The text is lower-cased and stripped of surrounding whitespace, then
    soft hyphens, "=-" sequences, dashes and newlines are deleted. Finally
    parenthesised numbers such as "(12)", figure mentions such as "(fig 3)"
    and bracketed reference lists such as "[1, 2-4]" are removed.

    Args:
        text: Raw passage string.

    Returns:
        The cleaned, lower-cased string.
    """
    # lower-case and strip the sentence
    text = text.lower().strip()
    # remove soft hyphens left over from the source documents
    text = text.replace('\xad', '')
    # remove "=-" artefacts; this has to happen BEFORE dashes are removed,
    # otherwise the pattern can never match and a stray "=" is left behind
    text = text.replace('=-', '')
    # remove dashes
    text = text.replace('-', '')
    # remove new lines
    text = text.replace('\n', '')
    # remove parenthesised numbers, e.g. "(12)"
    text = re.sub(r'\(\d+\)', '', text)
    # remove figure mentions, e.g. "(fig 3)"
    text = re.sub(r'\(\w+ \d+\)', '', text)
    # remove bracketed references, e.g. "[1, 2, 5]"
    return re.sub(r'\[[\d\- ,]+\]', '', text)

In [3]:
# Read every file found under the knowledge-base folder.
kb_reader = SimpleDirectoryReader('./Knowledge Base/')
documents = kb_reader.load_data()

In [4]:
# Keep only the raw text of each loaded document.
corpus = [doc.text for doc in documents]

In [5]:
# Split on full stops into chunks targeting 300 characters with a
# 150-character overlap, measured with len(). The recorded run below logged
# several chunks longer than the 300-character target.
text_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=300,
    chunk_overlap=150,
    length_function=len,
)

In [6]:
# Chunk the corpus and clean each chunk's text in a single pass, so `texts`
# only ever holds the cleaned passage strings.
texts = [
    remove_references(chunk.page_content)
    for chunk in text_splitter.create_documents(corpus)
]

Created a chunk of size 312, which is longer than the specified 300
Created a chunk of size 435, which is longer than the specified 300
Created a chunk of size 353, which is longer than the specified 300
Created a chunk of size 385, which is longer than the specified 300
Created a chunk of size 316, which is longer than the specified 300
Created a chunk of size 330, which is longer than the specified 300
Created a chunk of size 314, which is longer than the specified 300
Created a chunk of size 347, which is longer than the specified 300
Created a chunk of size 355, which is longer than the specified 300
Created a chunk of size 329, which is longer than the specified 300
Created a chunk of size 320, which is longer than the specified 300
Created a chunk of size 302, which is longer than the specified 300
Created a chunk of size 378, which is longer than the specified 300
Created a chunk of size 340, which is longer than the specified 300
Created a chunk of size 301, which is longer tha

In [7]:
# Persist the cleaned passages as a one-column CSV (no index column).
df = pd.DataFrame(data={'passages': texts})
df.to_csv(path_or_buf='new_passages.csv', index=False)

In [8]:
# Reload the passages that were just written to disk.
df = pd.read_csv(filepath_or_buffer='new_passages.csv')

In [9]:
# Load model from HuggingFace Hub
# Tokenizer and encoder for the all-MiniLM-L6-v2 checkpoint, fetched from the
# HuggingFace Hub.
similarity_tokenizer = AutoTokenizer.from_pretrained(
    'sentence-transformers/all-MiniLM-L6-v2'
)
similarity_model = AutoModel.from_pretrained(
    'sentence-transformers/all-MiniLM-L6-v2'
)

In [10]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, weighting each token by the attention mask.

    Args:
        model_output: Indexable model result whose element 0 holds the
            token embeddings.
        attention_mask: Mask tensor; it is given a trailing axis, expanded
            to the size of the token embeddings and cast to float.

    Returns:
        Tensor of masked sums over dimension 1 divided by the mask sums
        over dimension 1 (clamped to at least 1e-9 to avoid dividing by 0).
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [11]:
# Tokenize all passages as one padded, truncated batch of PyTorch tensors.
encoded_input = similarity_tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Run the encoder without tracking gradients.
with torch.no_grad():
    model_output = similarity_model(**encoded_input)

# Pool token embeddings into one vector per passage and convert to NumPy.
sentence_embeddings = (
    mean_pooling(model_output, encoded_input['attention_mask'])
    .detach()
    .numpy()
)

In [12]:
# Whitespace-tokenize each cleaned passage and build the BM25 index over them.
tokenized_sentences = [passage.split(" ") for passage in texts]
bm25 = BM25Okapi(tokenized_sentences)

In [13]:
# Text-generation pipeline around Dolly v2 (3B), loaded in bfloat16 with
# automatic device placement. The full text (prompt + completion) is returned.
LLM = pipeline(
    task="text-generation",
    model="databricks/dolly-v2-3b",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    return_full_text=True,
    temperature=0.9,
)

In [14]:
# Prompt for a bare instruction (no supporting input).
prompt = PromptTemplate(
    template="{instruction}",
    input_variables=["instruction"],
)

# Prompt for an instruction accompanied by a context passage.
prompt_with_context = PromptTemplate(
    template="{instruction}\n\nInput:\n{context}",
    input_variables=["instruction", "context"],
)

# Wrap the HuggingFace pipeline so LangChain can drive it, then build one
# chain per prompt.
hf_pipeline = HuggingFacePipeline(pipeline=LLM)

llm_chain = LLMChain(prompt=prompt, llm=hf_pipeline)
llm_context_chain = LLMChain(prompt=prompt_with_context, llm=hf_pipeline)

In [15]:
# Compare dense (MiniLM cosine similarity) and sparse (BM25) retrieval for a
# single query against the passages in `texts`.
query = "Does rain affect the integrity of stone?"

# --- Dense retrieval -------------------------------------------------------
encoded_query = similarity_tokenizer(query, padding=True, truncation=True, return_tensors='pt')
# Inference only: no gradients are needed for the query embedding.
with torch.no_grad():
    embedded_query = similarity_model(**encoded_query)
question_embeddings = mean_pooling(embedded_query, encoded_query['attention_mask'])
question_embeddings = question_embeddings.detach().numpy()
scores = cosine_similarity([question_embeddings[0]], sentence_embeddings)[0]
# The query is embedded separately from the passages, so every passage
# (including index 0) is a valid candidate; do not skip the first score.
max_pos = int(np.argmax(scores))
max_score = scores[max_pos]
similar_answer = texts[max_pos]

# --- BM25 retrieval --------------------------------------------------------
# The indexed passages were lower-cased by remove_references, so the query
# must be lower-cased as well or capitalised words can never match.
tokenized_query = query.lower().split(" ")
answer_scores = bm25.get_scores(tokenized_query)
max_score_bm25 = answer_scores.max()
top_answer = bm25.get_top_n(tokenized_query, texts, n=1)[0]

print(f"Query: {query}")
print('------------- Similarity NN -------------')
print(f"Similarity score: {max_score*100:.2f}%")
print(f"Context: {similar_answer}")
print('------------- Similarity BM25 -------------')
print(f"Similarity score: {max_score_bm25}")
print(f"Top answer: {top_answer}")

Query: Does rain affect the integrity of stone?
------------- Similarity NN -------------
Similarity score: 71.79%
Context: in areas where the rainwater is relatively free from pollutants, the dissolution of most common building stones is usually not a serious problem .frost damage certain stones which are exposed to freezing temperatures and wet conditions may undergo frost damage
------------- Similarity BM25 -------------
Similarity score: 10.866191278608332
Top answer: the processes involved in glass disease can reduce the transparency of the glass or even threaten the integrity of the structure


In [32]:
# Retrieve the best-matching passage for the question and pick an answering
# strategy from the cosine-similarity score:
#   score <= 0.4          -> no usable context, ask the LLM without context
#   0.4  < score <= 0.65  -> answer from the context, flagged as low confidence
#   0.65 < score <= 0.9   -> answer from the context
#   score > 0.9           -> return the retrieved passage itself
query = "Is kythnos a good island?"
tokenized_query = similarity_tokenizer(query, padding=True, truncation=True, return_tensors='pt')
# Inference only: no gradients are needed for the query embedding.
with torch.no_grad():
    embedded_query = similarity_model(**tokenized_query)
question_embeddings = mean_pooling(embedded_query, tokenized_query['attention_mask'])
question_embeddings = question_embeddings.detach().numpy()
scores = cosine_similarity([question_embeddings[0]], sentence_embeddings)[0]
# The query is embedded separately from the passages, so passage 0 is a valid
# candidate and must not be skipped.
max_pos = int(np.argmax(scores))
max_score = scores[max_pos]
context = texts[max_pos]

print(f'Question: {query}')
print(f'Score: {max_score*100:.2f}')
print(f'Context: {context} \n')

# Keep the raw question in `query`; build the LLM instruction separately and
# put a space between the instruction text and the question.
instruction = (
    'Answer the following question only with the provided input. '
    'If no answer is found tell that you cannot answer based on this context. '
    + query
)

if max_score <= 0.4:
    print("We do not have such context in our knowledge base. Answering with AI without providing it with context, make sure to search the correct answer with critical thinking and research.")
    print(llm_chain.predict(instruction=instruction).lstrip())
elif max_score <= 0.65:
    # Previously guarded by `max_score > 4`, which made this branch unreachable.
    print("Sorry, i am not exactly sure based on my knowledge base, answering with very low confidence...")
    # Pass the retrieved passage, not the literal string 'context'.
    print(llm_context_chain.predict(instruction=instruction, context=context).lstrip())
elif max_score <= 0.9:
    print("Based on the knowledge from database, generating answer...")
    print(llm_context_chain.predict(instruction=instruction, context=context).lstrip())
else:
    print("Similar question was found with high confidence")
    print(f"Answer: {context}")

Question: Is kythnos a good island?
Score: 22.48
Context: one example is the oseberg viking ship, which was found embedded in waterlogged clay at a land site excavation in norway year 1902. another is the warship vasa, which was raised in 1961 after 333 years in the brackish and cold waters of the baltic sea 

We do not have such context in our knowledge base. Answering with AI without providing it with context, make sure to search the correct answer with critical thinking and research.




The answer is: Yes, kythnos is a good island. In fact, kythnos is considered the most beautiful island in greece. It has a lot of beaches, amazing sceneries, and it is very popular for tourism.
