In [1]:
import torch
import re
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModel, pipeline
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from rank_bm25 import BM25Okapi
from langchain.text_splitter import CharacterTextSplitter
from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [69]:
# remove with regex the brackets and its content from a phrase
def remove_references(text):
    # strip sentenece
    text = text.lower().strip()
    # remove strange characters from documents
    text = text.replace('\xad', '')
    # remove dashes
    text = text.replace('-', '')
    # remove new lines
    text = text.replace('\n', '')
    # remove strange combinations
    text = text.replace('=-', '')
    # remove brackets
    text = re.sub(r'\(\d+\)', '', text)
    # remove figures
    text = re.sub(r'\(\w+ \d+\)', '', text)
    # remove references
    return re.sub(r'\[[\d\- ,]+\]', '', text)

In [70]:
documents = SimpleDirectoryReader('./Knowledge Base/').load_data();

In [71]:
corpus = [document.text for document in documents]

In [72]:
text_splitter = CharacterTextSplitter(        
    separator = ".",
    chunk_size = 300,
    chunk_overlap  = 150,
    length_function = len,
)

In [73]:
texts = text_splitter.create_documents(corpus);
texts = [remove_references(sentence.page_content) for sentence in texts]

Created a chunk of size 312, which is longer than the specified 300
Created a chunk of size 435, which is longer than the specified 300
Created a chunk of size 353, which is longer than the specified 300
Created a chunk of size 385, which is longer than the specified 300
Created a chunk of size 316, which is longer than the specified 300
Created a chunk of size 330, which is longer than the specified 300
Created a chunk of size 314, which is longer than the specified 300
Created a chunk of size 347, which is longer than the specified 300
Created a chunk of size 355, which is longer than the specified 300
Created a chunk of size 329, which is longer than the specified 300
Created a chunk of size 320, which is longer than the specified 300
Created a chunk of size 302, which is longer than the specified 300
Created a chunk of size 378, which is longer than the specified 300
Created a chunk of size 340, which is longer than the specified 300
Created a chunk of size 301, which is longer tha

In [74]:
df = pd.DataFrame({'passages': texts})
df.to_csv('new_passages.csv', index=False)

In [63]:
df = pd.read_csv('new_passages.csv')

In [64]:
# Load model from HuggingFace Hub
similarity_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
similarity_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [65]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [66]:
# questions = df.question.to_list()
# Tokenize sentences
encoded_input = similarity_tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    model_output = similarity_model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings = sentence_embeddings.detach().numpy()

In [21]:
tokenized_sentences = [text.split(" ") for text in texts]
bm25 = BM25Okapi(tokenized_sentences)

In [2]:
LLM = pipeline(
    model="databricks/dolly-v2-3b", 
    torch_dtype=torch.bfloat16, 
    trust_remote_code=True,
    return_full_text=True,
    device_map="auto",
    task="text-generation"
)

In [3]:
# template for an instrution with no input
prompt = PromptTemplate(
    input_variables=["instruction"],
    template="{instruction}")

# template for an instruction with input
prompt_with_context = PromptTemplate(
    input_variables=["instruction", "context"],
    template="{instruction}\n\nInput:\n{context}")

hf_pipeline = HuggingFacePipeline(pipeline=LLM)

llm_chain = LLMChain(llm=hf_pipeline, prompt=prompt)
llm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)

In [32]:
query = "How to maintain marble with cracks"
tokenized_query = similarity_tokenizer(query, padding=True, truncation=True, return_tensors='pt')
embedded_query = similarity_model(**tokenized_query)
question_embeddings = mean_pooling(embedded_query, tokenized_query['attention_mask'])
question_embeddings = question_embeddings.detach().numpy()
scores = cosine_similarity([question_embeddings[0]], sentence_embeddings)[0]
max_pos = np.argmax(scores[1:])
max_score = scores[max_pos+1]
similar_question = texts[max_pos+1]

tokenized_query = query.split(" ")
answer_scores = bm25.get_scores(tokenized_query)
max_score_bm25 = answer_scores.max()
top_answer = bm25.get_top_n(tokenized_query, texts, n=1)[0]

print(f"Query: {query}")
print('------------- Similarity NN -------------')
print(f"Similarity score: {max_score*100:.2f}%")
print(f"Similar question: {similar_question}")
print(f"Context: {texts[max_pos+1]}")
print('------------- Similarity BM25 -------------')
print(f"Similarity score: {max_score_bm25}")
print(f"Top answer: {top_answer}")

Query: How to maintain marble with cracks
------------- Similarity NN -------------
Similarity score: 69.20%
Similar question: what is used to clean marble surfaces?
Context: a mild, phneutral, nonabrasive soap should be used for cleaning marble surfaces. wipe with a soft foam cotton or rag.salt crystallization crystallization of salts within the pores of stones can generate sufficient stresses to cause the cracking of stone, often into powder fragments
------------- Similarity BM25 -------------
Similarity score: 7.432180800723318
Top answer: old plaster removal after consolidation, older mortars that have failed or have caused cracks and other sideeffects are removed by mechanical means. the plaster is not removed in parts of the surface, where they can cause further damage to marble


In [67]:
query = "What creates sulfation?"
tokenized_query = similarity_tokenizer(query, padding=True, truncation=True, return_tensors='pt')
embedded_query = similarity_model(**tokenized_query)
question_embeddings = mean_pooling(embedded_query, tokenized_query['attention_mask'])
question_embeddings = question_embeddings.detach().numpy()
scores = cosine_similarity([question_embeddings[0]], sentence_embeddings)[0]
max_pos = np.argmax(scores[1:])
max_score = scores[max_pos+1]
context = texts[max_pos+1]

print(f'Question: {query}')
print(f'Score: {max_score*100:.2f}')
print(f'Context: {context} \n')
if max_score <= 0.4:
    print("We do not have such context in our knowledge base. Answering with AI without providing it with context, make sure to search the correct answer with critical thinking and research.")
    print(llm_chain.predict(instruction=query).lstrip())
elif max_score <= 0.65 and max_score > 4:
    print("Sorry, i am not exactly sure based on my knowledge base, answering with very low confidence...")
    print(llm_context_chain.predict(instruction=query, context='context').lstrip())
elif max_score <= 0.9 and max_score > 0.65:
    print("Based on the knowledge from database, generating answer...")
    print(llm_context_chain.predict(instruction=query, context=context).lstrip())
else:
    print("Similar question was found with high confidence")
    print(f"Answer: {context}")

Question: What creates sulfation?
Score: 50.42
Context: for example, limestone can react with sulfur dioxide to ultimately produce calcium sulfate. other sources of salts include ground water , airborne salts , sea spray , chemical cleaners , and deicing salts .aqueous dissolution carbonate sedimentary stones e.g 

Similar question was found with high confidence
Answer: for example, limestone can react with sulfur dioxide to ultimately produce calcium sulfate. other sources of salts include ground water , airborne salts , sea spray , chemical cleaners , and deicing salts .aqueous dissolution carbonate sedimentary stones e.g


In [15]:
query = 'Is kythnos a beautiful island?'
llm_chain.predict(instruction=query).lstrip()

'Yes, Kythnos is an extremely beautiful island. People travel from all over the world to visit Kythnos and the beautiful island weather, scenic views, delicious food, and friendly people of the locals makes it a must visit island.'

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 2.54k/2.54k [00:00<00:00, 1.27MB/s]
Downloading: 100%|██████████| 792k/792k [00:00<00:00, 2.33MB/s]
Downloading: 100%|██████████| 2.42M/2.42M [00:00<00:00, 2.93MB/s]
Downloading: 100%|██████████| 2.20k/2.20k [00:00<00:00, 2.20MB/s]
Downloading: 100%|██████████| 662/662 [00:00<00:00, 662kB/s]
Downloading: 100%|██████████| 3.13G/3.13G [01:12<00:00, 43.0MB/s] 
Downloading: 100%|██████████| 147/147 [00:00<00:00, 147kB/s]
