# Try GPT-3 Solution

In [1]:
import os
import torch
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F

from rank_bm25 import BM25Okapi
from langchain.text_splitter import CharacterTextSplitter
from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline, DPRQuestionEncoder, DPRContextEncoder
from sklearn.metrics.pairwise import cosine_similarity

torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Llama Index way
documents = SimpleDirectoryReader('../../documentation/english/').load_data();

unknown widths : 
[0, IndirectObject(1100, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1104, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1108, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1112, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1116, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1120, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1125, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1129, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1104, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1149, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1120, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1112, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1104, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1116, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1100, 0, 1880621156240)]
unknown widths : 
[0, IndirectObject(1155, 0, 1880621156240)]
unknown 

In [3]:
corpus = [document.text for document in documents]

In [4]:
# filter out new line
corpus = [sentence.replace('\n', '') for sentence in corpus]

In [5]:
text_splitter = CharacterTextSplitter(        
    separator = ".",
    chunk_size = 300,
    chunk_overlap  = 150,
    length_function = len,
)

In [6]:
texts = text_splitter.create_documents(corpus);

Created a chunk of size 375, which is longer than the specified 300
Created a chunk of size 316, which is longer than the specified 300
Created a chunk of size 319, which is longer than the specified 300
Created a chunk of size 551, which is longer than the specified 300
Created a chunk of size 406, which is longer than the specified 300
Created a chunk of size 827, which is longer than the specified 300
Created a chunk of size 339, which is longer than the specified 300
Created a chunk of size 318, which is longer than the specified 300
Created a chunk of size 311, which is longer than the specified 300
Created a chunk of size 324, which is longer than the specified 300
Created a chunk of size 311, which is longer than the specified 300
Created a chunk of size 359, which is longer than the specified 300
Created a chunk of size 337, which is longer than the specified 300
Created a chunk of size 366, which is longer than the specified 300
Created a chunk of size 337, which is longer tha

In [7]:
# preprocess texts
def clean_text(text):
    # remove \xad
    text = text.lower().strip()
    text = text.replace('\xad', '')
    text = text.replace('-', '')
    return text

In [8]:
texts = [text.page_content for text in texts if len(text.page_content) > 50]
texts = [clean_text(sentence) for sentence in texts]

In [11]:
keep_texts = [text for text in texts if 'marble' in text]
    

In [12]:
keep_texts

['the conservation of ancient marbledavid rinneconservator of antiquitiesthe j. paul getty museumprefacebeginning with this treatise on the conservation of ancient marble, the j. paul getty museum is pleased to initiate a series ofpublications dealing with the care of fine artworks',
 'subsequentvolumes will discuss the problems and treatment of other materials, such as bronze, wood, plaster, and terracotta.this publication deals with the conservation of artworksmade of marble',
 'this publication deals with the conservation of artworksmade of marble. a definition of the material will be followedby descriptions of common problems and their treatment,beginning with those which occur above the surface of thestone and proceeding inward',
 "paul getty museumisbn: 0892360038the conservation of ancient marblethe materialcherished by countless civilizations throughout history for itsbeauty and sculptural qualities, marble is actually a product ofseashells and other calciumbearing minerals dep

In [20]:
tokenizer = AutoTokenizer.from_pretrained("voidful/context-only-question-generator")
model = AutoModelForSeq2SeqLM.from_pretrained("voidful/context-only-question-generator")


In [21]:
model.to('cuda');

In [22]:
questions = []
context = []
total_texts = len(texts)
for i, text in enumerate(texts):
    # print progress
    print(f'Checked {i+1}/{total_texts}', end='\r')
    input_ids = tokenizer.encode(text, return_tensors="pt")
    input_ids = input_ids.to('cuda')
    outputs = model.generate(input_ids)
    questions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    context.append(text)

Checked 1/3574



Checked 167/3574

: 

: 

In [120]:
df = pd.DataFrame({'question': questions, 'context': context})

In [121]:
df.to_csv('questions.csv', index=False)

In [13]:
tokenized_questions = [text.split(" ") for text in keep_texts]
bm25 = BM25Okapi(tokenized_questions)

In [135]:
model_name = "deepset/roberta-base-squad2"

# a) Get predictions
qa_nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [18]:
query = "How to prevent marble degratation from rain?"
tokenized_query = query.split(" ")
question_scores = bm25.get_scores(tokenized_query)
top_n_questions = bm25.get_top_n(tokenized_query, keep_texts, n=2)

In [19]:
top_n_questions

['marble in this state is often "sugary" to the touch and simplyfalls away of its own accord.18 weathering + general decompositionthe light are a shows  erosion  from internal  weakness cause d by water(the pile  of marble dust b y the foot has  fallen awa y from the statue)',
 'the calcarious deposits are either precipitated from ground waters or occur when calcium salts arewashed out of the body of the marble']

In [17]:
similar_questions = []
for i, question_score in enumerate(question_scores):
    if question_score > 10:
        similar_questions.append(questions[i])
        
if len(similar_questions) == 0:
    print("No similar questions found")
else:
    for question in similar_questions:
        context = df[df.question == question].context.values[0]
        print(f"Question: {question}")
        print(f"Answer: {context}\n")

No similar questions found


## Load already created csv

In [2]:
df = pd.read_csv('questions.csv')

In [3]:
old_length = len(df)

In [4]:
key_words = ['marble', 'stone', 'mortar', 'shells', 'shale', 'glass', 'sulfation', 'sulfuration', 'cracks', 'disintegration', 'crystallization', 'cavities', 'holes', 'infection']
# filter rows that context does not contain any word from key_words
df = df[df.question.str.contains('|'.join(key_words))]

In [5]:
new_length = len(df)
print(f"Filtered out {old_length - new_length} rows and left with {new_length} rows")

Filtered out 4004 rows and left with 402 rows


In [6]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


In [7]:
# Load model from HuggingFace Hub
similarity_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
similarity_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [8]:
qa_model_name = "deepset/roberta-base-squad2"
qa_nlp = pipeline('question-answering', model=qa_model_name, tokenizer=qa_model_name)

In [9]:
questions = df.question.to_list()

In [10]:
questions

['is calcium carbonate a component or component of marble?',
 'where can marble be found?',
 'is calcium sulphate the same as marble?',
 'is marble and alabaster the same thing?',
 'for all intents and purposes, marble isinsoluble in what?',
 'How are chlorides carried from the ground to the surface of thestone?',
 'Is marble strong or weak?',
 'why do most people not realize that marble is chemically reactive?',
 'how are marble stains transported to the surface?',
 'How do you remove the glue from a stone?',
 'Why are grouts and grouts harder than marble?',
 'How to clean a marble statue?',
 'what should be done evenly on a stone?',
 'what is the cleansequence of marble?',
 'how many poltices are needed to clean a stone?',
 'How many mechanical devices should be in a marble countertop?',
 'the vermont marble company produces several excellentcleaning products and what else?',
 'what is a goodconservation of marble?',
 'why do you add stains to a stone?',
 'How many stains are on the 

In [11]:
# Tokenize sentences
encoded_input = similarity_tokenizer(questions, padding=True, truncation=True, return_tensors='pt')

In [12]:
with torch.no_grad():
    model_output = similarity_model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings = sentence_embeddings.detach().numpy()

In [13]:
query = "Which mortar is good?"
tokenized_query = similarity_tokenizer(query, padding=True, truncation=True, return_tensors='pt')

In [14]:
embedded_query = similarity_model(**tokenized_query)
question_embeddings = mean_pooling(embedded_query, tokenized_query['attention_mask'])

In [15]:
question_embeddings = question_embeddings.detach().numpy()

In [16]:
scores = cosine_similarity([question_embeddings[0]], sentence_embeddings)[0]

In [17]:
max_pos = np.argmax(scores[1:])
max_score = scores[max_pos+1]
similar_question = questions[max_pos+1]
context = df[df.question == similar_question].context.values[0]
QA_input = {
    'question': query,
    'context': context
}
qa_result = qa_nlp(QA_input)
print(f"Similar question found: {similar_question}")
print(f"Score: {max_score*100:.2f}%")
if max_score < 0.6:
    print("No answer found")
elif max_score < 0.85 and max_score > 0.6:
    print("Answer found but low confidence")
    print(f"Answer: {qa_result['answer']}")
else:
    print("Answer found with high confidence")
    print(f"Answer: {context}")

Similar question found: what type of mortar should be used for terracotta/faience?
Score: 76.99%
Answer found but low confidence
Answer: lime based mortar
