# Try GPT-3 Solution

In [1]:
import os
import torch
import pandas as pd
import numpy as np

from rank_bm25 import BM25Okapi
from langchain.text_splitter import CharacterTextSplitter
from llama_index import SimpleDirectoryReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, DPRQuestionEncoder, DPRContextEncoder

torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Llama Index way
documents = SimpleDirectoryReader('../../documentation/english/').load_data();

unknown widths : 
[0, IndirectObject(1100, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1104, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1108, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1112, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1116, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1120, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1125, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1129, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1104, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1149, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1120, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1112, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1104, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1116, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1100, 0, 2186530953344)]
unknown widths : 
[0, IndirectObject(1155, 0, 2186530953344)]
unknown 

In [4]:
corpus = [document.text for document in documents]

In [5]:
# filter out new line
corpus = [sentence.replace('\n', '') for sentence in corpus]

In [6]:
text_splitter = CharacterTextSplitter(        
    separator = ".",
    chunk_size = 200,
    chunk_overlap  = 50,
    length_function = len,
)

In [7]:
texts = text_splitter.create_documents(corpus);

Created a chunk of size 375, which is longer than the specified 200
Created a chunk of size 316, which is longer than the specified 200
Created a chunk of size 212, which is longer than the specified 200
Created a chunk of size 201, which is longer than the specified 200
Created a chunk of size 285, which is longer than the specified 200
Created a chunk of size 284, which is longer than the specified 200
Created a chunk of size 218, which is longer than the specified 200
Created a chunk of size 208, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 234, which is longer than the specified 200
Created a chunk of size 212, which is longer than the specified 200
Created a chunk of size 232, which is longer than the specified 200
Created a chunk of size 262, which is longer than the specified 200
Created a chunk of size 214, which is longer than the specified 200
Created a chunk of size 255, which is longer tha

In [8]:
# preprocess texts
def clean_text(text):
    # remove \xad
    text = text.lower().strip()
    text = text.replace('\xad', '')
    text = text.replace('-', '')
    return text

In [9]:
texts = [text.page_content for text in texts if len(text.page_content) > 50]
texts = [clean_text(sentence) for sentence in texts]

In [10]:
tokenizer = AutoTokenizer.from_pretrained("voidful/context-only-question-generator")
model = AutoModelForSeq2SeqLM.from_pretrained("voidful/context-only-question-generator")

In [11]:
model.to('cuda');

In [12]:
questions = []
context = []
total_texts = len(texts)
for i, text in enumerate(texts):
    # print progress
    print(f'Checked {i+1}/{total_texts}', end='\r')
    input_ids = tokenizer.encode(text, return_tensors="pt")
    input_ids = input_ids.to('cuda')
    outputs = model.generate(input_ids)
    questions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    context.append(text)

Checked 1/4140



Checked 169/4140

KeyboardInterrupt: 

In [120]:
df = pd.DataFrame({'question': questions, 'context': context})

In [121]:
df.to_csv('questions.csv', index=False)

In [122]:
tokenized_questions = [question.split(" ") for question in questions]
bm25 = BM25Okapi(tokenized_questions)

In [135]:
model_name = "deepset/roberta-base-squad2"

# a) Get predictions
qa_nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [216]:
query = "How to maintain stone with cracks?"
tokenized_query = query.split(" ")
question_scores = bm25.get_scores(tokenized_query)
top_n_questions = bm25.get_top_n(tokenized_query, questions, n=10)

In [217]:
similar_questions = []
for i, question_score in enumerate(question_scores):
    if question_score > 10:
        similar_questions.append(questions[i])
        
if len(similar_questions) == 0:
    print("No similar questions found")
else:
    for question in similar_questions:
        context = df[df.question == question].context.values[0]
        print(f"Question: {question}")
        print(f"Answer: {context}\n")

Question: can dense pointing mortar cause increased crystallisation damage?
Answer: –   dense pointing mortar can also cause increased crystallisation damage by restricting the movement of water through it so that little evaporation  can take place at the joints

Question: Is cement mortar breathable or breathable?
Answer: –  cement mortar is nonbreathable and removes the ability of the wall to allow water to escape.–  when repointing the same mortar should be used that was originally used (i.e., not what was replaced)

Question: the dense mortar of clay is subjected to what?
Answer: this leads to problems when it freezes; the dense mortar tolerates the expansion pressure, but the clay is subjected to spalling



## Load already created csv

In [2]:
df = pd.read_csv('questions.csv')

In [3]:
df.head()

Unnamed: 0,question,context
0,the j paul getty museumpreface is a treatise o...,the conservation of ancient marbledavid rinnec...
1,the paul getty museum is pleased to initiate a...,paul getty museum is pleased to initiate a ser...
2,What is the subject of the second volume of th...,subsequentvolumes will discuss the problems an...
3,What is the first step in a definition of the ...,a definition of the material will be followedb...
4,the final appearance of the sculpture is calle...,the last section will beconcerned with element...


In [4]:
ctx_tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_model.to('cuda');

qst_tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
qst_model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
qst_model.to('cuda');

In [5]:
torch.cuda.empty_cache()

In [6]:
question_embeddings = []
context_embeddings = []
for i in range(len(df)):
    # print progress
    print(f'Checked {i+1}/{len(df)}', end='\r')
    
    question_ids = qst_tokenizer(df.question[i], return_tensors="pt")['input_ids'].to('cuda')
    # question_ids = qst_tokenizer(df.question[i], return_tensors="pt")['input_ids']
    question_embeddings.append(qst_model(question_ids).pooler_output)
    del question_ids
    
    context_ids = ctx_tokenizer(df.context[i], return_tensors="pt")['input_ids'].to('cuda')
    # context_ids = ctx_tokenizer(df.context[i], return_tensors="pt")['input_ids']
    context_embeddings.append(ctx_model(context_ids).pooler_output)
    del context_ids
    
    torch.cuda.empty_cache()

Checked 349/4406

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 12.00 GiB total capacity; 11.32 GiB already allocated; 0 bytes free; 11.32 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF