In [1]:
import faiss
import numpy as np
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Paths of the loaded documents, in the same order as the texts returned by
# preprocess_documents(); later cells use it to map a search hit back to its file.
document_paths = []


def preprocess_documents(directory):
    """Read every ``.txt`` file directly inside ``directory``.

    Files are visited in sorted filename order so that the position of a
    document (and therefore its row in any index built from the result) is
    the same on every run and every platform.

    Side effect: the path of each file read is appended to the module-level
    ``document_paths`` list, in the same order as the returned texts.

    Args:
        directory: path of the folder to scan (not recursive).

    Returns:
        list[str]: the full text of each ``.txt`` file.
    """
    combined_text = []

    # os.listdir() gives entries in arbitrary order; sort for reproducible indices.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        # Explicit utf-8 (same as read_document) instead of the platform default.
        with open(filepath, 'r', encoding='utf-8') as file:
            combined_text.append(file.read())
        document_paths.append(filepath)
    return combined_text

# Folder holding the scraped pages; relative to the notebook's working directory.
directory = "./downloaded_pages/Block/Normal_Blocks"
# Load the corpus; this also fills `document_paths` as a side effect.
documents = preprocess_documents(directory)

In [3]:
# Fit a TF-IDF vocabulary on the corpus and vectorise it (one row per document).
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(documents)
# Densify and cast to float32 before the vectors are added to the FAISS index below.
document_vectors_dense = document_vectors.toarray().astype('float32')

In [4]:
# Vector dimensionality = number of columns of the dense TF-IDF matrix.
dim = document_vectors_dense.shape[1]

# Create a flat L2 FAISS index and add every document vector to it.
# Vectors are added in `documents` order, so a hit's position is its document's position.
index = faiss.IndexFlatL2(dim)
index.add(document_vectors_dense)

In [5]:
def search_in_index(query, k=5):
    """Return up to ``k`` documents closest to ``query`` in the FAISS index.

    The query is vectorised with the fitted module-level ``vectorizer`` and
    looked up in the module-level ``index``.

    Args:
        query: free-text query string.
        k: maximum number of hits to return.

    Returns:
        list[tuple[int, str]]: ``(document_index, label)`` pairs in the order
        FAISS returned them (closest first); the label comes from
        ``get_document_title_or_excerpt``. Fewer than ``k`` pairs (possibly
        none) are returned when the index holds fewer vectors.
    """
    query_vector = vectorizer.transform([query]).toarray().astype('float32')

    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    _, indices = index.search(query_vector, k)

    results = []
    for idx in indices[0]:
        # FAISS pads missing neighbours with -1. Used as a list index that would
        # silently select the *last* document, so drop those slots.
        if idx < 0:
            continue
        idx = int(idx)
        results.append((idx, get_document_title_or_excerpt(idx)))

    return results

def get_document_title_or_excerpt(index):
    """Build the placeholder label for the document at position ``index``."""
    label = "Document at index: {}".format(index)
    return label


In [6]:
# Smoke test of the retrieval step with a single-word query.
query = "hunger"
search_results = search_in_index(query)

# Each result is (document_index, label); the index is used to look up the file path.
for result in search_results:
    print(f"Document Index: {result[0]}, Document Path: {document_paths[result[0]]}")

Document Index: 50, Document Path: ./downloaded_pages/Block/Normal_Blocks\Cake.txt
Document Index: 42, Document Path: ./downloaded_pages/Block/Normal_Blocks\Blue_Orchid.txt
Document Index: 103, Document Path: ./downloaded_pages/Block/Normal_Blocks\Dandelion.txt
Document Index: 56, Document Path: ./downloaded_pages/Block/Normal_Blocks\Carrot.txt
Document Index: 146, Document Path: ./downloaded_pages/Block/Normal_Blocks\Glow_Berries.txt


In [7]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Module-level tokenizer and QA model (SQuAD-fine-tuned BERT-large checkpoint);
# split_into_chunks() and answer_question() below read these globals.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [8]:
def read_document(file_path):
    """Return the complete text of the UTF-8 encoded file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
    
def split_into_chunks(document, max_length=500):
    """Split ``document`` into text chunks of at most ``max_length`` tokens.

    The text is tokenized with the module-level ``tokenizer``; consecutive
    runs of ``max_length`` tokens are turned back into text. Every chunk
    except possibly the last holds exactly ``max_length`` tokens.

    Chunks are rebuilt with ``tokenizer.convert_tokens_to_string`` rather than
    a plain space join, so sub-word pieces are merged back into words and the
    chunk text can safely be tokenized again later.

    Args:
        document: text to split.
        max_length: maximum number of tokens per chunk (must be >= 1).

    Returns:
        list[str]: the chunk texts, in document order; empty for empty input.

    Raises:
        ValueError: if ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    tokenized_text = tokenizer.tokenize(document)
    return [
        tokenizer.convert_tokens_to_string(tokenized_text[start:start + max_length])
        for start in range(0, len(tokenized_text), max_length)
    ]


def answer_question(question, answer_text):
    """Extract the answer to ``question`` from ``answer_text``.

    Uses the module-level ``tokenizer`` and ``model``. The most likely start
    and end positions are chosen independently (argmax of each logit vector)
    and the tokens between them are converted back to text.

    Args:
        question: the question string.
        answer_text: passage expected to contain the answer. If question plus
            passage exceed the model's maximum sequence length, the passage
            (only) is truncated, so an answer near its end may be lost.

    Returns:
        str: the predicted answer span; empty when the predicted end precedes
        the predicted start.
    """
    # encode_plus also returns token_type_ids (segment ids). Passing only
    # input_ids would make the model treat the passage as part of the question
    # segment, which is not how the QA checkpoint was trained.
    inputs = tokenizer.encode_plus(
        question,
        answer_text,
        add_special_tokens=True,
        truncation="only_second",
        max_length=model.config.max_position_embeddings,
        return_tensors="pt",
    )

    # Inference only: no autograd graph needed.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    span_ids = inputs["input_ids"][0][answer_start:answer_end].tolist()
    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))

In [9]:
# Draft of a multi-document answer search, disabled by wrapping it in a string literal.
# NOTE(review): as written it never updates `best_answer` or `highest_score`,
# so enabling it unchanged would always return "".
'''
def get_best_answer(question, document_paths):
    best_answer = ""
    highest_score = -float('inf')

    for path in document_paths:
        document = read_document(path)
        chunks = split_into_chunks(document)

        for chunk in chunks:
            answer = answer_question(question, chunk)
            # TODO: Add logic to evaluate confidence

            print(answer)

    return best_answer
'''

In [19]:
# Disabled driver (string literal, not executed) for the get_best_answer draft;
# it cannot run while get_best_answer itself is disabled.
'''
question = "How can a wet sponge be turned back into a normal sponge?"
search_results = search_in_index(question)
top_document_paths = []

for result in search_results:
    top_document_paths.append(document_paths[result[0]])
    print(document_paths[result[0]])

answer = get_best_answer(question, top_document_paths)
print("Answer:", answer)
'''

'\nquestion = "How can a wet sponge be turned back into a normal sponge?"\nsearch_results = search_in_index(question)\ntop_document_paths = []\n\nfor result in search_results:\n    top_document_paths.append(document_paths[result[0]])\n    print(document_paths[result[0]])\n\nanswer = get_best_answer(question, top_document_paths)\nprint("Answer:", answer)\n'

In [13]:
from collections import OrderedDict
class DocumentReader:
    """Extractive question answering over one (question, document) pair.

    Usage: call ``tokenize(question, text)`` and then ``get_answer()``.
    Inputs longer than the model's maximum sequence length are split into
    chunks that each repeat the question; the per-chunk answers are
    concatenated.
    """

    def __init__(self, pretrained_model_name_or_path='bert-large-uncased-whole-word-masking-finetuned-squad'):
        """Load the tokenizer and QA model.

        Args:
            pretrained_model_name_or_path: checkpoint passed to
                ``from_pretrained``; defaults to the checkpoint this class
                previously hard-coded.
        """
        self.READER_PATH = pretrained_model_name_or_path
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)
        self.max_len = self.model.config.max_position_embeddings
        self.chunked = False

    def tokenize(self, question, text):
        """Encode ``question`` and ``text``; chunk the encoding if it is too long.

        Sets ``self.inputs`` (a single encoding, or a mapping of chunk number
        to encoding), ``self.input_ids`` (ids of the unchunked encoding) and
        ``self.chunked``.
        """
        # Reset first: a reader reused after a long document must not keep
        # treating the next (short) encoding as a dict of chunks.
        self.chunked = False
        self.inputs = self.tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
        self.input_ids = self.inputs["input_ids"].tolist()[0]

        if len(self.input_ids) > self.max_len:
            self.inputs = self.chunkify()
            self.chunked = True

    def chunkify(self):
        """Split ``self.inputs`` into model-sized chunks that each start with the question.

        Returns:
            OrderedDict: chunk number -> dict with the same keys as
            ``self.inputs``, each value a tensor with a leading batch
            dimension of 1.

        Raises:
            ValueError: if the question leaves no room for context tokens.
        """
        # Segment id 0 marks the question part of the encoding.
        qmask = self.inputs['token_type_ids'].lt(1)
        qt = torch.masked_select(self.inputs['input_ids'], qmask)
        # Reserve one position for the separator appended to non-final chunks.
        chunk_size = self.max_len - qt.size()[0] - 1
        if chunk_size < 1:
            raise ValueError("question is too long to leave room for any context tokens")

        sep_token_id = self.tokenizer.sep_token_id

        chunked_input = OrderedDict()
        for key, values in self.inputs.items():
            question_part = torch.masked_select(values, qmask)
            context_part = torch.masked_select(values, ~qmask)
            pieces = torch.split(context_part, chunk_size)
            last = len(pieces) - 1

            for i, piece in enumerate(pieces):
                chunk = torch.cat((question_part, piece))
                # The final piece already ends with the original separator;
                # every other chunk needs one appended (value 1 in the other
                # tensors keeps them aligned with the extra token).
                if i != last:
                    filler = sep_token_id if key == 'input_ids' else 1
                    chunk = torch.cat((chunk, torch.tensor([filler])))

                chunked_input.setdefault(i, {})[key] = torch.unsqueeze(chunk, dim=0)
        return chunked_input

    def get_answer(self):
        """Return the predicted answer for the inputs prepared by ``tokenize``.

        For chunked inputs the answers of all chunks are joined, each followed
        by a space; chunks whose answer is exactly ``[CLS]`` are skipped.
        """
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            if not self.chunked:
                return self._extract_span(self.inputs)

            answer = ''
            for chunk in self.inputs.values():
                ans = self._extract_span(chunk)
                if ans != '[CLS]':
                    answer += ans + " "
            return answer

    def _extract_span(self, inputs):
        """Run the model on one encoding and decode the argmax start/end span."""
        outputs = self.model(**inputs)
        answer_start = int(torch.argmax(outputs.start_logits))
        answer_end = int(torch.argmax(outputs.end_logits)) + 1
        return self.convert_ids_to_string(inputs['input_ids'][0][answer_start:answer_end])

    def convert_ids_to_string(self, input_ids):
        """Convert a sequence of token ids back into text."""
        return self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids))

In [14]:
reader = DocumentReader()
question = "How can players obtain budding amethyst blocks in the game?"
search_results = search_in_index(question)

# Map every hit (document_index, label) back to its file path, then echo the paths.
top_document_paths = [document_paths[result[0]] for result in search_results]
for path in top_document_paths:
    print(path)

# Only the best-ranked document is read and passed to the reader.
document = read_document(top_document_paths[0])
reader.tokenize(question, document)
print(f"Answer: {reader.get_answer()}")

./downloaded_pages/Block/Normal_Blocks\Budding_Amethyst.txt
./downloaded_pages/Block/Normal_Blocks\Amethyst_Cluster.txt
./downloaded_pages/Block/Normal_Blocks\Block_of_Amethyst.txt
./downloaded_pages/Block/Normal_Blocks\Tinted_Glass.txt
./downloaded_pages/Block/Normal_Blocks\Calcite.txt
Answer: via the creative inventory or with commands


In [15]:
def ask_question(question, reader=None):
    """Answer ``question`` from the best-matching indexed document.

    Retrieves documents with ``search_in_index``, reads the top-ranked one
    and runs a ``DocumentReader`` over it.

    Args:
        question: the question string.
        reader: optional ``DocumentReader`` to use. When omitted, one instance
            is created on first use and reused by later calls, instead of
            reloading the model for every question.

    Returns:
        str: the reader's answer, or ``""`` when retrieval returned nothing.
    """
    if reader is None:
        cached = getattr(ask_question, "_reader", None)
        # Rebuild if nothing is cached yet, or if DocumentReader has been
        # redefined since (a stale instance is not an instance of the new class).
        if not isinstance(cached, DocumentReader):
            cached = DocumentReader()
            ask_question._reader = cached
        reader = cached

    search_results = search_in_index(question)
    if not search_results:
        return ""

    # Each result is (document_index, label); only the best hit is read.
    top_document_path = document_paths[search_results[0][0]]
    document = read_document(top_document_path)

    # Clear any chunking state left over from a previous question before reuse.
    reader.chunked = False
    reader.tokenize(question, document)
    return reader.get_answer()

In [17]:
import json
import pandas as pd

# Flatten the nested QA file (data -> paragraphs -> qas -> answers) into one
# row per answer. Explicit utf-8 avoids depending on the platform default.
with open('./part_scraped_final_minecraft_dataset(1).json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# A comprehension keeps the per-record fields local: the previous loop bound a
# notebook-global named `id`, shadowing the builtin for every later cell.
qa_data = [
    {
        'title': entry['title'],
        'context': paragraph['context'],
        'question': qa['question'],
        'id': qa['id'],
        'answer_text': answer['text'],
        'answer_start': answer['answer_start'],
    }
    for entry in data['data']
    for paragraph in entry['paragraphs']
    for qa in paragraph['qas']
    for answer in qa['answers']
]

df = pd.DataFrame(qa_data)

In [18]:
# Show the flattened QA frame (one row per question/answer pair).
df

Unnamed: 0,title,context,question,id,answer_text,answer_start
0,Sponge,Overview\n A sponge is a block that can be ...,How can a wet sponge be turned back into a nor...,q1,"A wet sponge can be dried in a furnace, making...",1067
1,Sponge,Overview\n A sponge is a block that can be ...,Where can sponges be found in Minecraft?,q2,Sponges can only be found in ocean monuments.,250
2,Sponge,Overview\n A sponge is a block that can be ...,What is the maximum distance a sponge can abso...,q3,A sponge absorbs both flowing and source block...,1935
3,Leaves,Overview\n Leaves are natural blocks that g...,What are the default tools for breaking leaves...,q1,Hoes are the default tools for breaking leaves.,114
4,Leaves,Overview\n Leaves are natural blocks that g...,Which blocks are affected by the biome dyeing ...,q2,Leaves are affected by the biome dyeing algori...,5351
...,...,...,...,...,...,...
1082,End_Portal_(block),Overview\n The end portal block is a block ...,How can end portal blocks be placed in Java Ed...,q2,It can be placed only by using block placement...,270
1083,End_Portal_(block),Overview\n The end portal block is a block ...,What happens when entities travel to the Overw...,q3,Players traveling to the Overworld appear at t...,829
1084,Bubble_Column,Overview\n A bubble column is a non-solid b...,What are the two ways in which a bubble column...,q1,A bubble column is a non-solid block generated...,31
1085,Bubble_Column,Overview\n A bubble column is a non-solid b...,How can players and mobs make use of bubble co...,q2,Players and air-breathing mobs can enter a bub...,1180


In [19]:
from bert_score import score
# Number of leading dataset rows to evaluate on.
N_EVAL = 50

# Slice once so questions, contexts and references always have the same length,
# even when the frame holds fewer than N_EVAL rows (a fixed range(50) would
# raise IndexError in that case).
eval_rows = df.head(N_EVAL)
questions = eval_rows['question'].to_list()
# Gold contexts are kept for inspection only: ask_question() retrieves its own
# document, so this measures retrieval + reading end to end.
contexts = eval_rows['context'].to_list()
references = eval_rows['answer_text'].to_list()

candidates = []
for question in questions:
    candidates.append(ask_question(question))

# BERTScore precision / recall / F1, one value per (candidate, reference) pair.
P, R, F1 = score(candidates, references, lang="en")

Token indices sequence length is longer than the specified maximum sequence length for this model (605 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2190 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (606 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2024 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2195 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

In [20]:
# Report the mean of each BERTScore tensor: precision, recall, then F1.
for metric in (P, R, F1):
    print(metric.mean())

tensor(0.8413)
tensor(0.8727)
tensor(0.8406)


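Using bert-base: the same pipeline with the `bert-base-uncased` checkpoint, which is not fine-tuned for question answering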

In [21]:
from collections import OrderedDict
class DocumentReader:
    """Extractive question answering over one (question, document) pair.

    Usage: call ``tokenize(question, text)`` and then ``get_answer()``.
    Inputs longer than the model's maximum sequence length are split into
    chunks that each repeat the question; the per-chunk answers are
    concatenated.

    NOTE(review): the default checkpoint here is plain ``bert-base-uncased``.
    The load warning shown in this notebook reports that some
    ``BertForQuestionAnswering`` weights are not initialized from it, so
    answers from this variant should be read as an untrained baseline.
    """

    def __init__(self, pretrained_model_name_or_path='bert-base-uncased'):
        """Load the tokenizer and QA model.

        Args:
            pretrained_model_name_or_path: checkpoint passed to
                ``from_pretrained``; defaults to the checkpoint this class
                previously hard-coded.
        """
        self.READER_PATH = pretrained_model_name_or_path
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)
        self.max_len = self.model.config.max_position_embeddings
        self.chunked = False

    def tokenize(self, question, text):
        """Encode ``question`` and ``text``; chunk the encoding if it is too long.

        Sets ``self.inputs`` (a single encoding, or a mapping of chunk number
        to encoding), ``self.input_ids`` (ids of the unchunked encoding) and
        ``self.chunked``.
        """
        # Reset first: a reader reused after a long document must not keep
        # treating the next (short) encoding as a dict of chunks.
        self.chunked = False
        self.inputs = self.tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
        self.input_ids = self.inputs["input_ids"].tolist()[0]
        if len(self.input_ids) > self.max_len:
            self.inputs = self.chunkify()
            self.chunked = True

    def chunkify(self):
        """Split ``self.inputs`` into model-sized chunks that each start with the question.

        Returns:
            OrderedDict: chunk number -> dict with the same keys as
            ``self.inputs``, each value a tensor with a leading batch
            dimension of 1.

        Raises:
            ValueError: if the question leaves no room for context tokens.
        """
        # Segment id 0 marks the question part of the encoding.
        qmask = self.inputs['token_type_ids'].lt(1)
        qt = torch.masked_select(self.inputs['input_ids'], qmask)
        # Reserve one position for the separator appended to non-final chunks.
        chunk_size = self.max_len - qt.size()[0] - 1
        if chunk_size < 1:
            raise ValueError("question is too long to leave room for any context tokens")

        sep_token_id = self.tokenizer.sep_token_id

        chunked_input = OrderedDict()
        for key, values in self.inputs.items():
            question_part = torch.masked_select(values, qmask)
            context_part = torch.masked_select(values, ~qmask)
            pieces = torch.split(context_part, chunk_size)
            last = len(pieces) - 1

            for i, piece in enumerate(pieces):
                chunk = torch.cat((question_part, piece))
                # The final piece already ends with the original separator;
                # every other chunk needs one appended (value 1 in the other
                # tensors keeps them aligned with the extra token).
                if i != last:
                    filler = sep_token_id if key == 'input_ids' else 1
                    chunk = torch.cat((chunk, torch.tensor([filler])))

                chunked_input.setdefault(i, {})[key] = torch.unsqueeze(chunk, dim=0)
        return chunked_input

    def get_answer(self):
        """Return the predicted answer for the inputs prepared by ``tokenize``.

        For chunked inputs the answers of all chunks are joined, each followed
        by a space; chunks whose answer is exactly ``[CLS]`` are skipped.
        """
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            if not self.chunked:
                return self._extract_span(self.inputs)

            answer = ''
            for chunk in self.inputs.values():
                ans = self._extract_span(chunk)
                if ans != '[CLS]':
                    answer += ans + " "
            return answer

    def _extract_span(self, inputs):
        """Run the model on one encoding and decode the argmax start/end span."""
        outputs = self.model(**inputs)
        answer_start = int(torch.argmax(outputs.start_logits))
        answer_end = int(torch.argmax(outputs.end_logits)) + 1
        return self.convert_ids_to_string(inputs['input_ids'][0][answer_start:answer_end])

    def convert_ids_to_string(self, input_ids):
        """Convert a sequence of token ids back into text."""
        return self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids))

In [29]:
reader = DocumentReader()
question = "How can players obtain budding amethyst blocks in the game?"
search_results = search_in_index(question)

# Map every hit (document_index, label) back to its file path, then echo the paths.
top_document_paths = [document_paths[result[0]] for result in search_results]
for path in top_document_paths:
    print(path)

# Only the best-ranked document is read and passed to the reader.
document = read_document(top_document_paths[0])
reader.tokenize(question, document)
print(f"Answer: {reader.get_answer()}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

./downloaded_pages/Block/Normal_Blocks\Budding_Amethyst.txt
./downloaded_pages/Block/Normal_Blocks\Amethyst_Cluster.txt
./downloaded_pages/Block/Normal_Blocks\Block_of_Amethyst.txt
./downloaded_pages/Block/Normal_Blocks\Tinted_Glass.txt
./downloaded_pages/Block/Normal_Blocks\Calcite.txt
Answer: 


In [41]:
def ask_question(question, reader=None):
    """Answer ``question`` from the best-matching indexed document.

    Retrieves documents with ``search_in_index``, reads the top-ranked one
    and runs a ``DocumentReader`` over it.

    Args:
        question: the question string.
        reader: optional ``DocumentReader`` to use. When omitted, one instance
            is created on first use and reused by later calls, instead of
            reloading the model for every question.

    Returns:
        str: the reader's answer, or ``""`` when retrieval returned nothing.
    """
    if reader is None:
        cached = getattr(ask_question, "_reader", None)
        # Rebuild if nothing is cached yet, or if DocumentReader has been
        # redefined since (a stale instance is not an instance of the new class).
        if not isinstance(cached, DocumentReader):
            cached = DocumentReader()
            ask_question._reader = cached
        reader = cached

    search_results = search_in_index(question)
    if not search_results:
        return ""

    # Each result is (document_index, label); only the best hit is read.
    top_document_path = document_paths[search_results[0][0]]
    document = read_document(top_document_path)

    # Clear any chunking state left over from a previous question before reuse.
    reader.chunked = False
    reader.tokenize(question, document)
    return reader.get_answer()

In [42]:
import json
import pandas as pd

# Flatten the nested QA file (data -> paragraphs -> qas -> answers) into one
# row per answer. Explicit utf-8 avoids depending on the platform default.
with open('./part_scraped_final_minecraft_dataset(1).json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# A comprehension keeps the per-record fields local: the previous loop bound a
# notebook-global named `id`, shadowing the builtin for every later cell.
qa_data = [
    {
        'title': entry['title'],
        'context': paragraph['context'],
        'question': qa['question'],
        'id': qa['id'],
        'answer_text': answer['text'],
        'answer_start': answer['answer_start'],
    }
    for entry in data['data']
    for paragraph in entry['paragraphs']
    for qa in paragraph['qas']
    for answer in qa['answers']
]

df = pd.DataFrame(qa_data)

In [None]:
from bert_score import score
# Number of leading dataset rows to evaluate on.
N_EVAL = 50

# Slice once so questions, contexts and references always have the same length,
# even when the frame holds fewer than N_EVAL rows (a fixed range(50) would
# raise IndexError in that case).
eval_rows = df.head(N_EVAL)
questions = eval_rows['question'].to_list()
# Gold contexts are kept for inspection only: ask_question() retrieves its own
# document, so this measures retrieval + reading end to end.
contexts = eval_rows['context'].to_list()
references = eval_rows['answer_text'].to_list()

candidates = []
for question in questions:
    candidates.append(ask_question(question))

# BERTScore precision / recall / F1, one value per (candidate, reference) pair.
P, R, F1 = score(candidates, references, lang="en")

In [44]:
# Report the mean of each BERTScore tensor: precision, recall, then F1.
for metric in (P, R, F1):
    print(metric.mean())

tensor(0.5359)
tensor(0.8355)
tensor(0.5642)
