In [None]:
!pip install transformers
!pip install datasets
import torch

In [None]:
!unzip documents.zip

In [25]:
import os
# Folder holding one text file per document.
DOC_FOLDER = '/content/content/documents'

# os.listdir returns entries in arbitrary order; sorting keeps docs/doc_names
# (and therefore tie-breaking in the retrieval step) reproducible between runs.
documents = sorted(os.listdir(DOC_FOLDER))

# docs[i] is the flattened text of the file named doc_names[i].
docs = []
doc_names = []
for doc in documents:
  doc_path = os.path.join(DOC_FOLDER, doc)

  # Skip sub-directories and other non-file entries, which open() cannot read.
  if not os.path.isfile(doc_path):
    continue

  # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
  with open(doc_path, 'r', encoding='utf-8') as file:
    # Flatten the document onto a single line.
    data = file.read().replace('\n', ' ')
  docs.append(data)
  doc_names.append(doc)

In [18]:
def segment_documents(docs, max_doc_length=450):
  """Break long documents into chunks of at most `max_doc_length` words.

  Words are obtained by splitting on single spaces, which only roughly
  approximates the model's token count.

  Args:
    docs: list of document strings.
    max_doc_length: maximum number of space-separated words per chunk.

  Returns:
    A list of strings: documents within the limit are passed through
    unchanged, longer ones are replaced by their consecutive chunks.
  """
  pieces = []

  for text in docs:
    words = text.split(" ")

    # Short enough already: keep the document exactly as it is.
    if len(words) <= max_doc_length:
      pieces.append(text)
      continue

    # Otherwise emit consecutive, non-overlapping windows of words.
    pieces.extend(
      " ".join(words[start:start + max_doc_length])
      for start in range(0, len(words), max_doc_length)
    )

  return pieces

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_articles(query, docs, k=2):
  """Return the `k` documents most similar to `query` under TF-IDF cosine similarity.

  The query and the documents are vectorized together, so the query's words
  are part of the fitted vocabulary and document frequencies.

  Args:
    query: the question text.
    docs: list of document strings to rank.
    k: number of documents to return.

  Returns:
    A list of up to `k` strings from `docs`, highest similarity first.
    Documents with equal scores keep their original relative order.
  """
  # Nothing to rank; the batched similarity call below cannot compare against zero rows.
  if not docs:
    return []

  # Initialize a vectorizer that removes English stop words
  vectorizer = TfidfVectorizer(analyzer="word", stop_words='english')

  # Create a corpus of query and documents and convert to TFIDF vectors
  query_and_docs = [query] + docs
  matrix = vectorizer.fit_transform(query_and_docs)

  # Row 0 is the query; compare it against all document rows in one call
  # instead of one cosine_similarity call per document.
  scores = cosine_similarity(matrix[0], matrix[1:])[0]

  # Sort scores in descending order (stable sort) and keep the top k documents
  sorted_list = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
  return [docs[index] for index, _ in sorted_list[:k]]

In [20]:
from transformers import BertTokenizer, BertForQuestionAnswering

# Single checkpoint name shared by model and tokenizer, so the two cannot drift apart.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [21]:
def answer_question(question, answer_text):
    """Extract the span of `answer_text` that the model picks as the answer to `question`.

    Uses the module-level `tokenizer` and `model`.

    Args:
        question: the question string (encoded as segment A).
        answer_text: the passage to search for the answer (encoded as segment B).

    Returns:
        The answer string rebuilt from the selected tokens. It is also printed
        as `Answer: "<answer>"`. If the highest-scoring end position lies before
        the start position, only the start token is returned.
    """
    # Request truncation explicitly so the 512-token cap is enforced rather than
    # left to the library's implicit default.
    input_ids = tokenizer.encode(question, answer_text, max_length=512, truncation=True)

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token itself.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                        token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                        return_dict=True)

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores, as plain ints
    # so they can index the token list and drive range().
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits))

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):

        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]

        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]

    print('Answer: "' + answer + '"')

    # Hand the answer back as well so callers can use it programmatically.
    return answer

In [26]:
def get_slides(candidate_docs,doc_names,docs):
  """Map retrieved candidate texts back to the files they came from.

  Args:
    candidate_docs: list of candidate strings, either whole documents or
      segments cut out of a longer document.
    doc_names: file names, parallel to `docs`.
    docs: full document texts, parallel to `doc_names`.

  Returns:
    A list of `(prefix, slide)` tuples in candidate order, where `prefix` is
    the file name up to the first '_' and `slide` is the text between the
    first and second '_' with everything from the first '.' dropped
    (e.g. '006_Slide13.txt' -> ('006', 'Slide13')). A file name without '_'
    yields `(name, '')`. Candidates that match no document contribute nothing.
  """
  result = []
  for cand_doc in candidate_docs:
    # Prefer exact matches (whole documents).
    matches = [j for j, doc in enumerate(docs) if doc == cand_doc]

    # A segment of a longer document never equals the full text, but it is a
    # substring of it; fall back to containment so segments are not silently dropped.
    if not matches and cand_doc:
      matches = [j for j, doc in enumerate(docs) if cand_doc in doc]

    for j in matches:
      name_parts = doc_names[j].split('_')
      prefix = name_parts[0]
      # Guard against file names without '_' instead of raising IndexError.
      slide = name_parts[1].split('.')[0] if len(name_parts) > 1 else ''
      result.append((prefix, slide))

  return result
    


In [24]:
# Question to answer against the document collection
query = "What is 2's complement?"
#query = "What else does the bassist for Death From Above play?"
#query = "What projects is Jesse Keeler involved in?"

# Break long documents into chunks of at most 450 words
segmented_docs = segment_documents(docs, max_doc_length=450)

# Keep the 3 chunks ranked most relevant to the question
candidate_docs = get_top_k_articles(query, segmented_docs, k=3)
# print(candidate_docs)
# # Return the likeliest answers from each of our top k most relevant documents in descending order
# for i in candidate_docs:
#   answer_question(query, i)
#   print ("Reference Document: ", i)

['2’s Complement is 1’s Complement Plus One!  Again for N=5: 11111 @*%-1) (answer) + 1 The first step is trivial: replace 0 with 1, and 1  with 0. The result ((2% — 1) — K) is called the 1’s complement of K.  Adding 1 more gives the 2’s complement.      \x0c', 'What about 2’s Complement?  How do we convert N-bit 2’s complement to (N+k)-bit 2’s complement (for k > 0)? For non-negative values,  °2’s complement is the same as unsigned (with an extra 0 for the sign) °So add k more leading 0s.  What about negative values?       \x0c', 'Distinguish 2’s Complement from Negation  Here or elsewhere, you will hear the phrase “take the 2’s complement.”  We will try not to use “2’s complement” in that way.  Students get confused between the  2’s complement representation for signed integers and the operation of negation on a bit pattern for a number represented with 2’s complement.  For clarity, we suggest that you do the same.         \x0c']
Answer: "1 ’ s complement plus one"
Reference Document:

In [27]:
# Look up which file each retrieved candidate came from; get_slides reports
# every matching file name as (part before the first '_', following part without extension).
op = get_slides(candidate_docs,doc_names,docs)
# Bare expression so the notebook displays the resulting list.
op

[('006', 'Slide13'), ('006', 'Slide21'), ('006', 'Slide14')]