In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity




In [15]:


def read_file(file_path):
    """Return the entire contents of a text file as one string.

    Args:
        file_path: Path of the file to read. It is opened in text mode and
            decoded as UTF-8.

    Returns:
        The decoded text of the whole file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def split_text(text, chunk_size=250):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (runs of non-whitespace
    characters), so the original spacing and line breaks are collapsed to
    single spaces inside each chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be at least 1.

    Returns:
        A list of chunk strings in document order. The last chunk may hold
        fewer than ``chunk_size`` words; the list is empty when ``text``
        contains no words.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        # A negative step makes range() empty, which would silently discard
        # the whole text; zero would surface as an opaque range() error.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def create_vector_db(chunks):
    """Fit a TF-IDF model on the chunks and vectorize them in one pass.

    Args:
        chunks: The text chunks handed to ``TfidfVectorizer.fit_transform``.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the matrix produced by
        ``fit_transform`` and the fitted ``TfidfVectorizer`` that produced it,
        so later queries can be projected with the same vectorizer.
    """
    fitted_vectorizer = TfidfVectorizer()
    return fitted_vectorizer.fit_transform(chunks), fitted_vectorizer

def find_most_relevant_chunk(query, tfidf_matrix, vectorizer):
    """Return the index of the chunk that scores highest against the query.

    The query is projected with the already-fitted ``vectorizer`` and compared
    against ``tfidf_matrix`` using cosine similarity.

    Args:
        query: The question text.
        tfidf_matrix: Matrix of chunk vectors to compare against.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        The result of ``argmax()`` over the similarity scores, i.e. the
        position of the best-scoring chunk.
    """
    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix)
    best_idx = scores.argmax()
    return best_idx

def answer_question(question, context, model, tokenizer):
    """Extract an answer span for ``question`` from ``context``.

    The question/context pair is encoded, run through the QA model, and the
    tokens between the highest-scoring start and end positions are decoded
    back into text.

    Args:
        question: The question text.
        context: The passage the answer should be extracted from.
        model: Callable QA model whose output exposes ``start_logits`` and
            ``end_logits``.
        tokenizer: Tokenizer matching ``model``; it is called on the pair and
            must provide ``convert_ids_to_tokens`` and
            ``convert_tokens_to_string``.

    Returns:
        The decoded answer text, or an empty string when the predicted span is
        empty (end before start) or consists only of special tokens, e.g. a
        SQuAD 2.0 model typically points at ``[CLS]`` for unanswerable
        questions.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated ``encode_plus``.
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        max_length=512,  # cap the encoded pair at 512 tokens
        truncation=True,
    )
    input_ids = inputs["input_ids"][0].tolist()

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # Plain ints make the slice bounds explicit instead of relying on 0-dim
    # tensors being usable as indices.
    start = int(torch.argmax(outputs.start_logits))
    end = int(torch.argmax(outputs.end_logits)) + 1  # slice end is exclusive

    # Start and end are chosen independently, so the end may precede the start.
    if end <= start:
        return ""

    # Drop markers such as [CLS]/[SEP] so they never leak into the answer.
    special_ids = set(getattr(tokenizer, "all_special_ids", None) or ())
    span_ids = [tok for tok in input_ids[start:end] if tok not in special_ids]
    if not span_ids:
        return ""

    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))



In [16]:


def main(file_path='output.txt', model_name="deepset/bert-base-cased-squad2"):
    """Run an interactive question-answering loop over a text file.

    The file is split into word chunks and indexed with TF-IDF. For every
    question typed by the user, the best-matching chunk is selected and passed
    to the QA model, and the extracted answer is printed.

    Args:
        file_path: Path of the UTF-8 text file to answer questions about.
        model_name: Name or path of the question-answering checkpoint handed
            to ``from_pretrained``.

    The loop ends when the user types ``quit`` or when input is closed or
    interrupted.
    """
    # Index the document before loading the model so a missing or empty file
    # is reported immediately instead of after a slow model load.
    text = read_file(file_path)
    chunks = split_text(text)
    if not chunks:
        # Fitting TF-IDF on zero documents would raise; stop with a clear message.
        print(f"No text found in {file_path!r}; nothing to answer questions about.")
        return
    tfidf_matrix, vectorizer = create_vector_db(chunks)

    print("Loading model and tokenizer... This may take a few minutes.")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    print("Model loaded. You can now ask questions.")

    while True:
        try:
            query = input("Ask a question about the text (or type 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the session quietly.
            print()
            break

        if query.lower() == 'quit':
            break
        if not query:
            # A blank query has no terms to match and would just pick chunk 0.
            continue

        most_relevant_idx = find_most_relevant_chunk(query, tfidf_matrix, vectorizer)
        context = chunks[most_relevant_idx]

        answer = answer_question(query, context, model, tokenizer)
        shown = answer if answer.strip() else "(no answer found)"
        print(f"Answer: {shown}\n")

# Start the interactive session only when this code is executed directly
# (as a script or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading model and tokenizer... This may take a few minutes.


Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model loaded. You can now ask questions.
Answer: a functional definition of space

Answer: space is large fuel structure of the universe

Answer: large hill structure of the universe

