In [None]:
from docx import Document
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextGenerationPipeline

def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a .docx file.

    Args:
        file_path: Location of the Word document handed to ``Document``.

    Returns:
        A list with one string per entry of ``doc.paragraphs``, in the
        order the document yields them.
    """
    document = Document(file_path)
    return [paragraph.text for paragraph in document.paragraphs]

def preprocess_text(text):
    """Normalize a string for indexing.

    Every run of whitespace is replaced with a single space, then the
    result is lowercased. Leading and trailing whitespace is collapsed
    like any other run; it is not removed.
    """
    return re.sub(r'\s+', ' ', text).lower()

def preprocess_paragraphs(paragraphs):
    """Apply ``preprocess_text`` to each paragraph and return the results as a list."""
    return list(map(preprocess_text, paragraphs))

def vectorize_paragraphs(paragraphs):
    """Fit a TF-IDF vectorizer on the paragraphs.

    Returns:
        A ``(vectorizer, vectors)`` pair: the fitted ``TfidfVectorizer``
        and the matrix produced by ``fit_transform`` for ``paragraphs``.
    """
    tfidf = TfidfVectorizer()
    paragraph_matrix = tfidf.fit_transform(paragraphs)
    return tfidf, paragraph_matrix

def search(query, vectorizer, vectors, paragraphs):
    """Return the paragraph whose TF-IDF vector is most similar to the query.

    Args:
        query: Search string; transformed with the already-fitted vectorizer.
        vectorizer: Fitted vectorizer used to embed the query.
        vectors: Paragraph vectors to compare the query against.
        paragraphs: Sequence indexed in step with ``vectors``.

    Returns:
        The element of ``paragraphs`` at the position of the highest
        cosine similarity.
    """
    query_matrix = vectorizer.transform([query])
    # One query row against every paragraph row, flattened to one score each.
    scores = cosine_similarity(query_matrix, vectors).ravel()
    best_position = scores.argmax()
    return paragraphs[best_position]

def generate_response(query, vectorizer, vectors, paragraphs, generator):
    """Answer a query using the best-matching paragraph as context.

    The most similar paragraph is retrieved with ``search``, wrapped in a
    "Context / Question / Answer:" prompt, and passed to ``generator``.
    Only the first line of the model's answer is returned: the generated
    text echoes the prompt and the model tends to keep inventing further
    "Question:"/"Answer:" pairs on the following lines.

    Args:
        query: The user's question.
        vectorizer: Fitted vectorizer forwarded to ``search``.
        vectors: Paragraph vectors forwarded to ``search``.
        paragraphs: Candidate context paragraphs forwarded to ``search``.
        generator: Callable invoked as ``generator(prompt, ...)`` whose
            result is indexed as ``result[0]['generated_text']``.

    Returns:
        The first line of the generated answer, stripped of surrounding
        whitespace (an empty string if the model produced nothing).
    """
    retrieved_text = search(query, vectorizer, vectors, paragraphs)
    prompt = f"Context: {retrieved_text}\nQuestion: {query}\nAnswer:"
    response = generator(prompt, max_new_tokens=100, num_return_sequences=1, truncation=True)
    generated_text = response[0]['generated_text']

    if generated_text.startswith(prompt):
        # Usual case: the output begins with the prompt, so drop it verbatim.
        completion = generated_text[len(prompt):]
    else:
        # The prompt was not echoed back unchanged; fall back to the answer
        # marker, and to the whole text if the marker is missing as well.
        _, marker, tail = generated_text.partition("Answer:")
        completion = tail if marker else generated_text

    # Keep only the first line so follow-up Q/A pairs are discarded.
    return completion.strip().split('\n')[0].strip()

def main():
    """Build the retrieval index and the GPT-2 generator, then run the chat loop.

    Paragraphs are read from ``word.docx``, normalized and TF-IDF
    vectorized. A ``gpt2`` text-generation pipeline is created, and an
    interactive prompt answers questions until the user types ``exit`` or
    ``quit`` or the input stream ends.
    """
    file_path = 'word.docx'
    raw_paragraphs = extract_text_from_docx(file_path)
    processed_paragraphs = preprocess_paragraphs(raw_paragraphs)

    vectorizer, vectors = vectorize_paragraphs(processed_paragraphs)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    # Reuse the end-of-sequence token id as the padding token id.
    tokenizer.pad_token_id = tokenizer.eos_token_id

    generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)

    def chatbot():
        """Read queries from stdin and print one answer per query."""
        while True:
            try:
                query = input("You: ").strip()
            except EOFError:
                # Input stream closed (e.g. piped input ran out): stop quietly.
                break
            if not query:
                # A blank query matches nothing; ask again instead of
                # answering from an arbitrary paragraph.
                continue
            if query.lower() in ('exit', 'quit'):
                break
            response = generate_response(query, vectorizer, vectors, processed_paragraphs, generator)
            print(f"Bot: {response}")

    chatbot()

# Start the chatbot only when this file is run directly, not when it is imported.
if __name__ == "__main__":
    main()


You:  Tell me about cuba


Bot: Context: during my presidency, we had no terror attacks and very few terrorists entered the country. in 2019, zero terrorists entered. now, thousands of terrorists are pouring in. we would not have russian warships and nuclear submarines surrounding cuba. the media doesn’t talk about it, but it’s a significant threat.
Question: Tell me about cuba
Answer: cuba is a Russian military base in the Baltic Sea. It is located in the Russian-occupied Crimea.
Question: What is the significance of the Russian military base in the Baltic Sea?
Answer: It is a military base.
Question: What is the significance of the Russian military base in the Baltic Sea?
Answer: It is a military base.
Question: What is the significance of the Russian military base in the Baltic Sea?
Answer: It is a military base.


You:  Tell me about china


Bot: Context: china, officially the people's republic of china (prc), is a country in east asia. with a population exceeding 1.4 billion
Question: Tell me about china
Answer: China is a country in east asia. with a population exceeding 1.4 billion
Question: What is china?
Answer: China is a country in east asia. with a population exceeding 1.4 billion
Question: What is china?
Answer: China is a country in east asia. with a population exceeding 1.4 billion
Question: What is china?
Answer: China is a country in east asia. with a population
