In [None]:
pip install torch torchvision torchaudio

In [None]:
pip install sentence-transformers

In [None]:
pip install transformers

In [None]:
pip install faiss-gpu

In [None]:
pip install langchain

In [None]:
pip install -U langchain-community

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer

In [None]:
pip install pymupdf

In [None]:
import os

import fitz
from langchain.docstore.document import Document

pdf_dir = "./nlp/PythonDocs"

# Accumulates one Document per non-empty paragraph found in the PDFs.
documents = []

# Visit the files in sorted order: os.listdir() returns entries in arbitrary
# order, which would make the document order differ between runs.
for pdf_file in sorted(os.listdir(pdf_dir)):
    # Compare the extension case-insensitively so "Guide.PDF" is not skipped.
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)
    print(f"Processing: {pdf_path}")

    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text = page.get_text("text")
                # Split the page on blank lines and keep only pieces that still
                # contain text after stripping; checking the whole page is not
                # enough, because individual pieces can be empty and would
                # otherwise be stored as empty documents.
                for chunk in text.split("\n\n"):
                    chunk = chunk.strip()
                    if chunk:
                        documents.append(Document(page_content=chunk))

    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"Error loading {pdf_path}: {e}")

print(f"Loaded {len(documents)} documents.")


In [None]:
# Embedding model on the CPU; no model name is passed, so the library default is used.
embeddings = HuggingFaceEmbeddings(model_kwargs=dict(device="cpu"))

# Embed every loaded chunk and build a FAISS index over the vectors.
vectorstore = FAISS.from_documents(documents, embeddings)

# Persist the index under the local path "faiss_index" so it can be reloaded later.
vectorstore.save_local("faiss_index")
print("FAISS index created and saved!")

In [None]:
# Reload the persisted index, passing the same embeddings object used to build it.
# NOTE(security): allow_dangerous_deserialization=True opts in to pickle-based
# loading, which can execute arbitrary code. Only load an index that this
# notebook wrote itself; never one obtained from an untrusted source.
vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# Retriever that returns the 3 closest chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

In [None]:

def load_questions_and_answers(filename):
    """Parse a question/answer text file into a ``{question: answer}`` dict.

    The file is treated as a sequence of blocks separated by one or more blank
    (empty or whitespace-only) lines. Within a block the first line is the
    question and the remaining lines, stripped and joined with a newline, form
    the answer. Blank lines at the start of the file are ignored, so the first
    pair is never lost. Blocks that consist of a question with no answer lines
    are skipped. If a question occurs more than once, the last answer wins.

    Args:
        filename: Path of the text file to read. It is decoded as UTF-8.

    Returns:
        dict mapping each question string to its answer string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    questions_and_answers = {}

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        stripped_lines = [line.strip() for line in file]

    # A trailing sentinel blank line flushes the final block inside the loop.
    stripped_lines.append("")

    block = []
    for line in stripped_lines:
        if line:
            block.append(line)
            continue

        # A blank line closes the current block. Only blocks with a question
        # AND at least one answer line are kept.
        if len(block) > 1:
            question, *answer_lines = block
            questions_and_answers[question] = "\n".join(answer_lines)
        block = []

    return questions_and_answers

In [23]:
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer


import torch  # imported here only to probe for a CUDA device

model_name = "deepset/roberta-base-squad2-distilled"

# Alternative extractive-QA checkpoints that can be swapped in:
#model_name = "deepset/bert-base-cased-squad2"
#model_name="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# The pipeline's `device` takes a CUDA ordinal (>= 0) or -1 for the CPU.
# Hard-coding 0 fails on machines without a GPU, so fall back to the CPU.
qa_device = 0 if torch.cuda.is_available() else -1

# Extractive question-answering pipeline: given a question and a context, it
# returns a dict whose 'answer' entry is a span taken from the context.
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=qa_device
)

# NOTE(review): `llm` is not used by any of the visible cells. The LangChain
# HuggingFacePipeline wrapper is documented for text-generation style tasks;
# confirm it supports a question-answering pipeline before relying on it.
llm = HuggingFacePipeline(pipeline=qa_pipeline)


Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Function to split input into chunks
def split_input(text, max_length):
    """Tokenize ``text`` and cut the token ids into consecutive windows.

    The module-level ``tokenizer`` encodes the text without special tokens.

    Args:
        text: The string to tokenize.
        max_length: Maximum number of token ids per window.

    Returns:
        A list of token-id lists, in order; every window holds ``max_length``
        ids except possibly the last, which holds the remainder.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    windows = []
    for start in range(0, len(token_ids), max_length):
        windows.append(token_ids[start:start + max_length])
    return windows

In [25]:
import time

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

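# Sentence-embedding model (on the CPU) used to score how close each predicted answer is to the expected answer.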
similarity_model = SentenceTransformer('all-MiniLM-L6-v2',device='cpu')


def evaluate_va(filename, similarity_threshold=0.4):
    """Run the retrieval + QA pipeline over a question file and score it.

    For every question loaded with ``load_questions_and_answers`` the function
    retrieves context with the module-level ``retriever``, extracts an answer
    with ``qa_pipeline``, and compares that answer with the expected one using
    cosine similarity of ``similarity_model`` embeddings.

    Exactly one entry per question is appended to every result list, so all
    lists have the same length and share the same index. A question whose
    retrieved context is empty cannot be answered; it is recorded with an
    empty answer, a similarity of 0.0 and ``correct`` set to 0.

    Args:
        filename: Path of the question/answer text file.
        similarity_threshold: An answer counts as correct when its similarity
            to the expected answer is strictly greater than this value.

    Returns:
        dict of equal-length lists with the keys 'question', 'context',
        'answer', 'time' (seconds spent on retrieval plus answering),
        'correct' (1 or 0) and 'similarity_score'.
    """
    # Load questions and answers from the text file
    correct_answers = load_questions_and_answers(filename)

    results = {
        'question': [],
        'context': [],
        'answer': [],
        'time': [],
        'correct': [],
        'similarity_score': []
    }

    for question, expected_answer in correct_answers.items():
        start_time = time.time()

        # The number of retrieved chunks is configured on the retriever itself,
        # so no per-call count is passed here.
        retriever_result = retriever.get_relevant_documents(question)
        context = " ".join(doc.page_content for doc in retriever_result)

        answerable = bool(question.strip() and context.strip())
        if answerable:
            response = qa_pipeline(question=question, context=context)
            answer = response['answer']
        else:
            print("Warning: Empty question or context for input.")
            answer = ""

        # Timing covers retrieval and answering, not the similarity scoring.
        elapsed_time = time.time() - start_time

        if answerable:
            response_embedding = similarity_model.encode(answer, convert_to_tensor=True)
            expected_embedding = similarity_model.encode(expected_answer, convert_to_tensor=True)

            # cosine_similarity expects 2-D arrays, hence the added batch axis.
            similarity_score = cosine_similarity(
                response_embedding.unsqueeze(0).cpu().numpy(),
                expected_embedding.unsqueeze(0).cpu().numpy()
            )[0][0]
        else:
            similarity_score = 0.0

        # Append to every list in the same iteration to keep them aligned.
        results['question'].append(question)
        results['context'].append(context)
        results['answer'].append(answer)
        results['time'].append(elapsed_time)
        results['similarity_score'].append(similarity_score)
        results['correct'].append(1 if similarity_score > similarity_threshold else 0)

    return results

In [26]:
filename = './nlp/questions_and_answers.txt'

evaluation_results = evaluate_va(filename)

# The report below pairs the lists index-for-index, so they must all have the
# same length; fail early with a readable message instead of an IndexError or
# silently mismatched rows.
result_lengths = {key: len(values) for key, values in evaluation_results.items()}
assert len(set(result_lengths.values())) == 1, f"Misaligned evaluation results: {result_lengths}"

num_questions = len(evaluation_results['question'])

if num_questions == 0:
    # Nothing was evaluated: avoid dividing by zero and report it explicitly.
    gpt_accuracy = 0.0
    average_time = 0.0
    print("No questions were evaluated.")
else:
    # Share of questions whose answer was judged correct, as a percentage.
    gpt_accuracy = sum(evaluation_results['correct']) / num_questions * 100
    print(f"Accuracy: {gpt_accuracy:.2f}%")

    # Calculate the average time taken
    average_time = sum(evaluation_results['time']) / num_questions
    print(f"Average Time Taken: {average_time:.2f} seconds")


print("\nDetailed Results:")
detail_rows = zip(
    evaluation_results['question'],
    evaluation_results['answer'],
    evaluation_results['time'],
    evaluation_results['similarity_score'],
)
# The retrieved context is available in evaluation_results['context'] but is
# left out of the report to keep the output short.
for question, answer, elapsed, similarity in detail_rows:
    print("======================================Question==========================================")
    print(f"{question:<30}")
    print("-------------------------------------Answer--------------------------------------------")
    print(f"{answer:<50}")
    print("-------------------------------------Evaluation----------------------------------------")
    print(f"{elapsed:<15.2f} {similarity:<15}")




Accuracy: 33.33%
Average Time Taken: 0.32 seconds

Detailed Results:
What is a list in Python and how are lists defined?
-------------------------------------Answer--------------------------------------------
Variables and Data Types                          
-------------------------------------Evaluation----------------------------------------
0.34            0.2597041428089142
How do you define a function in Python?
-------------------------------------Answer--------------------------------------------
def keyword                                       
-------------------------------------Evaluation----------------------------------------
0.32            0.4638255834579468
What is a dictionary in Python and how is it defined?
-------------------------------------Answer--------------------------------------------
an unordered collection of key-value pairs        
-------------------------------------Evaluation----------------------------------------
0.36            0.3907823562622070