In [2]:
import os
import pdfplumber
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from pathlib import Path
import re

In [None]:
pdf_path = "test.pdf" 

def clean_text(text):
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
    
text_chunks = []
with pdfplumber.open(pdf_path) as pdf:
    
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()
        if text:
            text_chunks.append(clean_text(text))

print(f"Extracted {len(text_chunks)} pages")
text_chunks

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Extracted 2 pages


['1. What is Gradient descent? Gradient descent in a neural network is a fundamental optimization technique used to train the network by iteratively adjusting its parameters (weights and biases) to minimize a loss function, which quantifies the error between the network\'s predictions and the actual target values; the process begins with a forward pass, where input data is propagated through the network layer by layer, each layer applying weighted sums followed by non-linear activation functions (like ReLU or sigmoid) to produce outputs, culminating in a final prediction. The loss function (such as mean squared error for regression or cross-entropy for classification) is then computed to measure how far off the predictions are from the ground truth; next comes the backward pass, known as backpropagation, where the network calculates the gradient of the loss with respect to each parameter by applying the chain rule of calculus through each layer in reverse —from output back to input—so 

In [None]:
import re

def split_into_chunks(text, max_chars=500):
    sentences = re.findall(r'[^.!?]+[.!?]?(?=\s+|$)', text)
    
    chunks = []
    current = ""
    
    for sentence in sentences:
        if len(current) + len(sentence) <= max_chars:
            current += " " + sentence.strip()
        else:
            if current:
                chunks.append(current.strip())
            current = sentence.strip()
    
    if current:
        chunks.append(current.strip())
    
    return chunks

paragraphs = []
for page in text_chunks:
    paragraphs.extend(split_into_chunks(page))
paragraphs

['1. What is Gradient descent?',
 "Gradient descent in a neural network is a fundamental optimization technique used to train the network by iteratively adjusting its parameters (weights and biases) to minimize a loss function, which quantifies the error between the network's predictions and the actual target values; the process begins with a forward pass, where input data is propagated through the network layer by layer, each layer applying weighted sums followed by non-linear activation functions (like ReLU or sigmoid) to produce outputs, culminating in a final prediction.",
 'The loss function (such as mean squared error for regression or cross-entropy for classification) is then computed to measure how far off the predictions are from the ground truth; next comes the backward pass, known as backpropagation, where the network calculates the gradient of the loss with respect to each parameter by applying the chain rule of calculus through each layer in reverse —from output back to in

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def embed(text):
    return embedding_model.encode([text])[0] 

embeddings = model.encode(paragraphs, show_progress_bar=True)
dimension = embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

print("FAISS index built.")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index built.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question(question, top_k=10):
    question_embedding = model.encode([question])
    distances, indices = index.search(np.array(question_embedding), top_k)
    
    top_chunks = [paragraphs[i] for i in indices[0]]
    
    for i, chunk in enumerate(top_chunks):
        print(f"\nContext {i+1}:\n{chunk[:300]}...")
    
    answers = []
    for context in top_chunks:
        answer = qa_pipeline(question=question, context=context)
        answers.append((answer["answer"], answer["score"]))
    
    answers.sort(key=lambda x: -x[1])
    return answers[0][0]



Device set to use mps:0


In [15]:
question = "What is back propgation?"
answer = answer_question(question)
print(f"\n🧠 Answer: {answer}")


Context 1:
The loss function (such as mean squared error for regression or cross-entropy for classification) is then computed to measure how far off the predictions are from the ground truth; next comes the backward pass, known as backpropagation, where the network calculates the gradient of the loss with resp...

Context 2:
2. What is Regulariztion?...

Context 3:
Gradient descent in a neural network is a fundamental optimization technique used to train the network by iteratively adjusting its parameters (weights and biases) to minimize a loss function, which quantifies the error between the network's predictions and the actual target values; the process begi...

Context 4:
, temporarily ignored) in each forward and backward pass, which forces the network to not rely too heavily on specific paths and promotes redundancy and robustness in learning; early stopping is another method where training is halted once performance on a validation set stops improving, thus avoidi...

Context 5:

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

long_text = " ".join(paragraphs[:5])
summary = summarizer(long_text, max_length=250, min_length=30, do_sample=False)[0]["summary_text"]
print(f"\n📝 Summary:\n{summary}")


Device set to use mps:0



📝 Summary:
Gradient descent in a neural network is a fundamental optimization technique used to train the network. It is used to minimize a loss function, which quantifies the error between the network's predictions and the actual target values. Regularization in neural networks is a set of techniques used to prevent overfitting.


In [None]:
import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain import HuggingFaceHub 
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "HUGGINGFACEHUB_API_TOKEN" 

if __name__ == "__main__":
    pdf_path = "HInfinity_Final.pdf"
    loader = PyPDFLoader(file_path=pdf_path)
    document = loader.load()

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=30)
    doc = text_splitter.split_documents(documents=document)

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  
    vectorstore = FAISS.from_documents(documents=doc, embedding=embeddings)
    vectorstore.save_local("QML-Learnings")
    

    generator = pipeline("text2text-generation", model="google/flan-t5-base")
    
    llm = HuggingFacePipeline(pipeline=generator)


    new_vectorstore = FAISS.load_local("QML-Learnings", embeddings,allow_dangerous_deserialization=True)
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=new_vectorstore.as_retriever())
    print(qa.input_keys)

    user_query = ""

    while user_query.lower() != "thank you":
        user_query = input("Ask: ")
        result = qa.run(user_query)

        print("Answer:", result)


In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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" 
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.retrieval_qa.base import RetrievalQA

prompt_template = """You are a helpful AI assistant. Use the context below to answer the question.

Context:
{context}

Question: {question}
Answer:"""

prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)


if __name__ == "__main__":
    pdf_path = "HInfinity_Final.pdf"
    loader = PyPDFLoader(file_path=pdf_path)
    document = loader.load()

    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=30)
    doc = text_splitter.split_documents(documents=document)

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") 
    vectorstore = FAISS.from_documents(documents=doc, embedding=embeddings)
    vectorstore.save_local("QML-Learnings")
    
    generator = pipeline("text2text-generation", model="google/flan-t5-base")
    # generator = pipeline("text2text-generation", model="meta-llama/Llama-2-7b-chat-hf") #["meta-llama/Llama-2-7b-chat-hf", "google/flan-t5-base"]
    
    llm = HuggingFacePipeline(pipeline=generator)

    new_vectorstore = FAISS.load_local("QML-Learnings", embeddings,allow_dangerous_deserialization=True)
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=new_vectorstore.as_retriever())

    user_query = ""

    while user_query.lower() != "thank you":
        user_query = input("\nAsk: ")
        result = qa.run(user_query)
        print("\nAnswer:", result)



  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
Device set to use mps:0
  llm = HuggingFacePipeline(pipeline=generator)



Ask:  What is the F1 score of proposed model?


  result = qa.run(user_query)
Token indices sequence length is longer than the specified maximum sequence length for this model (4847 > 512). Running this sequence through the model will result in indexing errors


Response: This is a dummy response.
