In [38]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [2]:
# Embedding model handed to both the semantic chunker and the FAISS vector
# store later in this notebook.
embedding = HuggingFaceEmbeddings(model_name="NeuML/pubmedbert-base-embeddings")

In [3]:
# Semantic chunker driven by the embedding model above; chunk boundaries are
# chosen with the "percentile" breakpoint strategy (library default amount).
chunker = SemanticChunker(
    embedding,
    breakpoint_threshold_type="percentile",
)

In [4]:
# Sample passage (an article abstract about hybrid process modeling) used as
# the common input for the semantic and recursive splitters in this notebook.
text = """
Process models are mathematical formulations (essentially a set of equations) that
try to represent the real system/process in a digital or virtual form. These are
derived either based on fundamental physical laws often combined with empirical
assumptions or learned based on data. The former has been existing for several
decades in chemical and process engineering while the latter has recently
received a lot of attention with the emergence of several artificial intelligence/
machine learning techniques. Hybrid modeling is an emerging modeling paradigm
that explores the synergy between existing these two paradigms, taking advantage
of the existing process knowledge (or engineering know-how) and information
disseminated by the collected data. Such an approach is especially suitable for
systems and industries where data generation is significantly resource intensive
while at the same time fundamentally not completely deciphered such as the
processes involved in the biopharmaceutical pipeline. This technology could, in
fact, be the enabler to meeting the demands and goals of several initiatives such as
Quality by design, Process Analytical tools, and Pharma 4.0. In addition, it can aid in
different process applications throughout process development and Chemistry,
Manufacturing, and Control (CMC) to make it more strategic and efficient. This
article focuses on providing a step-by-step guide to the different considerations to
be made to develop a reliable and applicable hybrid model. In addition, the article
aims at highlighting the need for such tools in the biopharmaceutical industry and
summarizes the works that advocate its implications. Subsequently, the key
qualities of hybrid modeling that make it a key enabler in the
biopharmaceutical industry are elaborated with reference to the literature
demonstrating such qualities.
"""

In [10]:
# Split the sample passage with the semantic chunker; the result is a list of
# chunk strings.
docs = chunker.split_text(text)

  return forward_call(*args, **kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [11]:
# Show the chunks produced by the semantic chunker.
docs

['\nProcess models are mathematical formulations (essentially a set of equations) that\ntry to represent the real system/process in a digital or virtual form. These are\nderived either based on fundamental physical laws often combined with empirical\nassumptions or learned based on data.',
 'The former has been existing for several\ndecades in chemical and process engineering while the latter has recently\nreceived a lot of attention with the emergence of several artificial intelligence/\nmachine learning techniques. Hybrid modeling is an emerging modeling paradigm\nthat explores the synergy between existing these two paradigms, taking advantage\nof the existing process knowledge (or engineering know-how) and information\ndisseminated by the collected data. Such an approach is especially suitable for\nsystems and industries where data generation is significantly resource intensive\nwhile at the same time fundamentally not completely deciphered such as the\nprocesses involved in the bio

In [12]:
# Number of chunks the semantic chunker produced.
len(docs)

2

In [25]:
# Rebuild the semantic chunker with explicit tuning: an 80.0 percentile
# breakpoint threshold and a minimum chunk size of 100
# (unit presumably characters -- TODO confirm), then re-split the same passage
# so the result can be compared with the earlier run.
chunker = SemanticChunker(
    embedding,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=80.0,
    min_chunk_size=100,
)
docs = chunker.split_text(text)
docs

['\nProcess models are mathematical formulations (essentially a set of equations) that\ntry to represent the real system/process in a digital or virtual form. These are\nderived either based on fundamental physical laws often combined with empirical\nassumptions or learned based on data.',
 'The former has been existing for several\ndecades in chemical and process engineering while the latter has recently\nreceived a lot of attention with the emergence of several artificial intelligence/\nmachine learning techniques. Hybrid modeling is an emerging modeling paradigm\nthat explores the synergy between existing these two paradigms, taking advantage\nof the existing process knowledge (or engineering know-how) and information\ndisseminated by the collected data. Such an approach is especially suitable for\nsystems and industries where data generation is significantly resource intensive\nwhile at the same time fundamentally not completely deciphered such as the\nprocesses involved in the bio

In [24]:
# Number of chunks in the current `docs` list.
len(docs)

2

In [28]:
# Baseline for comparison: recursive character splitting. Chunk length is
# measured with `len`, capped at 900, and consecutive chunks share an overlap
# of up to 100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=900,
    chunk_overlap=100,
    length_function=len,
)
chunks = text_splitter.split_text(text)

In [29]:
# Show the chunks produced by the recursive character splitter.
chunks

['Process models are mathematical formulations (essentially a set of equations) that\ntry to represent the real system/process in a digital or virtual form. These are\nderived either based on fundamental physical laws often combined with empirical\nassumptions or learned based on data. The former has been existing for several\ndecades in chemical and process engineering while the latter has recently\nreceived a lot of attention with the emergence of several artificial intelligence/\nmachine learning techniques. Hybrid modeling is an emerging modeling paradigm\nthat explores the synergy between existing these two paradigms, taking advantage\nof the existing process knowledge (or engineering know-how) and information\ndisseminated by the collected data. Such an approach is especially suitable for\nsystems and industries where data generation is significantly resource intensive',
 'systems and industries where data generation is significantly resource intensive\nwhile at the same time fun

In [36]:
import time
import os
from langchain_community.document_loaders import PyPDFLoader

# NOTE(review): machine-specific absolute path -- point this at your own PDF
# folder (ideally a configurable, relative data directory) before re-running.
dataset_path = r"D:\Intelligent QA AI\data"
all_docs = []

# Load each PDF in the folder as a single Document.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
#   the document order (and therefore the index order) reproducible.
# - .lower(): also picks up files whose extension is written as ".PDF".
for file in sorted(os.listdir(dataset_path)):
    if not file.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(dataset_path, file)
    loader = PyPDFLoader(file_path, mode="single")
    # Use a distinct name so the loader output does not shadow `docs`, which
    # is reserved for the chunked result consumed by the following cell.
    loaded_docs = loader.load()

    # Only the first Document returned by the loader is kept per file.
    all_docs.append(loaded_docs[0])

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
# Chunker construction is deliberately inside the timed region.
start = time.perf_counter()
chunker = SemanticChunker(
    embedding,
    breakpoint_threshold_type="percentile",
    min_chunk_size=100,
    breakpoint_threshold_amount=80.0,
)
docs = chunker.split_documents(all_docs)
end = time.perf_counter()

print(f"Time taken is {(end-start)/60} mins")

Time taken is 0.46490228970845543 mins


In [40]:
# Build a FAISS vector store from the chunked documents using the same
# embedding model, and report how long it took.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
vector_store = FAISS.from_documents(docs, embedding)
end = time.perf_counter()

print(f"Time taken is {(end-start)/60} mins")

Time taken is 0.052564597129821776 mins
