In [19]:
"""
--------------------------------------
Importing required classes/methods
--------------------------------------
"""

import os
from dotenv import load_dotenv
from langchain.document_loaders import PDFPlumberLoader
import pdfplumber
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import AzureChatOpenAI, AzureOpenAI, AzureOpenAIEmbeddings
import faiss
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from pathlib import Path
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain

import re
from langchain.prompts import PromptTemplate


In [20]:
"""
--------------------------------------
Document Transformation
--------------------------------------
"""
def document_transformation_pdf(file_path, chunk_size=1000, chunk_overlap=2, splitter_option=0):
    """
    Load every PDF found in a directory and split the loaded documents into chunks.

    Each file directly inside ``file_path`` whose name ends with ``.pdf`` is read
    with ``PDFPlumberLoader``. The metadata of every loaded document is replaced
    by ``{"source": <file name>}`` before all documents are passed to the
    splitter returned by ``text_splitter``.

    :param file_path: Path to the directory containing the PDF files
    :param chunk_size: Size of each chunk (default 1000)
    :param chunk_overlap: Overlap between chunks (default 2)
    :param splitter_option: Option forwarded to ``text_splitter`` (default 0)
    :return: Result of ``splitter.split_documents`` over all loaded documents
    :raises FileNotFoundError: If ``file_path`` does not exist
    """
    # Step 1: Ensure the directory exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The directory {file_path} does not exist. Please check the path."
        )

    # Step 2: List all PDF files in the directory (sorted so the load order,
    # and therefore the chunk order, is the same on every run)
    pdf_files = sorted(f for f in os.listdir(file_path) if f.endswith(".pdf"))

    # Step 3: Load each PDF and tag its documents with the originating file name
    pdf_documents = []
    for pdf_file in pdf_files:
        # Keep the per-file path in its own variable. Reassigning `file_path`
        # here would make the next iteration join onto the previous PDF's path
        # instead of the directory.
        pdf_path = os.path.join(file_path, pdf_file)
        for doc in PDFPlumberLoader(pdf_path).load():
            # Replace the loader's metadata with the source file name only
            doc.metadata = {"source": pdf_file}
            pdf_documents.append(doc)

    # Step 4: Build the splitter and split everything that was loaded
    splitter = text_splitter(
        document=pdf_documents,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        option=splitter_option,
    )
    return splitter.split_documents(pdf_documents)

def document_transformation_text(directory_path, chunk_size=1000, chunk_overlap=100, splitter_option=0):
    """
    Load every ``.txt`` file in a directory and split the loaded documents into chunks.

    :param directory_path: Path to the directory containing text files
    :param chunk_size: Size of each chunk
    :param chunk_overlap: Overlap between chunks
    :param splitter_option: The text splitter to use, as understood by ``text_splitter``:
        0 for RecursiveCharacter, 1 for Character, 2 for Semantic chunking
    :return: List of split documents
    :raises FileNotFoundError: If ``directory_path`` does not exist
    """
    # Step 1: Ensure the directory exists
    if not os.path.exists(directory_path):
        raise FileNotFoundError(f"The directory {directory_path} does not exist. Please check the path.")

    # Step 2: List all text files in the directory. os.listdir returns names in
    # arbitrary order, so sort them to make the resulting chunk order reproducible.
    text_files = sorted(f for f in os.listdir(directory_path) if f.endswith(".txt"))

    # Step 3: Read the text content from each file (files are read as UTF-8)
    documents = []
    for text_file in text_files:
        loader = TextLoader(os.path.join(directory_path, text_file), encoding="utf-8")
        documents.extend(loader.load())

    # Step 4: Build the selected splitter and split the loaded documents
    splitter = text_splitter(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap, option=splitter_option)
    return splitter.split_documents(documents)




"""
-------------------------------------
Text splitter
chunk_size determines the maximum number of characters in each chunk after splitting the text.
chunk_overlap is the number of characters that overlap between consecutive chunks. This ensures important information that might fall near chunk boundaries is not lost when splitting.
-------------------------------------
"""
def text_splitter(document, chunk_size, chunk_overlap, option):
    """
    Build the text splitter selected by ``option``.

    :param document: Documents that are going to be split. Not used to build the
        splitter; kept in the signature so existing callers keep working.
    :param chunk_size: Chunk size passed to the character-based splitters (options 0 and 1)
    :param chunk_overlap: Chunk overlap passed to the character-based splitters (options 0 and 1)
    :param option: 0 = RecursiveCharacterTextSplitter, 1 = CharacterTextSplitter,
        2 = SemanticChunker (``chunk_size`` / ``chunk_overlap`` are not passed to it)
    :return: The configured splitter instance
    :raises ValueError: If ``option`` is not 0, 1 or 2
    :raises KeyError: If option 2 is requested and ``AZURE_OPENAI_API_KEY`` is not set
    """
    if option == 0:
        # Recursive Character Splitter
        # Tries to split at higher-level boundaries first (e.g. paragraphs,
        # sentences) and falls back to finer-grained splits only when a piece
        # is still too long.
        # Pros: keeps more context together; supports overlapping chunks.
        # Cons: slower than a plain character splitter; chunk sizes vary with
        #       the structure of the document.
        # Use case: documents where semantic boundaries matter, such as legal
        #           documents or scientific papers.
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    elif option == 1:
        # Simple Character Splitter
        # Splits on the "\n\n" separator and measures chunk length with len().
        # Pros: simple; direct control over chunk size.
        # Cons: no semantic understanding, so related context may end up in
        #       different chunks.
        # Use case: simple splitting for tasks such as summarization where
        #           chunk boundaries are less important.
        splitter = CharacterTextSplitter(
            separator="\n\n",
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
    elif option == 2:
        # Semantic Chunking
        # Groups text by semantic similarity of sentence embeddings instead of
        # by character count; the embeddings here come from Azure OpenAI.
        # Pros: keeps related sentences together, which helps question
        #       answering, semantic search and summarization.
        # Cons: requires embedding calls (slower, costs API usage); chunk
        #       sizes are inconsistent and less predictable.
        # The API key is read from the environment so that no credential is
        # stored in the notebook; a missing variable fails here with a KeyError
        # naming it, rather than later as an authentication error.
        splitter = SemanticChunker(AzureOpenAIEmbeddings(model="text-embedding-3-small",
                                                         azure_endpoint="https://langchai-introduction.openai.azure.com/",
                                                         api_key=os.environ["AZURE_OPENAI_API_KEY"],
                                                         openai_api_version="2023-03-15-preview"))
    else:
        raise ValueError("Invalid option. Choose between 0 (Recursive), 1 (Character),  2 (Semantic Chunking).")

    return splitter


In [21]:
"""
--------------------------------------
Embedding / Vector Store Generation
--------------------------------------
"""
def embed_document(doc_chunks):
    """
    Embed document chunks into a FAISS vector store and save it to ``faiss_index``.

    Embedding providers expose two methods because documents (to be searched
    over) and queries (the search query itself) may be embedded differently:
    ``.embed_query`` returns a list of floats, whereas ``.embed_documents``
    returns a list of lists of floats. The vector store performs the document
    embedding itself while it is being built, so no separate
    ``embed_documents`` call is made here.

    :param doc_chunks: Split documents exposing ``page_content`` and ``metadata``
    :return: The FAISS vector store built from ``doc_chunks``
    :raises ValueError: If ``doc_chunks`` is empty
    :raises KeyError: If ``AZURE_OPENAI_API_KEY`` is not set in the environment
    """
    doc_chunks = list(doc_chunks)
    if not doc_chunks:
        # An index cannot be built from zero vectors; fail with a clear message
        raise ValueError("doc_chunks is empty; there is nothing to embed.")

    # The API key comes from the environment so no credential lives in the notebook
    azure_openai_embedding = AzureOpenAIEmbeddings(model="text-embedding-3-small",
                                                   azure_endpoint="https://langchai-introduction.openai.azure.com/",
                                                   api_key=os.environ["AZURE_OPENAI_API_KEY"],
                                                   openai_api_version="2023-03-15-preview")

    # Building from documents (rather than bare texts) embeds each chunk once
    # and keeps every chunk's metadata alongside its text in the store.
    vector_store = FAISS.from_documents(doc_chunks, azure_openai_embedding)
    vector_store.save_local('faiss_index')
    return vector_store

In [33]:
import os
from dotenv import load_dotenv
from pathlib import Path
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_core.output_parsers import StrOutputParser
from langsmith import trace






# Define the path to the .env file
env_path = Path(r'C:\Users\AnumKhattak\OneDrive - Revolution Data Platforms\Documents\LangChain\Examples\.env')

# Load the .env file. The path above only exists on one machine; when it is
# absent, pass None so python-dotenv falls back to its own search for a `.env`
# file instead of silently loading nothing.
load_dotenv(dotenv_path=env_path if env_path.is_file() else None)

# Create the AzureChatOpenAI model instance.
# The deployment name and API version are required settings: indexing
# os.environ raises KeyError naming the missing variable if either is unset.
llm = AzureChatOpenAI(
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    temperature=0.6,
)







# Setup the conversation chain
def setup_conversation_chain(vector_store):
    """
    Create a conversational retrieval chain on top of ``vector_store``.

    The chain combines the module-level ``llm``, the retriever returned by
    ``vector_store.as_retriever()``, and a new ``ConversationBufferMemory``
    that stores the dialogue under the ``chat_history`` key.

    :param vector_store: Vector store exposing ``as_retriever()``
    :return: The chain returned by ``ConversationalRetrievalChain.from_llm``
    """
    # Conversation history, returned to the chain as message objects
    chat_memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")
    doc_retriever = vector_store.as_retriever()

    # verbose=True makes the chain print its formatted prompts while running
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=doc_retriever,
        memory=chat_memory,
        verbose=True,
    )

# Function to ask the chatbot a question
def ask_chatbot(question, conversation_chain):
    """
    Send ``question`` to the conversation chain and return the answer text.

    Uses ``invoke`` rather than the deprecated ``Chain.run``. ``invoke``
    returns the chain's full output mapping, so the answer entry is extracted
    before returning.

    :param question: The user's question; passed under the ``question`` input key
    :param conversation_chain: Chain exposing ``invoke`` (e.g. the object returned
        by ``setup_conversation_chain``)
    :return: The value stored under the chain's answer key
    """
    result = conversation_chain.invoke({"question": question})
    # Prefer the chain's own output key when it declares one; "answer" is the
    # fallback for chain objects that do not expose the attribute.
    answer_key = getattr(conversation_chain, "output_key", "answer")
    return result[answer_key]



# Embedding client kept at module level for later query-embedding cells.
# The API key is read from the environment (loaded from the .env file above)
# so that no credential is stored in the notebook.
azure_openai_embedding = AzureOpenAIEmbeddings(model="text-embedding-3-small",
                                               azure_endpoint="https://langchai-introduction.openai.azure.com/",
                                               api_key=os.environ["AZURE_OPENAI_API_KEY"],
                                               openai_api_version="2023-03-15-preview")
# NOTE(review): machine-specific absolute path; point this at your own document directory.
document_path = r'C:\Users\AnumKhattak\OneDrive - Revolution Data Platforms\Documents\LangChain\Examples\document'

# Load and split the text documents
split_data = document_transformation_text(document_path)

# Embed the chunks into a vector store
vector_store = embed_document(split_data)

# Set up the conversation chain
conversation_chain = setup_conversation_chain(vector_store)

# Ask the questions in order on the same chain so the last one can refer back
# to the conversation history, and show each answer next to its question.
questions = [
    "Tell me about the document?",
    "Why use Docker?",
    "What was my first question?",
]
for input_data in questions:
    response = ask_chatbot(input_data, conversation_chain)
    print(f"Q: {input_data}\nA: {response}\n")

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")
Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Got a question for me? Please mention it in the comments section and I will get back to you.

Upcoming Batches For DevOps Certification Training Course

Docker has gained immense popularity in this fast growing IT world. Organizations are continuously adopting it in their production environment. I take this opportunity to explain Docker in the most simple way. In this blog, the following concepts will be covered:

History Before containerization
Reasons to use containers
What is Docker?
Dockerfile, Images  & Containers
Docker Compose & Docker Swarm
Hands-On
Since Docker is a containerization platform before I tell you about it, it’s important that you understand the history behind containerization.

History Before Containerization
Before containerization came into th

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")



[1m> Finished chain.[0m

[1m> Finished chain.[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: Tell me about the document?
Assistant: I'm sorry, but I don't have enough information to answer your question. Could you please provide me with more context or specify which document you are referring to?
Follow Up Input: Why use Docker?
Standalone question:[0m

[1m> Finished chain.[0m


Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")
Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Docker's importance lies in the fact that it enables developers to ship code faster and more reliably by creating isolated environments that mimic production environments closely. This mitigates the common "works on my machine" problem in development.

1. Introduction to Docker
Docker is an open-source platform that automates the deployment, scaling, and management of applications inside lightweight containers. It allows developers to package an application with all of its dependencies, ensuring consistency across different environments. 
The concept of containers itself isn’t new— it originates from operating-system-level virtualization that has been around in forms such as jails, zones, and chroot. However, Docker made containers more accessible, lightweight, and p

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")



[1m> Finished chain.[0m

[1m> Finished chain.[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: Tell me about the document?
Assistant: I'm sorry, but I don't have enough information to answer your question. Could you please provide me with more context or specify which document you are referring to?
Human: Why use Docker?
Assistant: The purpose of using Docker is to enable developers to package an application with all of its dependencies, ensuring consistency across different environments, and to create isolated environments that mimic production environments closely. This allows developers to ship code faster and more reliably, mitigating the common "works on my machine" problem in development. Docker also streamlines continuous integration/continuous deployment (CI/CD) pipelines, is perfect for microservices architecture,

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")
Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Got a question for me? Please mention it in the comments section and I will get back to you.

Upcoming Batches For DevOps Certification Training Course

Docker Image: In layman terms, Docker Image can be compared to a template which is used to create Docker Containers. So, these read-only templates are the building blocks of a Container. You can use docker run to run the image and create a container.

Docker Images are stored in the Docker Registry. It can be either a user’s local repository or a public repository like a Docker Hub which allows multiple users to collaborate in building an application.

Docker Container: It is a running instance of a Docker Image as they hold the entire package needed to run the application. So, these are basically the ready applicati

In [7]:
# Embedding client used for query embedding in the search cell.
# The API key is read from the environment so that no credential is stored in
# the notebook; a missing variable raises KeyError naming it.
azure_openai_embedding = AzureOpenAIEmbeddings(model="text-embedding-3-small",
                                               azure_endpoint="https://langchai-introduction.openai.azure.com/",
                                               api_key=os.environ["AZURE_OPENAI_API_KEY"],
                                               openai_api_version="2023-03-15-preview")
# NOTE(review): machine-specific absolute path; point this at your own document directory.
path = r'C:\Users\AnumKhattak\OneDrive - Revolution Data Platforms\Documents\LangChain\Examples\document'

# Load and split the text documents, then embed the chunks into a vector store
split_data = document_transformation_text(path)

vector_store = embed_document(split_data)

In [34]:
query = "What is the docker?"

# Embed the query once and search with that vector. Searching by the raw query
# text would embed it a second time and leave `query_embedding` unused.
# NOTE(review): this assumes `azure_openai_embedding` uses the same embedding
# model that built `vector_store` — confirm if either configuration changes.
query_embedding = azure_openai_embedding.embed_query(query)

# Search for the most similar documents in the vector store
similar_docs = vector_store.similarity_search_by_vector(query_embedding, k=2)

# Display the retrieved documents
for doc in similar_docs:
    print(doc.page_content)



Docker: An In-Depth Overview
Docker's importance lies in the fact that it enables developers to ship code faster and more reliably by creating isolated environments that mimic production environments closely. This mitigates the common "works on my machine" problem in development.


In [None]:
import os
from openai import OpenAI
from langsmith import traceable
from langsmith.wrappers import wrap_openai

"""
--------------------------------------
Observability with LangSmith
--------------------------------------
"""
# Set LangSmith observability environment variables.
# LANGCHAIN_API_KEY is deliberately not assigned here: it must already be in
# the environment, so a placeholder never overwrites a real key and no
# credential is stored in the notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

# The `OpenAI` client does not accept `api_base` / `api_type` / `api_version`
# keyword arguments; Azure resources are addressed through the dedicated Azure
# client. It is imported under an alias so it does not shadow the `AzureOpenAI`
# name this notebook imports from langchain_openai.
from openai import AzureOpenAI as AzureOpenAIClient

openai_client = wrap_openai(
    AzureOpenAIClient(
        api_key=os.environ["AZURE_OPENAI_API_KEY"],  # Set this in your environment
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],  # Your Azure resource base
        api_version=os.environ["AZURE_OPENAI_API_VERSION"],  # Set API version in environment
    )
)

# The client has no `invoke` method; a chat request goes through
# chat.completions.create, with `model` set to the Azure deployment name.
completion = openai_client.chat.completions.create(
    model=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
    messages=[{"role": "user", "content": "What is 2+3?"}],
)
completion.choices[0].message.content
