In [110]:
###install required libraries 
# !pip install langchain langchain_huggingface chromadb transformers
# !pip install "unstructured[pdf]" 
# !pip install langchain-groq


In [2]:
"""description 
The provided code uses Langchain's DirectoryLoader to load all PDF documents from a specified directory. 
It uses the glob="*.pdf" pattern to match and load only .pdf files. The load() method returns a list of Document 
objects containing the content from the PDFs, which can then be used for further processing or analysis.
"""



# Import the necessary class from langchain.document_loaders to load PDFs from a directory
from langchain.document_loaders import DirectoryLoader

# Create an instance of DirectoryLoader to load PDF documents from a specified directory
loader = DirectoryLoader(
    "./research_ppr",  # Path to the directory containing the PDFs
    glob="*.pdf",  # Specify a pattern to match the PDF files (e.g., "*.pdf" for all PDFs)
)

# Load the documents from the directory
documents = loader.load()  # This loads the matched PDF files and returns them as a list of Document objects


In [3]:
#check documents
documents

 Document(metadata={'source': 'research_ppr/GRU.pdf'}, page_content="Gate-Variants of Gated Recurrent Unit (GRU) Neural Networks Rahul Dey and Fathi M. Salem Circuits, Systems, and Neural Networks (CSANN) LAB Department of Electrical and Computer Engineering Michigan State University East Lansing, MI 48824-1226, USA deyrahul@msu.edu || salemf@msu.edu\n\nAbstract ‚Äì The paper evaluates three variants of the Gated Recurrent Unit (GRU) in recurrent neural networks (RNN) by reducing parameters in the update and reset gates. We evaluate the three variant GRU models on MNIST and IMDB datasets and show that these GRU-RNN variant models perform as well as the the original GRU RNN model while reducing computational expense.\n\nperformance sentiment classification from a given review paragraph.\n\nII. BACKGROUND: RNN, LSTM AND GRU\n\nIn principal, RNN are more suitable relationships among sequential data types. The so-called simple RNN has a recurrent hidden state as in\n\nfor capturing\n\nI. I

In [4]:
###clean the documents/page content if required

"""Description 
    Clean the input text by removing unnecessary whitespace, non-alphanumeric characters, 
    and trimming leading/trailing spaces.

    This function performs the following operations:
    1. Replaces multiple consecutive whitespace characters (spaces, newlines, tabs) 
       with a single space.
    2. Removes all non-alphanumeric characters, retaining only letters (a-z, A-Z), 
       numbers (0-9), and spaces.
    3. Strips leading and trailing spaces from the text.
"""

import re  # Import the regular expression module for text processing

# Define a function to clean the text
def clean_text(text):
    # Remove extra whitespaces, newlines, and tabs by replacing all whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)  # This matches one or more whitespace characters (spaces, newlines, tabs) and replaces them with a single space
    # Remove non-alphanumeric characters (except spaces) using a regular expression
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # This matches any character that is not a letter (a-z, A-Z), a number (0-9), or whitespace (\s), and removes it
    # Remove leading and trailing spaces from the text
    text = text.strip()  # The .strip() method removes any whitespace at the start and end of the text
    return text  # Return the cleaned text





In [5]:
"""
    Description : 
    Clean and process a list of documents, splitting the content into chunks and adding metadata.

    This function performs the following steps for each document in the input list:
    1. Cleans the document's content by removing unnecessary whitespace, non-alphanumeric characters,
       and trimming leading/trailing spaces (via the `clean_text` function).
    2. Adds metadata to each chunk, including the document's filename (extracted from the document's source).
    3. Splits the cleaned content into smaller chunks using `RecursiveCharacterTextSplitter`, 
       ensuring each chunk is of a specified size (with an overlap between chunks).
    
    Args:
    documents (list): A list of Document objects, each containing `page_content` and `metadata`.

    Returns:
    list: A list of processed Document chunks, each with the cleaned content and associated metadata.
"""







from langchain_text_splitters import RecursiveCharacterTextSplitter  # Import the text splitter

# Clean the documents' page content and add filename as metadata
def clean_and_process_documents(documents):
    # Initialize the text splitter for chunking
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,  # Set a chunk size limit
        chunk_overlap=50,  # Set overlap between chunks
        length_function=len,  # Define how chunk lengths are calculated
        is_separator_regex=False,  # Whether the separator should be regex (False in this case)
    )

    chunks = []
    for doc in documents:
        # Clean the document's content
        cleaned_content = clean_text(doc.page_content)

        # Create metadata: use the document's source (filename) as metadata
        metadata = {"source": (doc.metadata.get('source', 'Unknown')).split("/")[-1]}
    

        # Split the document into chunks and append metadata
        chunk = text_splitter.create_documents([cleaned_content], metadatas=[metadata])
        chunks.extend(chunk)  # Add the chunks to the list

    return chunks

# Process the documents
processed_documents = clean_and_process_documents(documents)



In [6]:
processed_documents

[Document(metadata={'source': 'Recurrent_Neural_Networks_A_Comprehensive_Review_of_Architectures_Variants_and_Applications.pdf'}, page_content='information Review Recurrent Neural Networks A Comprehensive Review of Architectures Variants and Applications Ibomoiye Domor Mienye 1  Theo G Swart 1 and George Obaido 2 1 Institute for Intelligent Systems University of Johannesburg Johannesburg 2006 South Africa tgswartujacza 2 Center for HumanCompatible Artificial Intelligence CHAI Berkeley Institute for Data Science BIDS University of California Berkeley CA 94720 USA gobaidoberkeleyedu Correspondence ibomoiyemujacza  These authors contributed equally to this work Abstract Recurrent neural networks RNNs have significantly advanced the field of machine learn ing ML by enabling the effective processing of sequential data This paper provides a comprehensive review of RNNs and their applications highlighting advancements in architectures such as long shortterm memory LSTM networks gated recurren

In [7]:
print(processed_documents[0].page_content)

information Review Recurrent Neural Networks A Comprehensive Review of Architectures Variants and Applications Ibomoiye Domor Mienye 1  Theo G Swart 1 and George Obaido 2 1 Institute for Intelligent Systems University of Johannesburg Johannesburg 2006 South Africa tgswartujacza 2 Center for HumanCompatible Artificial Intelligence CHAI Berkeley Institute for Data Science BIDS University of California Berkeley CA 94720 USA gobaidoberkeleyedu Correspondence ibomoiyemujacza  These authors contributed equally to this work Abstract Recurrent neural networks RNNs have significantly advanced the field of machine learn ing ML by enabling the effective processing of sequential data This paper provides a comprehensive review of RNNs and their applications highlighting advancements in architectures such as long shortterm memory LSTM networks gated recurrent units GRUs bidirectional LSTM BiLSTM echo state networks ESNs peephole LSTM and stacked LSTM The study examines the application of RNNs to dif

In [8]:
print(processed_documents[1].page_content)

the order of data points are crucial Copyright  2024 by the authors Licensee MDPI Basel Switzerland This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution CC BY license https creativecommonsorglicensesby The inception of RNNs dates back to the 1970s with foundational work by Werbos 13 which introduced the concept of backpropagation through time BPTT that laid the foun dation for training recurrent neural networks However RNNs struggled with practical applications due to the vanishing gradient problem where gradients either grow or shrink exponentially during backpropagation 14 Meanwhile the introduction of Long Short Term Memory LSTM networks by Hochreiter and Schmidhuber 15 was a turning point for RNNs allowing for the learning of dependencies over much longer periods Addi tionally gated recurrent units GRUs proposed by Cho et al 16 offered a simplified alternative to LSTM while maintaining comparable performance 40 Inform

In [9]:
print(processed_documents[10].page_content)

75 The candidate hidden state h t represents the new content to be added to the current hidden state modulated via the reset gate Furthermore the simplified architecture of GRUs allows them to be computationally more efficient than LSTM while still addressing the vanishing gradient problem This efficiency makes GRUs wellsuited for tasks where computational resources are limited or when training needs to be faster GRUs have been successfully applied in various sequence modeling tasks Their ability to capture longterm dependencies with fewer parameters makes them a popular choice in many applications Additionally studies have shown that GRUs can achieve performance comparable to LSTM 7880 making them an attractive alternative for many use cases Figure 5 Architecture of the GRU network Comparison with LSTM GRUs have fewer parameters compared to LSTM due to the absence of a separate cell state and combined gating mechanisms 81 This often leads to faster training times and comparable perfor

In [10]:
len(processed_documents)

132

In [16]:
"""Description
Initialize the HuggingFaceEmbeddings class with a pre-trained model for text embedding.

This function sets up the HuggingFaceEmbeddings class using the specified model from the
Hugging Face model hub. The model 'sentence-transformers/all-MiniLM-L6-v2' is a smaller,
efficient version of the MiniLM model, optimized for generating dense vector embeddings of text.

Args:
    model_name (str): The name of the pre-trained model to use for embedding generation. 
                       In this case, the 'sentence-transformers/all-MiniLM-L6-v2' model is used, which 
                       is optimized for sentence-level embeddings.

Returns:
    HuggingFaceEmbeddings: An instance of the HuggingFaceEmbeddings class initialized with the chosen model.
"""



from langchain_huggingface import HuggingFaceEmbeddings

# Correct initialization for HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 

In [17]:
""" Description
Initialize a Chroma vector store to persist and retrieve text embeddings for efficient similarity search.

This function sets up the Chroma vector store, a highly scalable and efficient solution for storing and querying
dense vector embeddings generated by the HuggingFace model. The Chroma vector store allows for fast similarity 
searches on embedded documents, enabling tasks like document retrieval or search relevance ranking.

Args:
    collection_name (str): The name of the collection where the vector embeddings will be stored.
    embedding_function (function): The function used to generate the embeddings for the documents. 
                                   In this case, the embeddings are generated by a HuggingFace model.
    persist_directory (str, optional): The directory path where the vector store will be saved locally.
                                       This is useful if you want to persist the data across sessions.
                                       If persistence is not required, this argument can be omitted.

Returns:
    Chroma: An initialized Chroma vector store instance, capable of storing and querying document embeddings.
"""

from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="research_documents_vector",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [266]:
""" Description
Generate unique UUIDs for each document in the processed documents list.

This function creates a list of unique UUIDs (Universally Unique Identifiers) for each document
to serve as a unique identifier in the vector store. UUIDs are used to ensure that each document
can be uniquely referenced, stored, and retrieved in subsequent operations such as similarity searches.

Args:
    processed_documents (list): A list of processed document objects for which UUIDs will be generated.

Returns:
    list: A list of UUID strings, one for each document in the processed_documents list.

Usage:
    This list of UUIDs can be used as identifiers when storing the documents in a vector store 
    or database, ensuring that each document has a unique key for retrieval.
"""


# Add documents to the vector store
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(processed_documents))]

In [267]:
"""Description
Add processed documents to the vector store with associated unique identifiers (UUIDs).

This function is responsible for adding a list of processed documents to the vector store. The documents
are paired with unique identifiers (UUIDs) to ensure each document can be individually referenced and retrieved.
The `add_documents` method takes the processed documents and their corresponding UUIDs as input and stores them
in the vector store for future operations such as similarity search and retrieval.

Args:
    documents (list): A list of processed document objects that need to be added to the vector store.
    ids (list): A list of unique UUID strings, one for each document in the documents list, to serve as document IDs.

Returns:
    None: This method updates the vector store in-place and does not return any value.

Usage:
    This method is typically used after documents have been processed and embedded. The documents, now in vector
    form, are added to the vector store along with their UUIDs, making it easier to retrieve and identify specific documents
    later based on their embeddings.

Note:
    Ensure that the `documents` list and the `ids` list (UUIDs) have the same length. Each UUID will correspond to a document,
    providing a unique reference for each document in the vector store.
"""

vector_store.add_documents(documents=processed_documents, ids=uuids)

['afd9cb02-e1a2-48fe-bb56-eb920dcb3cfd',
 '786f6505-1dca-4245-8265-97dcc3df704f',
 'ea889b07-949e-418c-8903-df4682d104e7',
 'cbbcfea6-8644-4cb6-b59e-d6497a503e26',
 'd38c8538-00c5-434d-af02-b681cb99e032',
 'fc682595-ae84-45ac-af29-25a52580e2c9',
 'd8765172-f81c-4bea-8ff3-fa2c5453c89b',
 '529dde93-132c-49bb-a1e0-3eeeaee8a66e',
 'b757c2ee-f3b1-48c4-a9f9-c85e5b09f5e8',
 '66625301-bb45-4422-80d9-c59a1bb290bd',
 'f8e9aeff-3076-4aef-a030-23e63489d7a6',
 'bfa41a8b-1f78-443a-a8f5-a0f05482127f',
 '9a3b335e-6909-4077-b14f-e61153db65c9',
 '348bf88f-71d8-47b5-9921-f322933d31ce',
 'feed7ebc-0907-4092-bc50-aa5ae6bceb08',
 '536d5768-f394-4fa5-990a-0549c7cab401',
 'c3d36242-b6e6-4071-b7df-08d02e95a24c',
 '082a5ed1-48f8-4aa5-91c4-893c3772d4f6',
 '750a527e-f55f-4090-ac1a-08d96c7de47c',
 'c2fe56cb-0bc5-41eb-9ec1-63dafcdb8bfb',
 'f5460f2d-d00a-48e6-9dac-6198b3e663e1',
 '692e02ac-adf0-4010-a5d2-b41758dfb97e',
 'ac575b6f-3597-4f6d-aaee-7b57effde904',
 '0db288da-2d91-4132-a313-9b37e5bb1615',
 '3a777577-ec20-

In [11]:
"""Description
Collect unique sources from the processed documents.

This code iterates over a list of processed documents and extracts the unique 'source' metadata from each document.
The goal is to gather all the distinct sources (e.g., filenames, URLs, or other metadata) that contributed to the documents
in the vector store. The resulting list, `sources`, will contain only unique source names, ensuring that no duplicates are
included.

Args:
    processed_documents (list): A list of processed document objects, each with metadata containing a 'source' key.

Returns:
    sources (list): A list containing unique source names derived from the 'source' field in the document metadata.

Usage:
    This approach is useful for identifying all the distinct sources from which documents were processed or extracted.
    It helps in tracking and ensuring diversity of input sources, or it can be used for source-based grouping or analysis.

Note:
    The `sources` list will contain each source only once, even if multiple documents originate from the same source.
"""

sources = []
for i in processed_documents:
    if i.metadata["source"] not in sources:
        sources.append(i.metadata["source"])

In [12]:
sources

['Recurrent_Neural_Networks_A_Comprehensive_Review_of_Architectures_Variants_and_Applications.pdf',
 'GRU.pdf',
 'A_Comprehensive_Overview_and_Comparative_Analysis_on_Deep_Learning_Models_ CNN_RNN_LSTM_GRU.pdf',
 'sequence_to_sequence_learning.pdf',
 'attention_all_you_need.pdf']

In [62]:
Query="Can you explain the architecture of all sequence-to-sequence traditional models models like LSTM, GRU, and RNN, along with how Transformers \
       improved upon these models. also provide trasnformer architecture details"

In [63]:
"""Description
Perform similarity search for each source and accumulate relevant content.

This code performs a similarity search for each unique source in the `sources` list by querying the vector store
with the given query (`Query`) and filtering based on the `source` metadata. For each source, the top `k=5` most
relevant results are retrieved, and their page content is aggregated into a single string. The resulting text for
each source is appended to the `final_chunk` list, which will contain the concatenated content for each source.

Args:
    sources (list): A list of unique source names (e.g., filenames, URLs) from which documents were processed.
    Query (str): The search query string used to find relevant documents in the vector store.
    vector_store (Chroma): The vector store object containing the document embeddings.

Returns:
    final_chunk (list): A list where each element is a string containing the aggregated text from relevant
                        documents for each unique source.

Usage:
    This approach allows retrieving, displaying, and storing text that is most relevant to a query,
    organized by source. It helps in collecting results based on source metadata, which can then be analyzed or used
    for further processing.

Note:
    The search results for each source are aggregated into a single text, with the similarity scores printed
    alongside each page content. The final `final_chunk` list contains the combined content from the top `k` results
    for each source.
"""


final_chunk=[]
for source in sources:
    results = vector_store.similarity_search_with_score(Query, k=2, filter={"source": source}
    )
    text=""
    for res, score in results:
        print(f"* [SIM={score:3f}] {res.page_content}")
        text+=res.page_content
    final_chunk.append(text)    


* [SIM=0.791732] 7 discusses various applications of RNNs in the literature Section 8 addresses challenges and future research directions Finally Section 9 concludes the study 2 Related Works RNNs have been applied in different applications achieving stateoftheart perfor mance especially in timeseries applications Early developments in RNNs including locally recurrent globally feedforward networks as reviewed by Tsoi and Back 23 and the blockdiagonal recurrent neural networks proposed by Mastorocostas and Theocharis 24 laid important groundwork for understanding complex sequence modeling Several reviews have been conducted on RNNs and their applications each contribut ing to the understanding and development of the field For instance Dutta et al 25 provided a comprehensive overview of the theoretical foundations of RNNs and their applications in sequence learning Their review highlighted the challenges associated with training RNNs particularly the vanishing gradient problem and discus

In [64]:
print(len(final_chunk))

5


### Generating  summary using Groq API endpoint and by using llama3-70b-8192 model

In [21]:
#Refer provided link to generate GROQ API key https://groq.com/
# Set the Groq API key as an environment variable
import os
os.environ["GROQ_API_KEY"] = "YOUR-GROQ-API-KEY"


In [22]:
# Import the necessary libraries.
from langchain_core.prompts import ChatPromptTemplate # Importing ChatPromptTemplate to create structured chat prompts
from langchain_groq import ChatGroq # Importing ChatGroq to interact with Groq models

# Initialize a ChatGroq object with the model name and temperature
# 'temperature' controls the randomness of the model‚Äôs responses. A value of 0 will make the output deterministic, i.e., the same output for the same input.
# 'model_name' specifies the language model to use for generating responses. In this case, we are using the "llama3-70b-8192" model.
llm = ChatGroq(temperature=0, model_name="llama3-70b-8192")


In [65]:
"""
This script configures a language model to answer questions based on a set of research paper chunks, aiming to provide detailed, descriptive responses. 
It processes each chunk individually, gathers insights, and then combines these insights into a cohesive final answer. 

Code Description:
1. **System Initialization**: Sets up the language model with a system message to guide it to act as a research-focused question-answering expert. 
   The system prompt incorporates the main query as context to help guide each answer towards the question's needs. 

2. **Chunk-Level Answer Generation**:
   - For each chunk of the research papers, a prompt is created with a human message that combines the query and the specific chunk. 
   - This prompt is processed through the model to generate a relevant answer.
   - Each generated answer is appended to a list, `answers`, to collect responses from each chunk.

3. **Combining Chunk Answers**:
   - Once all chunks have been processed, the answers from each chunk are joined to form a shortened, combined text. This consolidated text serves as a summary of insights across all chunks.

4. **Final Answer Generation**:
   - A final prompt is created with the combined text to generate a complete and well-structured answer, synthesizing insights across all chunks to respond to the question effectively.
   
5. **Output**:
   - The final, cohesive answer is printed, providing a detailed response that draws on the entire set of research paper chunks.

This code ensures that the final answer is not only factually accurate but also clear, comprehensive, and directly relevant to the initial query.
"""


# System message setting up the model for providing detailed, descriptive answers based on research papers
system = (
    "You are an expert in question answering based on research paper analysis. Your task is to examine and address questions \
     by synthesizing detailed information from the provided research paper chunks. Use the context provided by the query: {}, \
     to carefully analyze each chunk, focusing on the most relevant key insights that answer the question directly. \
     provide comprehensive and well-structured descriptive answers with minimum tokens, ensuring clarity, coherence, and depth of explanation. "
).format(Query)

# List to collect individual answers from each chunk
answers = []
for chunk in final_chunk:
    # Setting up the prompt for each chunk with both system and human context
    human = f"Given the question: {Query}, answer based on the following context: {chunk}"
    prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])
    chain = prompt | llm
    response = chain.invoke({"text": f"{chunk}"})
    output_response = clean_text(response.content)
    answers.append(response.content)
    print("answer added")

# Combine all individual answers into a smaller version of the full response
shortened_text = "\n".join(answers)

# Final summary to provide a cohesive answer from all chunks combined
final_answer_prompt = ChatPromptTemplate.from_messages(
    [("system", system), ("human", f"Answer to the question: {Query} based on the following context: {shortened_text}")]
)
final_chain = final_answer_prompt | llm
final_response = final_chain.invoke({"text": shortened_text})

# Print the final answer
print(final_response.content)


answer added
answer added
answer added
answer added
answer added
Here is a comprehensive explanation of the architecture of traditional sequence-to-sequence models like LSTM, GRU, and RNN, along with how Transformers improved upon these models, and Transformer architecture details:

**Traditional Sequence-to-Sequence Models:**

1. **Recurrent Neural Networks (RNNs):** RNNs are a type of neural network designed to handle sequential data. They have a feedback loop that allows information to persist, enabling the network to capture temporal dependencies in the data. RNNs are composed of three main components: an input gate, a hidden state, and an output gate. The hidden state captures information from previous time steps, and the output gate generates the output at each time step.

2. **Long Short-Term Memory (LSTM) Networks:** LSTMs are a variant of RNNs that address the vanishing gradient problem, which occurs when gradients are backpropagated through time, causing them to become smalle

In [67]:
"""Description
This script formats and saves the summary text generated by the model into a text file, ensuring the output is readable and well-structured.

Steps:
1. **Output Text Assignment**: The `final_response.content` is assigned to the `output_text` variable, which holds the content to be formatted and saved.

2. **File Preparation**: The `output_filename` is set to `"summary_output.txt"`, which specifies the file where the formatted text will be saved.

3. **Text Formatting**:
    - A `TextWrapper` object is used to wrap the text to a specific width (140 characters), with proper indentation (`4 spaces` for both initial and subsequent lines).
    - Additional space is added before the main content to improve readability, followed by wrapping the text to the specified width.
  
4. **Text Writing**: The formatted text is then written to the file. The extra spaces ensure key points stand out, and the content is neatly wrapped for easier reading.

5. **Final Output**: The script saves the formatted content to the file and prints a confirmation message indicating the file's location.

This process ensures that the summarized content is not only stored but is also presented in a way that is easy to read and understand, with well-structured formatting.
"""


import textwrap

# Sample output text (replace this with your actual response)
output_text = final_response.content

# Define file name
output_filename = "answer.txt"

# Open the file in write mode and format the text
with open(output_filename, "w") as file:
    # Create a wrapper for text formatting with 140 character width and indentation
    wrapper = textwrap.TextWrapper(width=140, initial_indent="    ", subsequent_indent="    ")

    # Add space before key points to improve readability
    formatted_text = "\n\n"  # Adds space before the content

    # Wrap the main content text for readability
    wrapped_text = wrapper.fill(str(output_text))
    
    # Add some extra space for main points separation
    formatted_text += wrapped_text

    # Write the formatted text to the file
    file.write(formatted_text)

print(f"Output saved to {output_filename}.")


Output saved to answer.txt.


In [None]:
#command to clear vector database
#vector_store.delete(ids=uuids) 