In [1]:
import os
import nest_asyncio
nest_asyncio.apply()

# Importing required libraries
from llama_parse import LlamaParse
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.groq import Groq
from llama_index.embeddings.openai import OpenAIEmbedding


In [5]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()


import os
# API access to llama-cloud
os.environ["LLAMA_CLOUD_API_KEY"] = "llx-"

# Using OpenAI API for embeddings/llms
os.environ["OPENAI_API_KEY"]="sk-"
# Using Cohere for reranking
#os.environ["COHERE_API_KEY"] = "xyz..."

os.environ["GROQ_API_KEY"]="gsk_"


In [6]:
def setup_models():
    """Setup language and embedding models"""
    llm = Groq(model="llama3-groq-8b-8192-tool-use-preview")
    embed_model = OpenAIEmbedding(model="text-embedding-3-small")
    Settings.llm = llm
    Settings.embed_model = embed_model
    
    return llm, embed_model

In [7]:
# Parse PDF
def parse_pdf(pdf_path):
    """Parse PDF and return documents"""
    parsing_instructions = (
        "This is a pdf document with text, tables, and image data. "
        "Extract detailed information carefully."
    )
    return LlamaParse(
        result_type="markdown", 
        parsing_instructions=parsing_instructions
    ).load_data(pdf_path)

In [8]:
# Setup models and parse PDF
llm, embed_model = setup_models()
pdf_path = "1-s2.0-S2773186324000884-main.pdf"
documents = parse_pdf(pdf_path)

Started parsing the file under job_id 8484793f-9a1d-45a7-ace4-b6af3a2b3620


In [9]:
def analyze_document_structure():
    """
    Analyze and print document structure details
    """
    print("📄 Document Analysis:")
    print(f"Total Documents Parsed: {len(documents)}")
    
    for i, doc in enumerate(documents, 1):
        print(f"\nDocument {i}:")
        print(f"  Total Length: {len(doc.text)} characters")
        print(f"  Metadata Keys: {list(doc.metadata.keys())}")
        
        # Preview first 500 characters
        preview = doc.text[:500].replace('\n', ' ')
        print(f"  Preview: {preview}...")

In [10]:
def extract_key_sections():
    """
    Extract and categorize key sections from the document
    """
    print("\n🔍 Document Section Extraction:")
    sections = {}
    
    # Simple section identification (can be enhanced)
    section_keywords = [
        'introduction', 'background', 'method', 'methodology', 
        'results', 'discussion', 'conclusion', 'abstract'
    ]
    
    for doc in documents:
        text = doc.text.lower()
        for keyword in section_keywords:
            if keyword in text:
                # Find section boundaries (basic approach)
                start = text.find(keyword)
                end = text.find('\n\n', start)
                
                if end == -1:
                    end = len(text)
                
                section_text = doc.text[start:end]
                sections[keyword] = section_text.strip()
    
    # Print identified sections
    for section, content in sections.items():
        print(f"\n{section.upper()} (first 300 chars):")
        print(content[:300] + "...")

In [75]:
def generate_document_summary():
    """
    Generate a comprehensive summary using Groq
    """
    print("\n📝 Document Summary Generation:")
    
    summary_prompt = (
        "Provide a comprehensive, academic-style summary of the document. "
        "Include main objectives, key methodologies, significant findings, "
        "and potential implications. Be concise but thorough."
        "how many tables are their and explain the tables"
        "explain each table in detile with respect to the pdf file"
        "explation of numerical data and its implications"
    )
    
    # Combine all text for summary generation
    full_text = " ".join([doc.text for doc in documents])
    
    # Truncate to manageable length if extremely long
    max_length = 26000
    full_text = full_text[:max_length]
    
    # Use Groq to generate summary
    summary_response = llm.complete(
        f"Document Context:\n{full_text}\n\n{summary_prompt}"
    )
    
    print(summary_response.text)

In [74]:
def QnA():
    """
    Generate a comprehensive summary using Groq
    """
    print("\n📝 Document Summary Generation:")
    summary_prompt=input("enter the question")

    
    # Combine all text for summary generation
    full_text = " ".join([doc.text for doc in documents])
    
    # Truncate to manageable length if extremely long
    max_length = 26000
    full_text = full_text[:max_length]
    
    # Use Groq to generate summary
    summary_response = llm.complete(
        f"Document Context:\n{full_text}\n\n{summary_prompt}"
    )
    
    print(summary_response.text)

In [63]:
def keyword_extraction():
    """
    Extract and rank important keywords
    """
    print("\n🏷️ Keyword Extraction:")
    
    from collections import Counter
    import re
    
    # Combine all text
    full_text = " ".join([doc.text for doc in documents])
    
    # Basic keyword extraction (can be improved)
    # Remove common words and punctuation
    words = re.findall(r'\b\w+\b', full_text.lower())
    stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'this'])
    
    # Filter out stop words and short words
    filtered_words = [
        word for word in words 
        if word not in stop_words and len(word) > 3
    ]
    
    # Count and display top keywords
    keyword_counts = Counter(filtered_words)
    top_keywords = keyword_counts.most_common(10)
    
    print("Top Keywords:")
    for keyword, count in top_keywords:
        print(f"  {keyword}: {count} occurrences")

In [54]:
analyze_document_structure()

📄 Document Analysis:
Total Documents Parsed: 17

Document 1:
  Total Length: 4947 characters
  Metadata Keys: []
  Preview: # Chaos-based audio encryption: Efficacy of 2D and 3D hyperchaotic systems  Thejas Haridas, Upasana S.D., Vyshnavi G., Malavika S. Krishnan, Sishu Shankar Muni *  School of Digital Sciences, Digital University Kerala, Thiruvananthapuram, PIN 695317, Kerala, India  # A R T I C L E I N F O  # A B S T R A C T  Keywords: Secure communication in the digital age is necessary; securing audio data becomes very critical since this is normally transmitted across susceptible networks. Traditional approache...

Document 2:
  Total Length: 8209 characters
  Metadata Keys: []
  Preview: # T. Haridas et al.  # Franklin Open 8 (2024) 100158  The audio format should be digitized, that is, changing the analog audio signals to a digital format. This digital data can be treated and encrypted.  The chaotic signal formed in the earlier stage can be used as the lead for encrypting the a

In [55]:
extract_key_sections()


🔍 Document Section Extraction:

INTRODUCTION (first 300 chars):
introduction to a cryptosystem based on chaos for high-resolution digital photos, utilizing an arbitrary precision arithmetic digital chaos generator, running on the Snapdragon Pi 3 SoC [20]. The mod 1023 function [21] is utilized to enhance four chaotic maps for encrypting RGB images in a machine s...

METHOD (first 300 chars):
method based on Chebyshev map, Orclever Proc. Res. Dev. 2 (1) (2023) 28–38....

METHODOLOGY (first 300 chars):
Methodology, Formal analysis, Data curation, Conceptualization....

RESULTS (first 300 chars):
results affirm the suitability of chaotic maps for encryption purposes as they offer high levels of secrecy and security....

CONCLUSION (first 300 chars):
Conclusion...

ABSTRACT (first 300 chars):
abstract, even extremely small variations of the key result in very different outcomes in the end, hence increasing the security. The intrinsic complexity of the chaotic systems also gives an added l

In [76]:
generate_document_summary()


📝 Document Summary Generation:
The document discusses the efficacy of using 2D and 3D hyperchaotic systems for audio encryption. The main objectives are to explore the use of chaotic systems in securing audio data and to evaluate the effectiveness of two distinct encryption schemes based on these systems. The methodologies involve the use of chaotic maps to encrypt audio data, including the 2D quadratic memristor map and the 3D hyperchaotic map. The significant findings include the ability of these systems to provide strong security for audio data while maintaining acceptable audio quality. The potential implications are far-reaching, offering a secure and efficient solution for audio communications in various fields, including telecommunications, military communications, and confidential conferencing systems.


In [61]:
keyword_extraction()


🏷️ Keyword Extraction:
Top Keywords:
  audio: 184 occurrences
  encryption: 176 occurrences
  chaotic: 89 occurrences
  signal: 74 occurrences
  that: 71 occurrences
  encrypted: 65 occurrences
  data: 62 occurrences
  hyperchaotic: 52 occurrences
  original: 48 occurrences
  using: 44 occurrences


In [83]:
QnA()


📝 Document Summary Generation:
The provided process explains how to encrypt an audio file using signal processing and cryptography techniques. This encryption method is outlined below in a logical sequence which is represented in the flowchart 3. First, the original audio file (org_aud) is used to start the encryption process. To separate the raw audio data for additional processing, the audio file’s header — which usually includes metadata like file format and properties — is removed. By eliminating additional potentially identifying information, this step ensures the encryption is restricted to the audio material. After that, the SHA3-512 algorithm [43] is used to hash the isolated audio data, the data of audio is given in the Table 1. A fixed-size, 512-bit hash value that precisely represents the audio file’s contents generated by this cryptographic hash function. The hashing process secures the uniqueness and integrity of the audio by assuring that even slight alterations in the a

In [50]:
def main():
    analyze_document_structure()
    extract_key_sections()
    generate_document_summary()
    keyword_extraction()
    QnA()

In [53]:
main()

📄 Document Analysis:
Total Documents Parsed: 17

Document 1:
  Total Length: 4947 characters
  Metadata Keys: []
  Preview: # Chaos-based audio encryption: Efficacy of 2D and 3D hyperchaotic systems  Thejas Haridas, Upasana S.D., Vyshnavi G., Malavika S. Krishnan, Sishu Shankar Muni *  School of Digital Sciences, Digital University Kerala, Thiruvananthapuram, PIN 695317, Kerala, India  # A R T I C L E I N F O  # A B S T R A C T  Keywords: Secure communication in the digital age is necessary; securing audio data becomes very critical since this is normally transmitted across susceptible networks. Traditional approache...

Document 2:
  Total Length: 8209 characters
  Metadata Keys: []
  Preview: # T. Haridas et al.  # Franklin Open 8 (2024) 100158  The audio format should be digitized, that is, changing the analog audio signals to a digital format. This digital data can be treated and encrypted.  The chaotic signal formed in the earlier stage can be used as the lead for encrypting the a