In [1]:
from llama_parse import LlamaParse
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from llama_index.postprocessor.cohere_rerank import CohereRerank
from getpass import getpass
from llama_index.llms.groq import Groq


In [2]:
import os

import nest_asyncio

# Allow nested event loops inside the notebook kernel.
nest_asyncio.apply()

# Credentials required by this notebook:
#   LLAMA_CLOUD_API_KEY - API access to llama-cloud (LlamaParse)
#   OPENAI_API_KEY      - OpenAI API, used for embeddings
#   GROQ_API_KEY        - Groq API, used for the generation model
#
# Keys already present in the environment are kept as-is; only missing ones
# are requested interactively via `getpass` (imported in the first cell), so
# no secret is ever written into the notebook source and a valid key is never
# overwritten by a placeholder.
for key_name in ("LLAMA_CLOUD_API_KEY", "OPENAI_API_KEY", "GROQ_API_KEY"):
    if not os.environ.get(key_name):
        os.environ[key_name] = getpass(f"{key_name}: ")

In [3]:
# Model identifiers: one passed to the OpenAI embedding client, one passed to
# the Groq LLM client below.
EMBEDDING_MODEL  = "text-embedding-3-small"
GENERATION_MODEL = "llama3-groq-8b-8192-tool-use-preview"

# Instantiate the generation and embedding clients from the identifiers above.
llm = Groq(model=GENERATION_MODEL)
embed_model = OpenAIEmbedding(model=EMBEDDING_MODEL)

# Assign both clients to the shared llama_index `Settings` object.
Settings.llm = llm
Settings.embed_model = embed_model

In [4]:
# Input paper, given as a bare file name.
pdf_file_name = "1-s2.0-S2773186324000884-main.pdf"

# Free-text hint describing the document; passed to the parser in the next cell.
parsing_instructions = (
    "This is a pdf document with text table and image data "
    "and it is a research paper"
)

In [5]:
# Parse the PDF into markdown documents.
# The LlamaParse option is spelled `parsing_instruction` (singular); the
# previous plural keyword is not a declared option, so the hint was not being
# applied to the parse job.
documents = LlamaParse(
    result_type="markdown",
    parsing_instruction=parsing_instructions,
).load_data(pdf_file_name)

Started parsing the file under job_id 062f7b73-6c6f-413f-9ff4-7a13a716b0aa


In [6]:
# Sanity check: show the first 1000 characters of the first parsed document.
print(documents[0].text[:1000])

# Chaos-based audio encryption: Efficacy of 2D and 3D hyperchaotic systems

Thejas Haridas, Upasana S.D., Vyshnavi G., Malavika S. Krishnan, Sishu Shankar Muni *

School of Digital Sciences, Digital University Kerala, Thiruvananthapuram, PIN 695317, Kerala, India

# A R T I C L E I N F O

# A B S T R A C T

Keywords: Secure communication in the digital age is necessary; securing audio data becomes very critical since this is normally transmitted across susceptible networks. Traditional approaches to encryption, like Advanced Encryption Standard and Rivest–Shamir–Adleman, are pretty solid but mostly too computationally intensive for real-time audio applications. This paper presents a new audio encryption scheme using chaotic systems, characterized by high sensitivity to initial conditions, pseudo-randomness, and determinism. These properties make chaotic systems ideal for generating keys for cryptographic purposes. Indeed, complex keys that would be nearly impossible to reverse-engineer

In [7]:
# Build the markdown element parser directly from its constructor.
# Chaining `.from_defaults()` onto an already-built instance creates a second,
# default-configured parser and throws away the `llm` and `num_workers`
# arguments supplied here.
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8)

In [8]:
# Run the markdown element parser over every parsed document.
nodes = node_parser.get_nodes_from_documents(documents)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
3it [00:00, ?it/s]
3it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
2it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, ?it/s]
3it [00:00, ?it/s]
3it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [9]:
# Split the parsed nodes into two collections using the parser's own helper.
# The following cells read `.text` from `base_nodes` entries and `.obj.text`
# from `objects` entries.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [10]:
# Show the full text of the third base node.
print(base_nodes[2].text)

The mod 1023 function [21] is utilized to enhance four chaotic maps for encrypting RGB images in a machine scheme through message queuing telemetry transport on Wifi and the Internet. A built-in cryptosystem utilizing chaos principles with a voice recognition access key that is executed on an FPGA platform [22]. Also, the implementations consist of independent graphical user interfaces that can be utilized on various devices such as microcomputers, computers, etc. A pseudorandom generator based on two-dimensional Henon-Sine hyperchaotic map for microcontrollers is an illustration [23], and also a secure communication system implemented on a microcontroller [24] using a five-term 3D chaotic map [25].

In recent academic works, various research studies have delved into different methods to protect speech. A unique two-phase speech encryption method, utilizes an innovative chaotic map, 2D-LMSM, Fast Fourier Transform, and the Discrete Wavelet Transform [26]. This strategy has shown impres

In [11]:
# Show the full text of the node wrapped by the first extracted object.
print(objects[0].obj.text)

Audio files with their sizes and durations.,
with the following columns:
- Audio file: None
- File size: None
- Duration (s): None

|Audio file|File size|Duration (s)|
|---|---|---|
|CantinaBand3|129 KB|3|
|PinkPanther60|2.52 MB|60|
|StarWars60|2.52 MB|60|



In [27]:
# PDF Post-Parsing Analysis and Interaction

import os
import nest_asyncio
nest_asyncio.apply()

# Importing required libraries
from llama_parse import LlamaParse
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.groq import Groq
from llama_index.embeddings.openai import OpenAIEmbedding


# Model and Parsing Setup
def setup_models(
    generation_model="llama3-groq-8b-8192-tool-use-preview",
    embedding_model="text-embedding-3-small",
):
    """Create the LLM and embedding clients and install them in `Settings`.

    Args:
        generation_model: Model name passed to `Groq(model=...)`. Defaults to
            the model this notebook was written against.
        embedding_model: Model name passed to `OpenAIEmbedding(model=...)`.
            Defaults to the model this notebook was written against.

    Returns:
        tuple: `(llm, embed_model)`, the same objects that were assigned to
        `Settings.llm` and `Settings.embed_model`.
    """
    llm = Groq(model=generation_model)
    embed_model = OpenAIEmbedding(model=embedding_model)

    # Side effect: both clients are assigned to the shared Settings object.
    Settings.llm = llm
    Settings.embed_model = embed_model

    return llm, embed_model

# Parse PDF
def parse_pdf(pdf_path):
    """Parse a PDF with LlamaParse and return the resulting documents.

    Args:
        pdf_path: Path of the PDF file handed to `LlamaParse.load_data`.

    Returns:
        Whatever `LlamaParse(...).load_data(pdf_path)` returns; the rest of
        this notebook treats it as a sequence of objects with `.text` and
        `.metadata`.
    """
    parsing_instructions = (
        "This is a pdf document with text, tables, and image data. "
        "Extract detailed information carefully."
    )
    # The LlamaParse option is `parsing_instruction` (singular). The plural
    # spelling used previously is not a declared option, so the hint was not
    # being applied to the parse job.
    return LlamaParse(
        result_type="markdown",
        parsing_instruction=parsing_instructions,
    ).load_data(pdf_path)

# Setup models and parse PDF
# These module-level names (`llm`, `documents`) are read as globals by the
# analysis functions defined below.
llm, embed_model = setup_models()
pdf_path = "1-s2.0-S2773186324000884-main.pdf"
documents = parse_pdf(pdf_path)

# Document Analysis Functions
def analyze_document_structure(docs=None):
    """
    Print a structural overview of the parsed documents.

    For each document this reports the character count of its text, the keys
    of its metadata mapping, and a single-line preview of the first 500
    characters.

    Args:
        docs: Optional sequence of objects exposing `text` (str) and
            `metadata` (mapping). When omitted, the notebook-level
            `documents` variable is used, so existing zero-argument calls
            behave exactly as before.

    Returns:
        None. Everything is written to stdout.
    """
    if docs is None:
        docs = documents

    print("📄 Document Analysis:")
    print(f"Total Documents Parsed: {len(docs)}")

    for i, doc in enumerate(docs, 1):
        print(f"\nDocument {i}:")
        print(f"  Total Length: {len(doc.text)} characters")
        print(f"  Metadata Keys: {list(doc.metadata.keys())}")

        # Flatten newlines so the preview stays on one output line.
        preview = doc.text[:500].replace('\n', ' ')
        print(f"  Preview: {preview}...")

def extract_key_sections(docs=None):
    """
    Locate well-known section keywords and print the paragraph that follows.

    For each keyword, the first case-insensitive occurrence across all
    documents is used. The extracted span runs from that occurrence to the
    next blank line (or to the end of the document text).

    Args:
        docs: Optional sequence of objects exposing `text` (str). When
            omitted, the notebook-level `documents` variable is used, so
            existing zero-argument calls keep working.

    Returns:
        None. The first 300 characters of each section are written to stdout.
    """
    import re

    if docs is None:
        docs = documents

    print("\n🔍 Document Section Extraction:")
    sections = {}

    # Simple section identification (can be enhanced)
    section_keywords = [
        'introduction', 'background', 'method', 'methodology',
        'results', 'discussion', 'conclusion', 'abstract'
    ]

    for doc in docs:
        for keyword in section_keywords:
            # Keep the earliest hit: later pages usually only *mention* a
            # section, and must not overwrite the one already captured.
            if keyword in sections:
                continue

            # Search the original text case-insensitively. Offsets found in a
            # lowercased copy can drift from the original when lowercasing
            # changes a character's length, which corrupted the slice below.
            match = re.search(re.escape(keyword), doc.text, flags=re.IGNORECASE)
            if match is None:
                continue

            # Find section boundaries (basic approach): up to the next blank line.
            start = match.start()
            end = doc.text.find('\n\n', start)
            if end == -1:
                end = len(doc.text)

            sections[keyword] = doc.text[start:end].strip()

    # Print identified sections
    for section, content in sections.items():
        print(f"\n{section.upper()} (first 300 chars):")
        print(content[:300] + "...")

def generate_document_summary(docs=None, max_length=10000):
    """
    Generate a summary of the documents with the notebook-level `llm`.

    The document texts are joined with single spaces, truncated to
    `max_length` characters, and sent to `llm.complete` together with a fixed
    summarisation prompt. The response text is printed.

    Args:
        docs: Optional sequence of objects exposing `text` (str). When
            omitted, the notebook-level `documents` variable is used.
        max_length: Maximum number of characters of document text included in
            the prompt. Defaults to 10000, the value previously hard-coded.

    Returns:
        None. The summary is written to stdout.

    Raises:
        Whatever `llm.complete` raises, for example an authentication error
        when the API key is invalid.
    """
    if docs is None:
        docs = documents

    print("\n📝 Document Summary Generation:")

    summary_prompt = (
        "Provide a comprehensive, academic-style summary of the document. "
        "Include main objectives, key methodologies, significant findings, "
        "and potential implications. Be concise but thorough."
    )

    # Combine all text, then truncate to a manageable prompt size.
    full_text = " ".join(doc.text for doc in docs)[:max_length]

    summary_response = llm.complete(
        f"Document Context:\n{full_text}\n\n{summary_prompt}"
    )

    print(summary_response.text)

def keyword_extraction(docs=None, top_n=10):
    """
    Print the most frequent words across the documents.

    Words are lowercased `\\w+` tokens. Tokens in a small stop-word list and
    tokens of three characters or fewer are ignored.

    Args:
        docs: Optional sequence of objects exposing `text` (str). When
            omitted, the notebook-level `documents` variable is used.
        top_n: Number of keywords to print. Defaults to 10, the value
            previously hard-coded.

    Returns:
        None. The ranking is written to stdout.
    """
    from collections import Counter
    import re

    if docs is None:
        docs = documents

    print("\n🏷️ Keyword Extraction:")

    # Combine all text
    full_text = " ".join(doc.text for doc in docs)

    # Basic keyword extraction (can be improved)
    words = re.findall(r'\b\w+\b', full_text.lower())
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on',
        'at', 'to', 'for', 'of', 'with', 'by',
    }

    # Count words that are neither stop words nor short (<= 3 characters).
    keyword_counts = Counter(
        word for word in words
        if word not in stop_words and len(word) > 3
    )

    print("Top Keywords:")
    for keyword, count in keyword_counts.most_common(top_n):
        print(f"  {keyword}: {count} occurrences")

# Main Execution
def main():
    """Run every analysis step once, in the order they are defined above."""
    pipeline = (
        analyze_document_structure,
        extract_key_sections,
        generate_document_summary,
        keyword_extraction,
    )
    for step in pipeline:
        step()

# Run analysis
# The recorded output below shows this guard was satisfied when the cell was
# executed, so `main()` runs as part of the cell.
if __name__ == "__main__":
    main()

# Interactive Exploration
# Uncomment and run individual functions as needed
# analyze_document_structure()
# extract_key_sections()
# generate_document_summary()
# keyword_extraction()

Started parsing the file under job_id b86edf5d-a809-4052-8f2d-e7ba349adb35
📄 Document Analysis:
Total Documents Parsed: 17

Document 1:
  Total Length: 4947 characters
  Metadata Keys: []
  Preview: # Chaos-based audio encryption: Efficacy of 2D and 3D hyperchaotic systems  Thejas Haridas, Upasana S.D., Vyshnavi G., Malavika S. Krishnan, Sishu Shankar Muni *  School of Digital Sciences, Digital University Kerala, Thiruvananthapuram, PIN 695317, Kerala, India  # A R T I C L E I N F O  # A B S T R A C T  Keywords: Secure communication in the digital age is necessary; securing audio data becomes very critical since this is normally transmitted across susceptible networks. Traditional approache...

Document 2:
  Total Length: 8209 characters
  Metadata Keys: []
  Preview: # T. Haridas et al.  # Franklin Open 8 (2024) 100158  The audio format should be digitized, that is, changing the analog audio signals to a digital format. This digital data can be treated and encrypted.  The chaotic sign

AuthenticationError: Error code: 401 - {'error': {'message': 'Invalid API Key', 'type': 'invalid_request_error', 'code': 'invalid_api_key'}}