In [None]:
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import os
import io
import camelot

# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to point at a different install;
# when it is unset, the previously hard-coded Windows path is used unchanged.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

def extract_text_from_images(pdf_path, output_filename, max_pages=600):
    """OCR the first pages of a PDF and append the recognised text to a file.

    Each processed page is saved as a JPEG under ``images/<pdf name>/`` and
    then passed to ``pytesseract.image_to_string``. The recognised text is
    written to ``output_filename`` between a page header line and a footer
    line.

    Args:
        pdf_path: Path of the PDF to process.
        output_filename: Text file that receives the OCR output. It is opened
            in append mode, so running this twice on the same file duplicates
            the content.
        max_pages: Maximum number of pages to process. ``None`` processes
            every page; values below 1 process nothing.

    Returns:
        None. Results are written to disk and a completion message is printed.
    """
    if max_pages is not None and max_pages < 1:
        # Nothing was requested: skip rendering, but still create the image
        # directory and the output file below, as the original code did.
        pages = []
    else:
        # Ask pdf2image to stop at max_pages so a long PDF is not rendered in
        # full; the slice keeps the cap even if more pages come back.
        pages = convert_from_path(pdf_path, last_page=max_pages)[:max_pages]

    # Derive the name with os.path so only the real extension is removed
    # (any case) and the platform's path separators are honoured.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]

    # Create sub-directory to store images (name capped at 510 characters,
    # carried over from the original implementation).
    sub_dir = os.path.join("images", stem[:510])
    os.makedirs(sub_dir, exist_ok=True)

    # Extract text using pytesseract and save to output file
    with open(output_filename, 'a', encoding='utf8') as f:
        for pg_cntr, page in enumerate(pages, start=1):
            image_path = os.path.join(sub_dir, f"pg_{pg_cntr}_{stem}.jpg")
            page.save(image_path, 'JPEG')
            f.write(f"\n======================================================== PAGE {pg_cntr} ========================================================\n")
            text = pytesseract.image_to_string(image_path)
            f.write(text + "\n")
            f.write("======================================================== ========================= ========================================================\n")
    print(f"Text extraction completed. Results saved to {output_filename}")

def extract_tables_from_pdf(pdf_path):
    """Extract tables from a PDF with Camelot and dump each one to CSV.

    Tables are read from all pages with the 'stream' flavor. Each table is
    written to ``table_<n>.csv`` (numbered from 1) in the current working
    directory and its ``df`` attribute is printed. A message is printed
    instead when nothing is found.

    Args:
        pdf_path: Path of the PDF handed to ``camelot.read_pdf``.
    """
    # The 'lattice' flavor is an alternative worth trying.
    found = camelot.read_pdf(pdf_path, pages='all', flavor='stream')

    # Guard clause: nothing to export.
    if not found:
        print("No tables found in the PDF.")
        return

    for number, tbl in enumerate(found, start=1):
        # One CSV per table, numbered from 1.
        target = f'table_{number}.csv'
        tbl.to_csv(target)
        print(f"Table {number} saved to {target}")
        # Also show the table contents.
        print(f"Table {number} as DataFrame:\n", tbl.df)
        
def main():
    """Run the OCR step on the configured PDF.

    Calls ``extract_text_from_images`` on 'SanstarLimited_RHP.pdf', writing
    the text to 'results.txt' and processing at most 600 pages. The table
    extraction step is currently disabled.
    """
    source_pdf = 'SanstarLimited_RHP.pdf'  # Replace with your PDF file path
    ocr_output = "results.txt"

    # Step 1: OCR the rendered PDF pages into the text file.
    extract_text_from_images(source_pdf, ocr_output, max_pages=600)

    # Step 2 (disabled): extract tables from the PDF using Camelot
    #extract_tables_from_pdf(source_pdf)

# Run the OCR pipeline when this code executes as the main module.
if __name__ == "__main__":
    main()


In [1]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch

# Function to load text from a file and split it into chunks (e.g., paragraphs or sentences)
def load_text_from_file(filename):
    """Read a UTF-8 text file and split it into non-empty paragraph chunks.

    The text is split on blank lines (double newlines). Each chunk is stripped
    of surrounding whitespace, and chunks that are empty afterwards are
    dropped, so runs of several blank lines do not produce empty entries.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of non-empty strings, in file order. The list is empty when the
        file contains only whitespace.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
    # Splitting by double newline assumes paragraphs are separated by blank
    # lines; adjust this if the text is structured differently.
    stripped_chunks = (piece.strip() for piece in text.split('\n\n'))
    # Drop blank chunks so they are never embedded or returned as context.
    return [chunk for chunk in stripped_chunks if chunk]

# Function to vectorize text
def vectorize_text(text_list, model_name='all-MiniLM-L6-v2'):
    """Embed a list of texts with a SentenceTransformer model.

    Args:
        text_list: The texts to encode.
        model_name: Name passed to ``SentenceTransformer``. Defaults to
            'all-MiniLM-L6-v2', the model this function always used before.

    Returns:
        A tuple ``(embeddings, model)``: the result of
        ``model.encode(text_list, convert_to_tensor=True)`` and the model
        instance, so callers can encode queries with the same model.
    """
    # A new model instance is created on every call.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(text_list, convert_to_tensor=True)
    return embeddings, model

# Function to retrieve relevant text based on cosine similarity
def retrieve_relevant_text(query, text_list, embeddings, model):
    """Return the entry of ``text_list`` most similar to ``query``.

    The query is encoded with ``model`` and compared to ``embeddings`` by
    cosine similarity. The index of the highest score selects the text.

    Args:
        query: The question or search string.
        text_list: Candidate texts.
        embeddings: Embeddings of the candidates. Assumes one row per entry
            of ``text_list``, in the same order (TODO confirm with callers).
        model: Object whose ``encode`` method produces the query embedding.

    Returns:
        The element of ``text_list`` at the best-scoring index.
    """
    # Embed the query as a one-item batch.
    query_vec = model.encode([query], convert_to_tensor=True)

    # Score the query against the stored embeddings.
    scores = torch.nn.functional.cosine_similarity(query_vec, embeddings)

    # Pick the highest-scoring candidate.
    top_index = int(torch.argmax(scores))
    return text_list[top_index]

# Function to generate response using the LLM
def generate_response_with_llm(query, relevant_text, llm_pipeline):
    """Ask the LLM pipeline to answer ``query`` given ``relevant_text``.

    Builds a "Question / Context / Answer:" prompt, calls ``llm_pipeline``
    with it, and returns the ``'generated_text'`` field of the first result.

    Args:
        query: The user's question.
        relevant_text: Context to include in the prompt.
        llm_pipeline: Callable that takes the prompt and returns a sequence
            of dicts with a ``'generated_text'`` key.

    Returns:
        The ``'generated_text'`` value of the first returned item.
    """
    # Question first, then the retrieved context, then the answer cue.
    prompt = f"Question: {query}\n\nContext: {relevant_text}\n\nAnswer:"

    # Only the first result is used.
    first_candidate = llm_pipeline(prompt)[0]
    return first_candidate['generated_text']

# Main RAG analyzer function
def rag_analyzer(query, text_list, embeddings, llm_pipeline, model):
    """Answer ``query`` by retrieving context and then generating a response.

    Args:
        query: The user's question.
        text_list: Candidate context texts.
        embeddings: Embeddings passed through to ``retrieve_relevant_text``.
        llm_pipeline: Pipeline passed through to ``generate_response_with_llm``.
        model: Encoder passed through to ``retrieve_relevant_text``.

    Returns:
        Whatever ``generate_response_with_llm`` returns for the query and the
        retrieved context.
    """
    # Retrieval feeds straight into generation.
    return generate_response_with_llm(
        query,
        retrieve_relevant_text(query, text_list, embeddings, model),
        llm_pipeline,
    )

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Step 1: Load the extracted text from 'results.txt'
text_list = load_text_from_file('results.txt')

# Step 2: Vectorize the extracted text and get the model
embeddings, model = vectorize_text(text_list)

In [4]:
    # Step 3: Initialize the LLM text-generation pipeline (finance-Llama3-8B, not GPT-2)
# NOTE: the recorded run of this cell was interrupted while the model shards
# were downloading, so `llm_pipeline` is undefined until this cell completes.
llm_pipeline = pipeline("text-generation", model="instruction-pretrain/finance-Llama3-8B")

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00005-of-00007.safetensors:  59%|#####9    | 2.86G/4.83G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/2.57G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
    # Example query
query = "What are the key financial insights from the report?"

# Step 4: Call the RAG system to retrieve the context and generate the response.
# Uses text_list, embeddings, model and llm_pipeline defined in earlier cells.
response = rag_analyzer(query, text_list, embeddings, llm_pipeline, model)

# Print the final response
print(response)