In [1]:
print("rag")

rag


In [1]:
import os
import re
from PyPDF2 import PdfReader
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import os
import logging
from datetime import datetime


def clean_text(text):
    """Clean the text by removing non-alphabetic characters and converting to lowercase."""
    text = re.sub(r'\W+', ' ', text)
    return text.lower()

def sentence_tokenize(text):
    """Tokenize text into sentences using basic punctuation rules."""
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return [sentence.strip() for sentence in sentences if len(sentence) > 10]  # Ignore very short sentences

def word_tokenize(text):
    """Tokenize text into words."""
    words = re.findall(r'\b\w+\b', text)
    return words

def score_sentences(text):
    """Score sentences based on word frequency and other features."""
    sentences = sentence_tokenize(text)
    words = word_tokenize(text) 
    word_freq = Counter(words)

    sentence_scores = {}
    for sentence in sentences:
        sentence_word_count = word_tokenize(sentence)
        score = sum(word_freq[word] for word in sentence_word_count)
        sentence_scores[sentence] = score

    return sentence_scores



def summarize_text(text, num_sentences=5):
    """Summarize text by extracting the highest-scoring sentences, with a word limit."""
    # Tokenize the input text to determine its word count
    total_word_count = len(word_tokenize(text))
    
    # Set word limit based on the length of the input text
    if total_word_count > 4000:
        max_words = 100
    elif 2000 <= total_word_count <= 4000:
        max_words = 80
    else:
        max_words = 50

    # Score sentences based on their importance
    sentence_scores = score_sentences(text)
    sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)

    
    # Add sentences while keeping total word count under max_words
    summary_sentences = []
    word_count = 0
    for sentence in sorted_sentences:
        sentence_word_count = len(word_tokenize(sentence))

        # If a sentence is longer than the remaining word count, truncate it
        if sentence_word_count > max_words - word_count:
            remaining_words = max_words - word_count
            truncated_sentence = ' '.join(word_tokenize(sentence)[:remaining_words])
            summary_sentences.append(truncated_sentence)
            break
        else:
            summary_sentences.append(sentence)
            word_count += sentence_word_count
            

        # Stop adding sentences if the word limit is reached
        if word_count >= max_words:
            break
    
    return ' '.join(summary_sentences)

    


def extract_keywords(text, num_keywords=5):
    """Extracts top keywords using TF-IDF"""
    try:
        # Initialize the TF-IDF vectorizer
        tfidf = TfidfVectorizer(max_features=num_keywords, stop_words='english')
        tfidf_matrix = tfidf.fit_transform([text])
        
        # Get feature names (the keywords) and their TF-IDF scores
        feature_names = tfidf.get_feature_names_out()
        
        return list(feature_names)
    
    except Exception as e:
        logging.error(f"Error extracting keywords: {str(e)}")
        return ["No Keywords Found"]

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file using PyPDF2."""
    try:
        text=""
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            # text = ""
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:  # Only add non-empty text
                    text += page_text
                    
            if not text.strip():  # Check if no text was extracted
                logging.warning(f"No text extracted from {pdf_path}")
        # print("The text is: ",text)
        return text  # Return stripped text to avoid empty results
    except Exception as e:
        logging.error(f"Error reading {pdf_path}: {str(e)}")
        return ""

def process_pdf_for_summary_and_keywords(pdf_path, num_sentences=5, top_n=10):
    """Process a single PDF for summarization and keyword extraction."""
    pdf_text = extract_text_from_pdf(pdf_path)
    # print("after the text is extracted::   ",pdf_text)

     # Check if the extracted text is valid before proceeding
    if not pdf_text:  # If no text was extracted, return an error message
        logging.error(f"Failed to extract text from {pdf_path}")
        return None, None
    
    cleaned_text = clean_text(pdf_text)
    

    if not cleaned_text:
        logging.error(f"Cleand text is empty for {pdf_path}")
        return None, None
    summary = summarize_text(cleaned_text, num_sentences)
 
    
    keywords = extract_keywords(cleaned_text, top_n)
    
    return summary, keywords

def process_multiple_pdfs_for_summary_and_keywords(folder_path, max_workers=4):
    """Process multiple PDFs in parallel for summarization and keyword extraction."""
    pdf_summaries_keywords = {}

    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_pdf = {}
        for pdf_file in pdf_files:
            pdf_path = os.path.join(folder_path, pdf_file)
            future = executor.submit(process_pdf_for_summary_and_keywords, pdf_path)
            future_to_pdf[future] = pdf_file

        for future in as_completed(future_to_pdf):
            pdf_name = future_to_pdf[future]
            try:
                summary, keywords = future.result()
                if summary is None and keywords is None:
                    pdf_summaries_keywords[pdf_name] = {"error": "Failed to extract text"}
                else:
                    pdf_summaries_keywords[pdf_name] = {
                        "summary": summary,
                        "keywords": keywords
                    }
            except Exception as e:
                pdf_summaries_keywords[pdf_name] = {
                    "error": str(e)
                }

    return pdf_summaries_keywords




#-------------------------------------------------------------------


def process_pdf(pdf_name, folder_path):
    """Simulates processing a PDF by extracting metadata, summarizing, and updating MongoDB."""
    start_time = time.time()  # Start the timer for this document

    # Store initial metadata
    pdf_id = store_metadata_in_mongodb(pdf_name, folder_path)
    
    if pdf_id is None:
        logging.error(f"Failed to store metadata for {pdf_name}")
        return
    
       # Extract text, summarize and get keywords for the current PDF
    pdf_path = os.path.join(folder_path, pdf_name)
    summary, keywords = process_pdf_for_summary_and_keywords(pdf_path)
    
    if summary is None and keywords is None:
        logging.error(f"Failed to extract summary and keywords for {pdf_name}")
        return

    # Simulate document processing (e.g., text extraction, summarization, keyword extraction)
    # In real code, replace this with actual logic
    # summary = f"Summary of {pdf_name}"  # Placeholder for actual summary
    # summary = result['summary']
    # keywords= result['keywords']
    # keywords = ["keyword1", "keyword2"]  # Placeholder for actual keywords

    # Update MongoDB with summary and keywords
    update_mongodb_with_summary_and_keywords(pdf_id, summary, keywords)

    # Measure time taken for this document
    time_taken = time.time() - start_time
    logging.info(f"Processed {pdf_name} in {time_taken:.2f} seconds")

    return time_taken



def process_pdfs_concurrently(folder_path, max_workers=5):
    """Process multiple PDFs concurrently."""
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    total_start_time = time.time()  # Start time for all PDFs
    total_time_taken = 0
    futures = []

    # Use ThreadPoolExecutor for concurrency
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for pdf_file in pdf_files:
            # Submit each PDF to the executor
            future = executor.submit(process_pdf, pdf_file, folder_path)
            futures.append(future)

        # Wait for all futures to complete
        for future in as_completed(futures):
            try:
                time_taken = future.result()
                total_time_taken += time_taken
            except Exception as e:
                logging.error(f"Error processing PDF: {str(e)}")

    # Calculate total time for all PDFs
    total_time = time.time() - total_start_time
    avg_time_per_doc = total_time_taken / len(pdf_files) if pdf_files else 0

    logging.info(f"Total processing time for all PDFs: {total_time:.2f} seconds")
    logging.info(f"Average time per document: {avg_time_per_doc:.2f} seconds")
    logging.info(f"Processed {len(pdf_files)} documents in parallel")
    
    # Return performance metrics
    return {
        "total_time": total_time,
        "average_time_per_doc": avg_time_per_doc,
        "total_documents": len(pdf_files)
    }


#----------------------------------------------------------------------


# Folder path where the PDF files are stored
folder_path = 'D:\AI_ML\RAG\data'

# Process all PDFs in the folder for summarization and keyword extraction
all_pdf_results = process_multiple_pdfs_for_summary_and_keywords(folder_path)
                            
# Print results for each PDF
for pdf_name, result in all_pdf_results.items():
    if "error" in result:
        print(f"Error processing {pdf_name}: {result['error']}")
    else:
        print(f"Summary for {pdf_name}:\n{result['summary']}\n")
        print(f"Keywords for {pdf_name}: {', '.join(result['keywords'])}\n")
        print("="*80 + "\n")


Summary for attention_removed.pdf:
provided proper attribution is provided google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works attention is all you need ashish vaswani google brain avaswani google comnoam shazeer google brain noam google comniki parmar google research nikip google comjakob

Keywords for attention_removed.pdf: attention, google, input, model, models, neural, output, positions, sequence, transformer


Summary for ML.pdf:
see discussions st ats and author pr ofiles f or this public ation at https www researchgate ne t public ation 339031674 getting started with machine learning ml article februar y 2020 citation 1reads 3 680 1 author rukshan manor athna univ ersity of colombo 14 publica tions 7 citations see profile all c ontent f ollo wing this p age was uplo aded b y rukshan manor athna on 04 f ebruar y 2020 the user has r equest

Keywords for ML.pdf: algorithm, algorithms, data, learning, m

In [2]:

import os
import logging
from datetime import datetime
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


# MongoDB connection string with credentials
connection_string = os.getenv("MONGO_URL")

# Connect to the MongoDB client
client = MongoClient(connection_string)

# Select the database and collection
db = client["pdf_database"]
collection = db["pdf_documents"]

# Setting up logging for error tracking
logging.basicConfig(filename='pdf_processing_errors.log', level=logging.INFO)

# Check MongoDB connection
try:
    client.admin.command('ping')
    
    logging.info("MongoDB connection successful!")
except Exception as e:
    logging.error(f"Error connecting to MongoDB: {str(e)}")
    exit(1)

def store_metadata_in_mongodb(pdf_name, folder_path):
    """Store initial metadata about the PDF file in MongoDB."""
    pdf_path = os.path.join(folder_path, pdf_name)
    try:
        file_stats = os.stat(pdf_path)
        pdf_metadata = {
            "pdf_name": pdf_name,
            "file_size": file_stats.st_size,
            "upload_date": datetime.now(),
            "status": "pending",
            "summary": None,
            "keywords": None
        }
        result = collection.insert_one(pdf_metadata)
        logging.info(f"Successfully stored metadata for {pdf_name} in MongoDB with ID: {result.inserted_id}")
        return result.inserted_id
    except Exception as e:
        logging.error(f"Error storing metadata for {pdf_name}: {str(e)}")
        return None

def update_mongodb_with_summary_and_keywords(pdf_id, summary, keywords):
    """Update MongoDB document with the summary and keywords."""
    try:
        collection.update_one(
            {"_id": pdf_id},
            {"$set": {
                "summary": summary,
                "keywords": keywords,
                "status": "processed"
            }}
        )
        logging.info(f"Successfully updated MongoDB document with ID: {pdf_id}")
    except Exception as e:
        logging.error(f"Error updating MongoDB for document ID {pdf_id}: {str(e)}")

In [3]:
data =process_pdfs_concurrently(folder_path, max_workers=5)
print(data)

{'total_time': 2.601963758468628, 'average_time_per_doc': 1.6288970947265624, 'total_documents': 5}
