In [1]:
import os
import json
from pymongo import MongoClient
from download import Download
from pdf import PDFProcessor
from summarizer import summarizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Siddharth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# MongoDB setup
# The connection URI may be overridden with the MONGODB_URI environment
# variable; when unset, the local default server is used.
client = MongoClient(os.environ.get('MONGODB_URI', 'mongodb://localhost:27017/'))
db = client['PDFScratcher']
metadata_collection = db['PDF_Metadata']
summaries_collection = db['Summaries']

# Paths
# The two machine-specific locations keep their original values as defaults
# but can be redirected through environment variables, so the notebook can be
# run on another machine without editing this cell.
json_path = os.environ.get(
    "PDF_LINKS_JSON", r"E://GitHub//AI-Internship-Task//Dataset.json"
)  # JSON file containing PDF links
download_dir = os.environ.get(
    "PDF_DOWNLOAD_DIR", r"E:/Random Python Scripts/wasserstoff/Dataset"
)  # Directory to store downloaded PDFs
output_metadata_json = "metadata.json"  # JSON file for storing metadata
output_summaries_json = "summaries.json" # JSON file for storing summaries

In [3]:
# Download PDFs
def download_pdfs():
    """Run the download step of the pipeline.

    Builds a ``Download`` from the module-level ``json_path`` and
    ``download_dir`` and invokes its ``download_pdfs()`` method, printing a
    status line before and after. The closing message is printed whenever
    that call returns without raising; this function does not itself verify
    what was downloaded.
    """
    print("Starting PDF download process...")
    Download(json_path, download_dir).download_pdfs()
    print("All PDFs downloaded successfully.")

# Process PDFs to extract metadata and store in MongoDB and JSON
def process_and_store_metadata():
    """Run the metadata step: ingest PDFs, then export the metadata to JSON.

    A ``PDFProcessor`` is created for the module-level ``client`` together
    with the database and collection names taken from the configured ``db``
    and ``metadata_collection`` handles. It is asked to ingest
    ``download_dir`` and then to write its results to
    ``output_metadata_json``.
    """
    print("Processing PDFs for metadata...")
    # Read the names off the configured handles instead of repeating the
    # string literals, so this step cannot drift from the setup cell.
    processor = PDFProcessor(client, db.name, metadata_collection.name)
    processor.ingest_pdfs(download_dir)

    # Save metadata to JSON
    processor.save_to_json(output_metadata_json)
    print(f"Metadata saved to JSON: {output_metadata_json}")

# Process PDFs to generate summaries and extract keywords
def process_summaries():
    """Run the summarization step and export the stored results to JSON.

    Calls ``summarizer.process_pdfs_in_directory(download_dir)``, then reads
    every document from ``summaries_collection`` (presumably populated by the
    summarizer -- confirm against the ``summarizer`` module) and writes them
    to ``output_summaries_json`` as an indented JSON array.
    """
    print("Generating summaries and keywords...")
    summarizer.process_pdfs_in_directory(download_dir)

    # Retrieve summaries from MongoDB; "_id" is projected out of the results.
    documents = list(summaries_collection.find({}, {"_id": 0}))

    # Serialize completely in memory before touching the file: json.dump
    # streams into the handle, so an unserializable value would otherwise
    # leave a truncated summaries file behind. default=str is a fallback for
    # values json cannot encode natively (e.g. datetimes).
    payload = json.dumps(documents, indent=4, default=str)
    with open(output_summaries_json, 'w', encoding='utf-8') as f:
        f.write(payload)
    print(f"Summaries and keywords saved to JSON: {output_summaries_json}")

In [4]:
# Main function to orchestrate the tasks
def main():
    """Run the pipeline stages in order, then report completion.

    The stages are ``download_pdfs``, ``process_and_store_metadata`` and
    ``process_summaries``; an exception in any stage propagates and stops
    the remaining ones.
    """
    stages = (download_pdfs, process_and_store_metadata, process_summaries)
    for run_stage in stages:
        run_stage()
    print("All tasks completed successfully.")

In [5]:
# Entry point: start the pipeline only when this code executes as the main
# module, so importing it elsewhere does not trigger the run.
if __name__ == "__main__":
    main()

Starting PDF download process...




Downloaded: pdf1.pdf




Downloaded: pdf2.pdf




Downloaded: pdf3.pdf




Downloaded: pdf4.pdf




Downloaded: pdf5.pdf




Downloaded: pdf6.pdf




Downloaded: pdf7.pdf




Downloaded: pdf8.pdf




Downloaded: pdf9.pdf




Downloaded: pdf10.pdf




Downloaded: pdf11.pdf




Downloaded: pdf12.pdf




Downloaded: pdf13.pdf




Downloaded: pdf14.pdf




Downloaded: pdf15.pdf




Downloaded: pdf16.pdf




Downloaded: pdf17.pdf




Downloaded: pdf18.pdf
All PDFs downloaded successfully.
Processing PDFs for metadata...
Successfully processed 'pdf1.pdf'.
Successfully processed 'pdf10.pdf'.
Successfully processed 'pdf11.pdf'.
Successfully processed 'pdf12.pdf'.
Successfully processed 'pdf13.pdf'.
Successfully processed 'pdf14.pdf'.
Successfully processed 'pdf15.pdf'.
Successfully processed 'pdf16.pdf'.
Successfully processed 'pdf17.pdf'.
Successfully processed 'pdf18.pdf'.
Successfully processed 'pdf2.pdf'.
Successfully processed 'pdf3.pdf'.
Successfully processed 'pdf4.pdf'.
Successfully processed 'pdf5.pdf'.
Successfully processed 'pdf6.pdf'.
Successfully processed 'pdf7.pdf'.
Successfully processed 'pdf8.pdf'.
Successfully processed 'pdf9.pdf'.
Metadata saved to 'metadata.json'.
Metadata saved to JSON: metadata.json
Generating summaries and keywords...
Reading PDF files from directory: E:/Random Python Scripts/wasserstoff/Dataset
Processing PDF: E:/Random Python Scripts/wasserstoff/Dataset\pdf1.pdf
Extracting tex

  warn(message % (words_count, sentences_count))


Generating summary...
Extracting keywords...
Storing summary and keywords for: E:/Random Python Scripts/wasserstoff/Dataset\pdf16.pdf
Successfully processed: E:/Random Python Scripts/wasserstoff/Dataset\pdf16.pdf
Processing PDF: E:/Random Python Scripts/wasserstoff/Dataset\pdf4.pdf
Extracting text from: E:/Random Python Scripts/wasserstoff/Dataset\pdf4.pdf
Generating summary...
Extracting keywords...
Storing summary and keywords for: E:/Random Python Scripts/wasserstoff/Dataset\pdf10.pdf
Successfully processed: E:/Random Python Scripts/wasserstoff/Dataset\pdf10.pdf
Processing PDF: E:/Random Python Scripts/wasserstoff/Dataset\pdf5.pdf
Extracting text from: E:/Random Python Scripts/wasserstoff/Dataset\pdf5.pdf
Generating summary...
Extracting keywords...
Storing summary and keywords for: E:/Random Python Scripts/wasserstoff/Dataset\pdf5.pdf
Successfully processed: E:/Random Python Scripts/wasserstoff/Dataset\pdf5.pdf
Processing PDF: E:/Random Python Scripts/wasserstoff/Dataset\pdf6.pdf
E

  warn(message % (words_count, sentences_count))


Generating summary...
Extracting keywords...
Storing summary and keywords for: E:/Random Python Scripts/wasserstoff/Dataset\pdf4.pdf
Successfully processed: E:/Random Python Scripts/wasserstoff/Dataset\pdf4.pdf
Generating summary...


  warn(message % (words_count, sentences_count))


Extracting keywords...
Storing summary and keywords for: E:/Random Python Scripts/wasserstoff/Dataset\pdf2.pdf
Successfully processed: E:/Random Python Scripts/wasserstoff/Dataset\pdf2.pdf
Extracting keywords...
Storing summary and keywords for: E:/Random Python Scripts/wasserstoff/Dataset\pdf3.pdf
Successfully processed: E:/Random Python Scripts/wasserstoff/Dataset\pdf3.pdf
Summaries and keywords saved to JSON: summaries.json
All tasks completed successfully.
