In [1]:
#pip install pymongo PyPDF2 pytesseract pdf2image pillow pymongo
# sudo apt-get install tesseract-ocr (terminal)

In [2]:
import os
import pytesseract
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
from pymongo import MongoClient
from PIL import Image
import re

In [3]:
# Setup MongoDB connection
client = MongoClient('localhost', 27017)
db = client['PDFs']
original_collection = db['pdf_contents']
cleaned_collection = db['cleaned_pdf_contents']
# Path to the folder containing PDF files
pdf_folder_path = 'pdf_data'

In [4]:
# Path to Tesseract executable (for Linux)
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

In [5]:
def extract_text_from_pdf(pdf_path):
    try:
        # Attempt to extract text using PdfReader
        reader = PdfReader(pdf_path)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path} using PdfReader: {e}")
        return None

def extract_text_from_image(pdf_path):
    try:
        # Convert PDF pages to images
        images = convert_from_path(pdf_path)
        text = ''
        for image in images:
            text += pytesseract.image_to_string(image)
        return text
    except Exception as e:
        print(f"Error extracting text from images in {pdf_path} using Tesseract: {e}")
        return None

In [6]:
def clean_text(content):
    # Replace special characters
    replacements = {
        '\u00e9': 'é', '\u00e0': 'à', '\u00e8': 'è', '\u00f4': 'ô', '\u00e7': 'ç',
        '\u00fb': 'û', '\u00ea': 'ê', '\u00e2': 'â', '\u00ef': 'ï', '\u00ee': 'î',
        '\u00e1': 'á', '\u00f3': 'ó', '\u00fa': 'ú', '\u00f1': 'ñ', '\u00e4': 'ä',
        '\u00f6': 'ö', '\u00fc': 'ü', '\u00e3': 'ã', '\u00f5': 'õ'
    }
    for orig, repl in replacements.items():
        content = content.replace(orig, repl)
    
    # Remove special characters (except French accents)
    content = re.sub(r'[^\w\s\'éàèôçûêâïîáóúñäöüãõ]', '', content)
    
    # Replace multiple spaces with a single space
    content = re.sub(r'\s+', ' ', content)
    
    # Remove leading and trailing spaces
    content = content.strip()
    
    return content

def clean_title(title):
    # Clean the title similarly to the content
    title = re.sub(r'[^\w\s\'éàèôçûêâïîáóúñäöüãõ]', '', title)
    title = re.sub(r'\s+', ' ', title)
    title = title.strip()
    return title


In [7]:
def process_pdf_files(folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith('.pdf'):
            pdf_path = os.path.join(folder_path, file_name)
            print(f"Processing {pdf_path}...")
            
            text = extract_text_from_pdf(pdf_path)
            if not text:  # If PdfReader failed or returned empty
                text = extract_text_from_image(pdf_path)
            
            if text:
                # Ensure proper encoding
                text = text.encode('utf-8', 'replace').decode('utf-8')
                file_name_encoded = file_name.encode('utf-8', 'replace').decode('utf-8')
                
                # Store in MongoDB
                pdf_data = {
                    'title': file_name_encoded,
                    'content': text,
                    'metadata': None
                }
                original_collection.insert_one(pdf_data)
                print(f"Stored {file_name} in MongoDB.")
            else:
                print(f"Failed to extract content from {file_name}.")

In [8]:
def process_documents():
    documents = original_collection.find()
    
    for document in documents:
        content = document['content']
        title = document['title']
        
        # Clean and preprocess the content and title
        cleaned_content = clean_text(content)
        cleaned_title = clean_title(title)
        
        # Create the cleaned document
        cleaned_document = {
            'title': cleaned_title,
            'content': cleaned_content,
            'metadata': None
        }
        
        # Store the cleaned document in the new collection
        cleaned_collection.insert_one(cleaned_document)
        
        print(f"Stored cleaned document with title: {cleaned_title}")


In [9]:
# Run the scripts
process_pdf_files(pdf_folder_path)
process_documents()

Processing pdf_data/Atelier5-1 Spark.pdf...
Stored Atelier5-1 Spark.pdf in MongoDB.
Processing pdf_data/Artificial_Intelligence_of_Things_AIoT_i.pdf...
Stored Artificial_Intelligence_of_Things_AIoT_i.pdf in MongoDB.
Processing pdf_data/Installation Cloudera QuikStrats sur VM Virtual Box.pdf...
Stored Installation Cloudera QuikStrats sur VM Virtual Box.pdf in MongoDB.
Processing pdf_data/Cours 1 Architecture_Big Data Fondements de BIG DATA.pdf...
Stored Cours 1 Architecture_Big Data Fondements de BIG DATA.pdf in MongoDB.
Processing pdf_data/KMeans(Données 2D).pdf...
Stored KMeans(Données 2D).pdf in MongoDB.
Processing pdf_data/Atelier YARN.pdf...
Stored Atelier YARN.pdf in MongoDB.
Processing pdf_data/Atelier Hive.pdf...
Stored Atelier Hive.pdf in MongoDB.
Processing pdf_data/Smart_Transport_and_Logistics_a_Node-RED_implement.pdf...
Stored Smart_Transport_and_Logistics_a_Node-RED_implement.pdf in MongoDB.
Processing pdf_data/Cours 4 Architecture_Big Data Hadoop2 YARN.pdf...
Stored Cours

In [10]:
import json

def fetch_documents_and_store_in_json(collection, output_file):
    documents = collection.find({}, {'_id': 0, 'title': 1, 'content': 1, 'metadata': 1})
    
    document_list = list(documents)
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(document_list, f, ensure_ascii=False, indent=4)
    
    print(f"Stored documents in {output_file}")

# Define the output JSON file path
output_file_path = 'pdf_data/pdf_contents.json'

# Fetch documents and store in JSON file
fetch_documents_and_store_in_json(original_collection, output_file_path)

Stored documents in pdf_data/pdf_contents.json
