In [12]:
import fitz  # PyMuPDF
import spacy
import re
import pandas as pd
import nltk
import os
from nltk.tokenize import sent_tokenize, word_tokenize

# Initialize spaCy and NLTK
# The spaCy pipeline is loaded once at module level and reused for every
# paragraph by extract_text_from_pdf, which reads its named entities.
nlp = spacy.load("en_core_web_sm")
# Download the "punkt" tokenizer data; presumably this is what word_tokenize
# (used in the data-cleaning step below) relies on -- confirm against the
# installed NLTK version.
nltk.download("punkt")

def extract_text_from_pdf(pdf_path):
    """Extract cleaned paragraphs and per-paragraph metadata from one PDF.

    Each page's text is split on runs of blank lines into paragraphs. Every
    paragraph has its whitespace collapsed to single spaces and is recorded
    together with the standalone digit runs it contains and the named
    entities the module-level spaCy pipeline ``nlp`` finds in it.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        pandas.DataFrame with one row per paragraph and the columns:
        ``File Name`` (the ``pdf_path`` argument as given), ``Page_number``
        (zero-based page index), ``Paragraphs`` (whitespace-normalized text,
        possibly empty), ``Num_Flag`` (True when at least one digit run was
        found), ``num_list`` (comma-joined digit runs) and ``Keywords``
        (comma-joined entity texts).
    """
    data = {'File Name': [], 'Page_number': [], 'Paragraphs': [], 'Num_Flag': [], 'num_list': [], 'Keywords': []}

    # Open the document as a context manager so the file handle is released
    # even when text extraction or the spaCy pipeline raises part-way through.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            text = pdf_document.load_page(page_num).get_text()

            # Split the page text into paragraphs on one or more blank lines.
            for paragraph in re.split(r'\n\n+', text):
                # Collapse newlines, tabs and repeated spaces into single spaces.
                paragraph = re.sub(r'\s+', ' ', paragraph).strip()

                data['File Name'].append(pdf_path)
                data['Page_number'].append(page_num)
                data['Paragraphs'].append(paragraph)

                # Digit runs delimited by word boundaries, e.g. "12" in
                # "page 12" but not the "12" inside "abc12".
                num_list = re.findall(r'\b\d+\b', paragraph)
                data['Num_Flag'].append(bool(num_list))
                data['num_list'].append(','.join(num_list))

                # Keywords are the named entities spaCy detects (modify as needed).
                doc = nlp(paragraph)
                data['Keywords'].append(','.join(ent.text for ent in doc.ents))

    # Build the DataFrame from the collected column lists.
    return pd.DataFrame(data)

# Process all PDF files in a folder
# NOTE: no trailing backslash here. A raw string literal cannot end in a single
# "\" (it would escape the closing quote and is a SyntaxError), and
# os.path.join inserts the separator anyway.
pdf_folder = r"C:\Users\Admin\Downloads\null"
# Match the extension case-insensitively so files named "*.PDF" are not skipped.
pdf_files = [
    os.path.join(pdf_folder, file)
    for file in os.listdir(pdf_folder)
    if file.lower().endswith('.pdf')
]

# Collect one frame per file and concatenate once at the end; concatenating
# inside the loop re-copies all accumulated rows on every iteration.
frames = []
for pdf_path in pdf_files:
    df = extract_text_from_pdf(pdf_path)
    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no PDFs.
result_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Data Cleaning: re-tokenize each paragraph and join the tokens with single
# spaces. Skipped when no PDF was processed and the column does not exist.
if 'Paragraphs' in result_df.columns:
    result_df['Paragraphs'] = result_df['Paragraphs'].apply(lambda x: ' '.join(word_tokenize(x)))

# Export the processed data to a CSV file
result_df.to_csv('processed_data_1.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
