In [2]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text
from docx import Document
from pptx import Presentation
import re

In [1]:
def count_pages_and_check_figures_pdf(pdf_path):
    """Scan a PDF page by page for figure/table keywords and Thai characters.

    Detection is purely textual: a page counts as having a figure when its
    extracted text contains 'figure' or 'fig.', and as having a table when it
    contains 'table' (case-insensitive substring match). Embedded images and
    table structures are not inspected.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        A 9-tuple ``(page_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``. The ``*_pages`` lists hold 1-based page
        numbers and each count is the number of matching pages.
    """
    reader = PdfReader(pdf_path)
    pages = reader.pages
    page_count = len(pages)

    # The previous version also ran pdfminer's whole-document extract_text()
    # here and threw the result away; that second full parse is dropped.
    thai_pattern = re.compile(r'[\u0E00-\u0E7F]')  # Thai Unicode range

    figure_pages = []
    table_pages = []
    thai_language_pages = []

    for page_number, page in enumerate(pages, start=1):
        # Defensive: treat a missing text result as an empty page.
        page_text = (page.extract_text() or '').lower()

        if 'figure' in page_text or 'fig.' in page_text:
            figure_pages.append(page_number)

        # A plain substring test already covers the word-boundary regex case.
        if 'table' in page_text:
            table_pages.append(page_number)

        if thai_pattern.search(page_text):
            thai_language_pages.append(page_number)

    figure_count = len(figure_pages)
    table_count = len(table_pages)
    has_figures = figure_count > 0
    has_tables = table_count > 0
    has_thai_language = len(thai_language_pages) > 0

    return page_count, has_figures, figure_count, figure_pages, has_tables, table_count, table_pages, has_thai_language, thai_language_pages

def count_pages_and_check_figures_docx(docx_path):
    """Scan a Word document's paragraphs for figure/table keywords and Thai text.

    Args:
        docx_path: Path of the document, handed to ``Document``.

    Returns:
        A 9-tuple ``(page_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``.

        ``page_count`` is the number of ``w:sectPr`` elements matched by the
        XPath query. NOTE(review): that looks like a section count rather than
        a rendered page count -- confirm before relying on it.
        The ``*_pages`` lists hold 1-based paragraph positions, not pages.
    """
    document = Document(docx_path)
    page_count = len(document.element.xpath('//w:sectPr'))

    figure_pages, table_pages, thai_language_pages = [], [], []

    for position, paragraph in enumerate(document.paragraphs, start=1):
        raw_text = paragraph.text
        lowered = raw_text.lower()

        # Keyword-based figure detection
        if 'figure' in lowered or 'fig.' in lowered:
            figure_pages.append(position)

        # Keyword-based table detection
        if 'table' in lowered:
            table_pages.append(position)

        # Any character in the Thai Unicode block
        if re.search(r'[\u0E00-\u0E7F]', raw_text):
            thai_language_pages.append(position)

    figure_count = len(figure_pages)
    table_count = len(table_pages)

    return (
        page_count,
        figure_count > 0,
        figure_count,
        figure_pages,
        table_count > 0,
        table_count,
        table_pages,
        len(thai_language_pages) > 0,
        thai_language_pages,
    )

def count_pages_and_check_figures_pptx(pptx_path):
    """Scan a presentation slide by slide for figure/table keywords and Thai text.

    Only the text of shapes that report a text frame is considered; the text
    of those shapes is joined with newlines and lower-cased per slide.

    Args:
        pptx_path: Path of the presentation, handed to ``Presentation``.

    Returns:
        A 9-tuple ``(slide_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``. The ``*_pages`` lists hold 1-based slide
        numbers and each count is the number of matching slides.
    """
    deck = Presentation(pptx_path)
    slides = deck.slides
    slide_count = len(slides)

    figure_pages, table_pages, thai_language_pages = [], [], []

    for slide_number, slide in enumerate(slides, start=1):
        shape_texts = [shape.text for shape in slide.shapes if shape.has_text_frame]
        slide_text = "\n".join(shape_texts).lower()

        # Keyword-based figure detection
        if any(keyword in slide_text for keyword in ('figure', 'fig.')):
            figure_pages.append(slide_number)

        # Keyword-based table detection (the substring test subsumes the
        # word-boundary regex, so the outcome is the same)
        if 'table' in slide_text:
            table_pages.append(slide_number)

        # Any character in the Thai Unicode block
        if re.search(r'[\u0E00-\u0E7F]', slide_text):
            thai_language_pages.append(slide_number)

    figure_count = len(figure_pages)
    table_count = len(table_pages)

    return (
        slide_count,
        figure_count > 0,
        figure_count,
        figure_pages,
        table_count > 0,
        table_count,
        table_pages,
        len(thai_language_pages) > 0,
        thai_language_pages,
    )

def process_files_in_subfolders(root_folder):
    """Walk ``root_folder`` recursively and analyse every PDF, DOCX and PPTX file.

    File extensions are matched case-insensitively, so ``REPORT.PDF`` is
    handled like ``report.pdf``. Files whose name starts with ``~$`` (the
    owner/lock files Office drops next to an open document) are skipped
    because they are not real documents and cannot be parsed.

    Args:
        root_folder: Directory to walk with ``os.walk``.

    Returns:
        A list with one dict per analysed file. Keys: ``file_name``,
        ``file_type`` and the nine metrics returned by the matching
        ``count_pages_and_check_figures_*`` function.
    """
    # (lower-case suffix, label written to 'file_type', analyser)
    analyzers = (
        ('.pdf', 'PDF', count_pages_and_check_figures_pdf),
        ('.docx', 'Word Document', count_pages_and_check_figures_docx),
        ('.pptx', 'PowerPoint', count_pages_and_check_figures_pptx),
    )
    metric_names = (
        'page_count', 'has_figures', 'figure_count', 'figure_pages',
        'has_tables', 'table_count', 'table_pages',
        'has_thai_language', 'thai_language_pages',
    )

    results = []

    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if file.startswith('~$'):
                continue  # Office lock file, not a document

            lowered = file.lower()
            match = next((entry for entry in analyzers if lowered.endswith(entry[0])), None)
            if match is None:
                continue  # Skip non-PDF/Word/PowerPoint files

            _suffix, file_type, analyze = match
            metrics = analyze(os.path.join(subdir, file))

            row = {'file_name': file, 'file_type': file_type}
            row.update(zip(metric_names, metrics))
            results.append(row)

    return results

def main():
    """Analyse the sample document folder and save the findings as CSV.

    Runs ``process_files_in_subfolders`` on the hard-coded root folder, loads
    the per-file records into a DataFrame and writes it to ``results.csv`` in
    the current working directory, without the index column.
    """
    # Replace with your root folder path
    root_folder = r'C:\Users\(Satang)ChanikarnNik\OneDrive - STelligence Co., Ltd\Documents\GitHub\thaioil\Sample_Doc'

    records = process_files_in_subfolders(root_folder)
    pd.DataFrame(records).to_csv('results.csv', index=False)


# Run the analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()


KeyboardInterrupt: 

In [11]:
def count_pages_and_check_figures_pdf(pdf_path):
    """Scan a PDF page by page for figure/table keywords and Thai characters.

    A page is flagged for figures when its extracted text contains 'figure'
    or 'fig.', and for tables when it contains 'table' (case-insensitive
    substring match). Only text is examined; images and table structures are
    not.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        A 9-tuple ``(page_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``. The ``*_pages`` lists hold 1-based page
        numbers and each count is the number of matching pages.
    """
    reader = PdfReader(pdf_path)
    pages = reader.pages
    page_count = len(pages)

    # No whole-document pdfminer extract_text() call: its result was never
    # used and it parsed every file a second time.
    thai_pattern = re.compile(r'[\u0E00-\u0E7F]')  # Thai Unicode range

    figure_pages = []
    table_pages = []
    thai_language_pages = []

    for page_number, page in enumerate(pages, start=1):
        # Defensive: treat a missing text result as an empty page.
        page_text = (page.extract_text() or '').lower()

        if 'figure' in page_text or 'fig.' in page_text:
            figure_pages.append(page_number)

        # The substring test alone decides; the old regex could never add a hit.
        if 'table' in page_text:
            table_pages.append(page_number)

        if thai_pattern.search(page_text):
            thai_language_pages.append(page_number)

    figure_count = len(figure_pages)
    table_count = len(table_pages)
    has_figures = figure_count > 0
    has_tables = table_count > 0
    has_thai_language = len(thai_language_pages) > 0

    return page_count, has_figures, figure_count, figure_pages, has_tables, table_count, table_pages, has_thai_language, thai_language_pages

def count_pages_and_check_figures_docx(docx_path):
    """Scan a Word document's paragraphs for figure/table keywords and Thai text.

    Args:
        docx_path: Path of the document, handed to ``Document``.

    Returns:
        A 9-tuple ``(page_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``.

        ``page_count`` is the number of ``w:sectPr`` elements matched by the
        XPath query. NOTE(review): that looks like a section count rather than
        a rendered page count -- confirm before relying on it.
        The ``*_pages`` lists hold 1-based paragraph positions, not pages.
    """
    document = Document(docx_path)
    page_count = len(document.element.xpath('//w:sectPr'))

    figure_pages, table_pages, thai_language_pages = [], [], []

    for position, paragraph in enumerate(document.paragraphs, start=1):
        raw_text = paragraph.text
        lowered = raw_text.lower()

        # Keyword-based figure detection
        if 'figure' in lowered or 'fig.' in lowered:
            figure_pages.append(position)

        # Keyword-based table detection
        if 'table' in lowered:
            table_pages.append(position)

        # Any character in the Thai Unicode block
        if re.search(r'[\u0E00-\u0E7F]', raw_text):
            thai_language_pages.append(position)

    figure_count = len(figure_pages)
    table_count = len(table_pages)

    return (
        page_count,
        figure_count > 0,
        figure_count,
        figure_pages,
        table_count > 0,
        table_count,
        table_pages,
        len(thai_language_pages) > 0,
        thai_language_pages,
    )

def count_pages_and_check_figures_pptx(pptx_path):
    """Scan a presentation slide by slide for figure/table keywords and Thai text.

    Only the text of shapes that report a text frame is considered; the text
    of those shapes is joined with newlines and lower-cased per slide.

    Args:
        pptx_path: Path of the presentation, handed to ``Presentation``.

    Returns:
        A 9-tuple ``(slide_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``. The ``*_pages`` lists hold 1-based slide
        numbers and each count is the number of matching slides.
    """
    deck = Presentation(pptx_path)
    slides = deck.slides
    slide_count = len(slides)

    figure_pages, table_pages, thai_language_pages = [], [], []

    for slide_number, slide in enumerate(slides, start=1):
        shape_texts = [shape.text for shape in slide.shapes if shape.has_text_frame]
        slide_text = "\n".join(shape_texts).lower()

        # Keyword-based figure detection
        if any(keyword in slide_text for keyword in ('figure', 'fig.')):
            figure_pages.append(slide_number)

        # Keyword-based table detection (the substring test subsumes the
        # word-boundary regex, so the outcome is the same)
        if 'table' in slide_text:
            table_pages.append(slide_number)

        # Any character in the Thai Unicode block
        if re.search(r'[\u0E00-\u0E7F]', slide_text):
            thai_language_pages.append(slide_number)

    figure_count = len(figure_pages)
    table_count = len(table_pages)

    return (
        slide_count,
        figure_count > 0,
        figure_count,
        figure_pages,
        table_count > 0,
        table_count,
        table_pages,
        len(thai_language_pages) > 0,
        thai_language_pages,
    )

def process_files_in_subfolders(root_folder):
    """Walk ``root_folder`` recursively and analyse PDF, Word and PowerPoint files.

    File extensions are matched case-insensitively. Files whose name starts
    with ``~$`` (Office owner/lock files) are skipped.

    Legacy ``.doc`` files are still handed to the DOCX analyser, but that
    analyser is built on python-docx, which reads ``.docx`` packages only.
    A ``.doc`` that cannot be opened is therefore recorded with ``None`` for
    every metric instead of aborting the whole walk.

    Args:
        root_folder: Directory to walk with ``os.walk``.

    Returns:
        A list with one dict per recognised file. Keys: ``subfolder_name``
        (base name of the containing directory), ``file_name``, ``file_type``
        and the nine metrics returned by the matching
        ``count_pages_and_check_figures_*`` function.
    """
    # (lower-case suffix, label written to 'file_type', analyser)
    analyzers = (
        ('.pdf', 'PDF', count_pages_and_check_figures_pdf),
        ('.docx', 'Word Document (docx)', count_pages_and_check_figures_docx),
        ('.doc', 'Word Document (doc)', count_pages_and_check_figures_docx),
        ('.pptx', 'PowerPoint', count_pages_and_check_figures_pptx),
    )
    metric_names = (
        'page_count', 'has_figures', 'figure_count', 'figure_pages',
        'has_tables', 'table_count', 'table_pages',
        'has_thai_language', 'thai_language_pages',
    )

    results = []

    for subdir, _, files in os.walk(root_folder):
        subfolder_name = os.path.basename(subdir)  # Get the name of the subfolder
        for file in files:
            if file.startswith('~$'):
                continue  # Office lock file, not a document

            lowered = file.lower()
            match = next((entry for entry in analyzers if lowered.endswith(entry[0])), None)
            if match is None:
                continue  # Skip non-PDF/Word/PowerPoint files

            suffix, file_type, analyze = match
            file_path = os.path.join(subdir, file)

            if suffix == '.doc':
                # Broad catch on purpose: the exact exception raised for a
                # binary .doc depends on the parser; any failure here means
                # "could not be analysed", and the file is still listed.
                try:
                    metrics = analyze(file_path)
                except Exception:
                    metrics = (None,) * len(metric_names)
            else:
                metrics = analyze(file_path)

            row = {
                'subfolder_name': subfolder_name,
                'file_name': file,
                'file_type': file_type,
            }
            row.update(zip(metric_names, metrics))
            results.append(row)

    return results

def main(folder='Sample_Doc', base_dir=r'C:\Users\(Satang)ChanikarnNik\OneDrive - STelligence Co., Ltd\Documents\GitHub\thaioil'):
    """Analyse one document folder and save the findings as ``results_<folder>.csv``.

    Both arguments default to the values that used to be hard-coded, so
    calling ``main()`` with no arguments behaves as before.

    Args:
        folder: Name of the folder below ``base_dir`` to analyse; also used
            in the output file name.
        base_dir: Directory that contains ``folder``. The two are joined with
            a backslash, exactly as the original hard-coded path was built.
    """
    root_folder = fr'{base_dir}\{folder}'

    file_results = process_files_in_subfolders(root_folder)

    df = pd.DataFrame(file_results)

    # Written to the current working directory, without the index column.
    df.to_csv(f'results_{folder}.csv', index=False)


# Run the analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()


In [None]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text
from docx import Document
from pptx import Presentation

def count_pages_and_check_figures_pdf(pdf_path):
    """Count a PDF's pages and list the pages whose text mentions a figure.

    A page is flagged when its extracted text contains 'figure' or 'fig.'
    (case-insensitive substring match). Embedded images are not inspected.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        A 4-tuple ``(page_count, has_figures, figure_count, figure_pages)``
        where ``figure_pages`` holds 1-based page numbers and
        ``figure_count`` is the number of matching pages.
    """
    reader = PdfReader(pdf_path)
    pages = reader.pages
    page_count = len(pages)

    # The whole-document pdfminer extract_text() call that used to sit here
    # is gone: its result was never read and it parsed the file a second time.
    figure_pages = []
    for page_number, page in enumerate(pages, start=1):
        # Defensive: treat a missing text result as an empty page.
        page_text = (page.extract_text() or '').lower()
        if 'figure' in page_text or 'fig.' in page_text:
            figure_pages.append(page_number)

    figure_count = len(figure_pages)
    has_figures = figure_count > 0

    return page_count, has_figures, figure_count, figure_pages

def count_pages_and_check_figures_docx(docx_path):
    """Count figure mentions in a Word document's paragraphs.

    Args:
        docx_path: Path of the document, handed to ``Document``.

    Returns:
        A 4-tuple ``(page_count, has_figures, figure_count, figure_pages)``.

        ``page_count`` is the number of ``w:sectPr`` elements matched by the
        XPath query. NOTE(review): that looks like a section count rather than
        a rendered page count -- confirm before relying on it.
        ``figure_pages`` holds 1-based paragraph positions (not pages) of the
        paragraphs containing 'figure' or 'fig.'; ``figure_count`` is how many
        such paragraphs there are.
    """
    document = Document(docx_path)
    page_count = len(document.element.xpath('//w:sectPr'))

    # One pass collects the matching paragraph positions; the count follows.
    figure_pages = []
    for position, paragraph in enumerate(document.paragraphs, start=1):
        lowered = paragraph.text.lower()
        if 'figure' in lowered or 'fig.' in lowered:
            figure_pages.append(position)

    figure_count = len(figure_pages)
    has_figures = figure_count > 0

    return page_count, has_figures, figure_count, figure_pages

def count_pages_and_check_figures_pptx(pptx_path):
    """Count figure mentions in a presentation's text shapes.

    Args:
        pptx_path: Path of the presentation, handed to ``Presentation``.

    Returns:
        A 4-tuple ``(slide_count, has_figures, figure_count, figure_pages)``.
        ``figure_count`` is the number of text shapes whose text contains
        'figure' or 'fig.' (case-insensitive); ``figure_pages`` holds the
        1-based numbers of the slides that contain at least one such shape.
    """
    presentation = Presentation(pptx_path)
    slide_count = len(presentation.slides)

    def mentions_figure(shape):
        # Guard first. The original condition was `A and B or C`, which
        # parses as `(A and B) or C`, so `shape.text` was read even for
        # shapes that have no text frame.
        if not shape.has_text_frame:
            return False
        shape_text = shape.text.lower()
        return 'figure' in shape_text or 'fig.' in shape_text

    figure_count = 0
    figure_pages = []  # Slides with figures

    for slide_number, slide in enumerate(presentation.slides, start=1):
        matching_shapes = sum(1 for shape in slide.shapes if mentions_figure(shape))
        if matching_shapes:
            figure_count += matching_shapes
            figure_pages.append(slide_number)

    has_figures = figure_count > 0

    return slide_count, has_figures, figure_count, figure_pages

def process_files_in_subfolders(root_folder):
    """Walk ``root_folder`` recursively and analyse every PDF, DOCX and PPTX file.

    File extensions are matched case-insensitively, so ``REPORT.PDF`` is
    handled like ``report.pdf``. Files whose name starts with ``~$`` (the
    owner/lock files Office drops next to an open document) are skipped
    because they are not real documents and cannot be parsed.

    Args:
        root_folder: Directory to walk with ``os.walk``.

    Returns:
        A list with one dict per analysed file. Keys: ``file_name``,
        ``file_path``, ``file_type``, ``page_count``, ``has_figures``,
        ``figure_count`` and ``figure_pages``.
    """
    # (lower-case suffix, label written to 'file_type', analyser)
    analyzers = (
        ('.pdf', 'PDF', count_pages_and_check_figures_pdf),
        ('.docx', 'Word Document', count_pages_and_check_figures_docx),
        ('.pptx', 'PowerPoint', count_pages_and_check_figures_pptx),
    )

    results = []

    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if file.startswith('~$'):
                continue  # Office lock file, not a document

            lowered = file.lower()
            match = next((entry for entry in analyzers if lowered.endswith(entry[0])), None)
            if match is None:
                continue  # Skip non-PDF/Word/PowerPoint files

            _suffix, file_type, analyze = match
            file_path = os.path.join(subdir, file)
            page_count, has_figures, figure_count, figure_pages = analyze(file_path)

            results.append({
                'file_name': file,
                'file_path': file_path,
                'file_type': file_type,
                'page_count': page_count,
                'has_figures': has_figures,
                'figure_count': figure_count,
                'figure_pages': figure_pages
            })

    return results

def main():
    """Analyse the configured root folder, show the results and save them as CSV.

    Runs ``process_files_in_subfolders`` on the placeholder root folder,
    prints the resulting DataFrame and writes it to
    ``document_analysis_results.csv`` in the current working directory,
    without the index column.
    """
    # Replace with your root folder path
    root_folder = 'your_root_folder_path_here'

    # Collect the per-file records into a single table
    df = pd.DataFrame(process_files_in_subfolders(root_folder))

    # Show the table, then persist it
    print(df)
    df.to_csv('document_analysis_results.csv', index=False)


# Run the analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()


In [3]:
def count_pages_and_check_figures_pdf(pdf_path):
    """Scan a PDF page by page for figure/table keywords and Thai characters.

    Matching is text-only: 'figure' or 'fig.' flags a figure page and 'table'
    flags a table page (case-insensitive substring match on the extracted
    page text). Images and table structures are not inspected.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        A 9-tuple ``(page_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``. The ``*_pages`` lists hold 1-based page
        numbers and each count is the number of matching pages.
    """
    reader = PdfReader(pdf_path)
    pages = reader.pages
    page_count = len(pages)

    # Removed: an unused whole-document pdfminer extract_text() call that
    # parsed every PDF a second time.
    thai_pattern = re.compile(r'[\u0E00-\u0E7F]')  # Thai Unicode range

    figure_pages = []
    table_pages = []
    thai_language_pages = []

    for page_number, page in enumerate(pages, start=1):
        # Defensive: treat a missing text result as an empty page.
        page_text = (page.extract_text() or '').lower()

        if 'figure' in page_text or 'fig.' in page_text:
            figure_pages.append(page_number)

        # The substring test already implies every word-boundary match.
        if 'table' in page_text:
            table_pages.append(page_number)

        if thai_pattern.search(page_text):
            thai_language_pages.append(page_number)

    figure_count = len(figure_pages)
    table_count = len(table_pages)
    has_figures = figure_count > 0
    has_tables = table_count > 0
    has_thai_language = len(thai_language_pages) > 0

    return page_count, has_figures, figure_count, figure_pages, has_tables, table_count, table_pages, has_thai_language, thai_language_pages

def count_pages_and_check_figures_docx(docx_path):
    """Scan a Word document's paragraphs for figure/table keywords and Thai text.

    Args:
        docx_path: Path of the document, handed to ``Document``.

    Returns:
        A 9-tuple ``(page_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``.

        ``page_count`` is the number of ``w:sectPr`` elements matched by the
        XPath query. NOTE(review): that looks like a section count rather than
        a rendered page count -- confirm before relying on it.
        The ``*_pages`` lists hold 1-based paragraph positions, not pages.
    """
    document = Document(docx_path)
    page_count = len(document.element.xpath('//w:sectPr'))

    figure_pages, table_pages, thai_language_pages = [], [], []

    for position, paragraph in enumerate(document.paragraphs, start=1):
        raw_text = paragraph.text
        lowered = raw_text.lower()

        # Keyword-based figure detection
        if 'figure' in lowered or 'fig.' in lowered:
            figure_pages.append(position)

        # Keyword-based table detection
        if 'table' in lowered:
            table_pages.append(position)

        # Any character in the Thai Unicode block
        if re.search(r'[\u0E00-\u0E7F]', raw_text):
            thai_language_pages.append(position)

    figure_count = len(figure_pages)
    table_count = len(table_pages)

    return (
        page_count,
        figure_count > 0,
        figure_count,
        figure_pages,
        table_count > 0,
        table_count,
        table_pages,
        len(thai_language_pages) > 0,
        thai_language_pages,
    )

def count_pages_and_check_figures_pptx(pptx_path):
    """Scan a presentation slide by slide for figure/table keywords and Thai text.

    Only the text of shapes that report a text frame is considered; the text
    of those shapes is joined with newlines and lower-cased per slide.

    Args:
        pptx_path: Path of the presentation, handed to ``Presentation``.

    Returns:
        A 9-tuple ``(slide_count, has_figures, figure_count, figure_pages,
        has_tables, table_count, table_pages, has_thai_language,
        thai_language_pages)``. The ``*_pages`` lists hold 1-based slide
        numbers and each count is the number of matching slides.
    """
    deck = Presentation(pptx_path)
    slides = deck.slides
    slide_count = len(slides)

    figure_pages, table_pages, thai_language_pages = [], [], []

    for slide_number, slide in enumerate(slides, start=1):
        shape_texts = [shape.text for shape in slide.shapes if shape.has_text_frame]
        slide_text = "\n".join(shape_texts).lower()

        # Keyword-based figure detection
        if any(keyword in slide_text for keyword in ('figure', 'fig.')):
            figure_pages.append(slide_number)

        # Keyword-based table detection (the substring test subsumes the
        # word-boundary regex, so the outcome is the same)
        if 'table' in slide_text:
            table_pages.append(slide_number)

        # Any character in the Thai Unicode block
        if re.search(r'[\u0E00-\u0E7F]', slide_text):
            thai_language_pages.append(slide_number)

    figure_count = len(figure_pages)
    table_count = len(table_pages)

    return (
        slide_count,
        figure_count > 0,
        figure_count,
        figure_pages,
        table_count > 0,
        table_count,
        table_pages,
        len(thai_language_pages) > 0,
        thai_language_pages,
    )

def process_files_in_subfolders(root_folder):
    """Walk ``root_folder`` recursively, analyse supported files and print progress.

    A first walk counts every file so the progress line can show
    ``(processed/total)``; the total includes files that end up skipped.
    File extensions are matched case-insensitively. Files whose name starts
    with ``~$`` (Office owner/lock files) are reported as skipped instead of
    being handed to a parser that cannot read them.

    Args:
        root_folder: Directory to walk with ``os.walk``.

    Returns:
        A list with one dict per analysed file. Keys: ``file_name``,
        ``subfolder_name`` (base name of the containing directory),
        ``file_type`` and the nine metrics returned by the matching
        ``count_pages_and_check_figures_*`` function.
    """
    # (lower-case suffix, label written to 'file_type', analyser)
    analyzers = (
        ('.pdf', 'PDF', count_pages_and_check_figures_pdf),
        ('.docx', 'Word Document', count_pages_and_check_figures_docx),
        ('.pptx', 'PowerPoint', count_pages_and_check_figures_pptx),
    )
    metric_names = (
        'page_count', 'has_figures', 'figure_count', 'figure_pages',
        'has_tables', 'table_count', 'table_pages',
        'has_thai_language', 'thai_language_pages',
    )

    results = []

    # First, count the total number of files to process for progress tracking
    total_files = sum(len(files) for _, _, files in os.walk(root_folder))

    processed_files = 0

    for subdir, _, files in os.walk(root_folder):
        subfolder_name = os.path.basename(subdir)  # Get the name of the subfolder
        for file in files:
            file_path = os.path.join(subdir, file)
            processed_files += 1
            print(f"Processing {file} ({processed_files}/{total_files})...")

            lowered = file.lower()
            match = next((entry for entry in analyzers if lowered.endswith(entry[0])), None)
            if match is None or file.startswith('~$'):
                print(f"Skipping {file} (Unsupported file type)")
                continue  # Skip non-PDF/Word/PowerPoint files and lock files

            _suffix, file_type, analyze = match
            metrics = analyze(file_path)

            row = {
                'file_name': file,
                'subfolder_name': subfolder_name,
                'file_type': file_type,
            }
            row.update(zip(metric_names, metrics))
            results.append(row)

    return results

def main():
    """Analyse the sample document folder, show the results and save them as CSV.

    Prints a start message, runs ``process_files_in_subfolders`` on the
    hard-coded root folder, prints the resulting DataFrame, writes it to
    ``results_analysis.csv`` in the current working directory (no index
    column) and prints a completion message naming that file.
    """
    root_folder = r'C:\Users\(Satang)ChanikarnNik\OneDrive - STelligence Co., Ltd\Documents\GitHub\thaioil\Sample_Doc'  # Replace with your root folder path
    # Single source of truth for the output name, so the completion message
    # cannot drift from the file actually written (it used to say 'results.csv').
    output_csv = 'results_analysis.csv'

    print(f"Starting the document analysis in {root_folder}")
    file_results = process_files_in_subfolders(root_folder)

    # Convert the results to a DataFrame
    df = pd.DataFrame(file_results)

    # Display the DataFrame
    print(df)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)
    print(f"Analysis complete! Results saved to '{output_csv}'.")


# Run the analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Starting the document analysis in C:\Users\(Satang)ChanikarnNik\OneDrive - STelligence Co., Ltd\Documents\GitHub\thaioil\Sample_Doc
Processing API 1004_120.40-095.pdf (1/33)...
Processing ASME B16.5-2003 Pipe Flanges-Flanged Fittings NPS 12 Through NPS24 MetricInch Standard_120.70-082.pdf (2/33)...
Processing ASTM Cetane Method For Rating Diesel Fuels 1963_120.80-006.pdf (3/33)...
Processing IEC 112_120.180-158.pdf (4/33)...
Processing R1713001-M7008-24-8301-1502-REV0.pdf (5/33)...
Processing R1814001-M0020-75-0001-0084-rev1.pdf (6/33)...
Processing EN-QTD-45_Integrity Operating Windows Control Plan for U-7500.docx (7/33)...
Processing Enqpr04_Integrity Operating Windows Procedure.pdf (8/33)...
Processing Enqpr05_Spare Part Management Procedure.pdf (9/33)...
Processing MPOF-QWI-750101_START UP SHELL SULFOLANE UNIT (U-7500) WORK INSTRUCTION.docx (10/33)...
Processing MPOF-QWI-750201 NORMAL SHUTDOWN SULFOLANE UNIT U7500 WORK INSTRUCTION.docx (11/33)...
Processing Mpofqpr03_Product Storag