In [None]:
from parse_documents import *

# filepath = 'upload/Jean-Baptiste_Perronneau_-_Magdaleine_Pinceloup_de_la_Grange,_née_de_Parseval.jpg'  # Replace with the path of your file
# filepath = "upload/World_Energy_By_Country_And_Region_1965_to_2023.csv"
filepath = 'upload/Diabetic-retinopathy-identification-using-parallel-convo_2023_Expert-Systems.pdf'
parse_document(filepath)



In [7]:
from tika import parser

def parse_document(filepath):
    try:
        parsed_document = parser.from_file(filepath)
        
        # Check if parsing was successful
        if 'status' in parsed_document and parsed_document['status'] == 200:
            print(f"Parsing successful: {parsed_document['status']}")
        else:
            print(f"Parsing failed with status: {parsed_document.get('status')}")
            return
        
        # Print the type and keys of the parsed document
        print(type(parsed_document))
        print(parsed_document.keys())

        # Extract and print metadata if available
        metadata = parsed_document.get('metadata', {})
        if metadata:
            print("Metadata:")
            print(metadata)
        else:
            print("No metadata found.")
        
        # Print content if needed (commented to avoid large outputs)
        # print(parsed_document.get('content', 'No content available'))
        
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
parse_document("upload/Jean-Baptiste_Perronneau_-_Magdaleine_Pinceloup_de_la_Grange,_née_de_Parseval.jpg")


Parsing successful: 200
<class 'dict'>
dict_keys(['metadata', 'content', 'status'])
Metadata:
{'Number of Tables': '4 Huffman tables', 'File Modified Date': 'Mon Sep 09 22:50:54 +00:00 2024', 'Compression Type': 'Baseline', 'Data Precision': '8 bits', 'X-TIKA:Parsed-By-Full-Set': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.image.JpegParser'], 'Number of Components': '3', 'tiff:ImageLength': '623', 'resourceName': "b'Jean-Baptiste_Perronneau_-_Magdaleine_Pinceloup_de_la_Grange,_n\\xc3\\xa9e_de_Parseval.jpg'", 'Component 2': 'Cb component: Quantization table 1, Sampling factors 1 horiz/1 vert', 'Component 1': 'Y component: Quantization table 0, Sampling factors 2 horiz/2 vert', 'Image Height': '623 pixels', 'Image Width': '500 pixels', 'File Size': '72495 bytes', 'Component 3': 'Cr component: Quantization table 1, Sampling factors 1 horiz/1 vert', 'X-TIKA:Parsed-By': ['org.apache.tika.parser.DefaultParser', 'org.apache.tika.parser.image.JpegParser'], 'X-TIKA:parse_ti

In [5]:
from tika import parser

def parse_document(filepath):
    try:
        parsed_document = parser.from_file(filepath)
        
        # Check if parsing was successful
        if 'status' in parsed_document and parsed_document['status'] == 200:
            print(f"Parsing successful: {parsed_document['status']}")
        else:
            print(f"Parsing failed with status: {parsed_document.get('status')}")
            return
        
        # Extract and print list of keys in the parsed document
        document_keys = list(parsed_document.keys())
        print("Keys in the parsed document:")
        print(document_keys)

        # Extract metadata keys if available
        metadata = parsed_document.get('metadata', {})
        if metadata:
            metadata_keys = list(metadata.keys())
            print("Keys in metadata:")
            print(metadata_keys)
        else:
            print("No metadata found.")

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
parse_document("upload/Jean-Baptiste_Perronneau_-_Magdaleine_Pinceloup_de_la_Grange,_née_de_Parseval.jpg")


Parsing successful: 200
Keys in the parsed document:
['metadata', 'content', 'status']
Keys in metadata:
['Number of Tables', 'File Modified Date', 'Compression Type', 'Data Precision', 'X-TIKA:Parsed-By-Full-Set', 'Number of Components', 'tiff:ImageLength', 'resourceName', 'Component 2', 'Component 1', 'Image Height', 'Image Width', 'File Size', 'Component 3', 'X-TIKA:Parsed-By', 'X-TIKA:parse_time_millis', 'X-TIKA:embedded_depth', 'File Name', 'Content-Length', 'tiff:BitsPerSample', 'tiff:ImageWidth', 'Content-Type']


In [10]:
import os
import json
from tika import parser
from tempfile import gettempdir

def parse_documents_in_directory(directory_path):
    temp_dir = "tmp" # Get the system's temporary directory
    print(f"Saving metadata files in temporary directory: {temp_dir}")

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        
        # Check if it's a file and not a directory
        if os.path.isfile(file_path):
            try:
                parsed_document = parser.from_file(file_path)
                
                # Check if parsing was successful
                if 'status' in parsed_document and parsed_document['status'] == 200:
                    metadata = parsed_document.get('metadata', {})
                    
                    # Save metadata as JSON
                    if metadata:
                        json_filename = f"{os.path.splitext(filename)[0]}.json"
                        json_path = os.path.join(temp_dir, json_filename)

                        with open(json_path, 'w') as json_file:
                            json.dump(metadata, json_file, indent=4)
                        
                        print(f"Metadata saved for '{filename}' as '{json_filename}'")
                    else:
                        print(f"No metadata found for '{filename}'")
                else:
                    print(f"Parsing failed for '{filename}' with status: {parsed_document.get('status')}")
            
            except Exception as e:
                print(f"An error occurred with file '{filename}': {e}")

# Example usage
parse_documents_in_directory("upload")


Saving metadata files in temporary directory: tmp
Metadata saved for 'Diabetic-retinopathy-identification-using-parallel-convo_2023_Expert-Systems.pdf' as 'Diabetic-retinopathy-identification-using-parallel-convo_2023_Expert-Systems.json'
Metadata saved for 'Munafò et al. - 2022 - The reproducibility debate is an opportunity, not .pdf' as 'Munafò et al. - 2022 - The reproducibility debate is an opportunity, not .json'
Metadata saved for 'assets.json' as 'assets.json'
Metadata saved for 'hello_world.py' as 'hello_world.json'
Metadata saved for 'bitcoin.pdf' as 'bitcoin.json'
Metadata saved for 'assets.csv' as 'assets.json'
Metadata saved for 'COVID19-MLSF--A-multi-task-learning-based-stock-market_2023_Expert-Systems-w.pdf' as 'COVID19-MLSF--A-multi-task-learning-based-stock-market_2023_Expert-Systems-w.json'
Metadata saved for 'Deep-learning-in-insurance--Accuracy-and-model-int_2023_Expert-Systems-with-.pdf' as 'Deep-learning-in-insurance--Accuracy-and-model-int_2023_Expert-Systems-with