In [3]:
import pytesseract
from PIL import Image
import re
from dateutil import parser
import os
import json

In [7]:
def get_date(img_path):
    try:
        img = Image.open(img_path)
        ocr_text = pytesseract.image_to_string(img)

        # Confidence
        confidences = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

        # Filter out non-empty confidences and calculate average
        confidence_values = [float(conf) for conf in confidences['conf'] if conf]
        final_confidence = confidence_values[-1]
        
        date_regexes = [
            r'\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4}\b',  # e.g., 01/31/2024 or 1-1-23
            r'\b\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{2,4}\b',  # e.g., 1 Jan 2024
            r'\b\d{4}-\d{2}-\d{2}\b',  # e.g., 2024-01-31
            r'\b\d{2}/\d{2}/\d{4}\b',  # e.g., 31/01/2024
            r'\b\d{2}-\d{2}-\d{4}\b',  # e.g., 31-01-2024
            r'\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4} \d{1,2}:\d{2} (?:AM|PM)\b',  # e.g., 01/31/2024 12:30 PM
            r'\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\b',  # e.g., 2024-01-31 12:30:45
        ]

        extracted_dates = []
        for regex in date_regexes:
            matches = re.findall(regex, ocr_text)
            for match in matches:
                extracted_dates.append(match)

        parsed_dates = []
        for date_str in extracted_dates:
            try:
                parsed_date = parser.parse(date_str, fuzzy=True)
                parsed_dates.append(parsed_date)
            except ValueError:
                pass

        formatted_dates = [date.strftime("%d %b %Y %H:%M:%S") for date in parsed_dates]
        print(f"Date {img_path} -> Timestamp: {formatted_dates}")

        final_date = formatted_dates[0] if formatted_dates else None
        return [final_date, final_confidence]

    except Exception as e:
        return f"Error: {str(e)}"

In [8]:
# Dictionary to store the results
results = {}
folder_path = "../example_data/output3/timestamps/"

files = os.listdir(folder_path)
for file in files:
    if file.endswith(".jpg") or file.endswith(".jpeg") or file.endswith(".png"):
        image_path = os.path.join(folder_path, file)
        date, confidence = get_date(image_path)
        results[file] = [date, confidence]

# Path to save the JSON file
output_file_path = "../output_json/timestamp_results.json"
with open(output_file_path, "w") as output_file:
    json.dump(results, output_file, indent=4)

print("Extraction completed. Results saved to:", output_file_path)

Date ../example_data/output3/timestamps/1000box_1.jpg -> Timestamp: ['10 Apr 2024 00:00:00', '10 Apr 2024 09:43:00']
Date ../example_data/output3/timestamps/1001box_1.jpg -> Timestamp: []
Date ../example_data/output3/timestamps/1002box_1.jpg -> Timestamp: []
Date ../example_data/output3/timestamps/1003box_1.jpg -> Timestamp: []
Date ../example_data/output3/timestamps/1004box_1.jpg -> Timestamp: []
Date ../example_data/output3/timestamps/1005box_1.jpg -> Timestamp: ['15 Apr 2024 00:00:00', '15 Apr 2024 14:27:00']
Date ../example_data/output3/timestamps/1006box_1.jpg -> Timestamp: []
Date ../example_data/output3/timestamps/1007box_1.jpg -> Timestamp: []
Date ../example_data/output3/timestamps/1008box_1.jpg -> Timestamp: []
Date ../example_data/output3/timestamps/1009box_1.jpg -> Timestamp: []
Date ../example_data/output3/timestamps/100box_1.jpg -> Timestamp: ['08 Apr 2024 00:00:00', '08 Apr 2024 13:27:00']
Date ../example_data/output3/timestamps/1010box_1.jpg -> Timestamp: []
Date ../exa