In [43]:
import pytesseract
from PIL import Image
import re
from dateutil import parser
import cv2
import os
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, decode_predictions

In [56]:
def preprocess_image(img):
    """Convert an image to a 224x224 RGB array scaled by 1/255.

    Args:
        img: Either a file path (``str``) that ``cv2.imread`` can read, or an
            array-like image (NumPy array, PIL image, ...). Array-like inputs
            are assumed to already be in grayscale / RGB / RGBA channel order.

    Returns:
        numpy.ndarray: Floating-point array. For grayscale, RGB or RGBA input
        the shape is (224, 224, 3); for 8-bit input the values lie in [0, 1].

    Raises:
        ValueError: If ``img`` is a path and OpenCV cannot read it.
    """
    if isinstance(img, str):
        img = cv2.imread(img)  # Read image if it's a file path
        if img is None:
            raise ValueError("Unable to read image from the provided path.")
        # cv2.imread returns pixels in BGR order; everything downstream
        # (and the array-like input path) works in RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    img = np.asarray(img)  # Convert image to numpy array

    # Normalise the channel layout to exactly three RGB channels.
    if img.ndim == 2 or img.shape[2] == 1:
        # Grayscale, either (H, W) or (H, W, 1)
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.shape[2] == 4:
        # RGBA (e.g. PNG opened with PIL): drop the alpha channel
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)

    # Resize image to match MobileNetV2 input size
    img = cv2.resize(img, (224, 224))

    # Scale pixel values by 1/255 (yields [0, 1] for 8-bit images)
    img = img / 255.0

    return img

In [57]:
# Regular expressions for the date formats searched for in recognised text.
# Order matters to callers that keep the last match found.
date_regex_pattern = [
    # Numeric with '.', '/' or '-' separators, e.g. 01/31/2024, 31.01.24
    r'\d{1,2}[./-]\d{1,2}[./-]\d{2,4}',
    # Dash-separated numeric bounded by word breaks, e.g. 01-31-2024 or 31-01-2024
    r'\b(\d{1,2}[-]\d{1,2}[-]\d{2,4})\b',
    # Day, month name, year, e.g. 16 Jun 2023 or 16 June 2023
    r'\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{2,4}',
    # Month name, day, comma, year, e.g. Jun 16, 2023.
    # (Previously this pattern demanded a leading day number and so could
    # never match the "Jun 16, 2023" format it was written for.)
    r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}, \d{2,4}',
]

In [58]:
# Lazily created MobileNetV2 instance shared by every call. Building the model
# and loading the ImageNet weights is expensive, so it is done once rather
# than once per image.
_MOBILENET_MODEL = None


def detect_date_using_ml(img):
    """Classify an image with MobileNetV2 and scan the top label for dates.

    NOTE(review): MobileNetV2 with ImageNet weights is an object classifier;
    ``decode_predictions`` yields class names, not text read from the image,
    so the date patterns are unlikely to ever match. This fallback probably
    needs a real OCR/text-recognition model - confirm intent.

    Args:
        img: Image as an array or PIL image accepted by
            ``keras.preprocessing.image.img_to_array``. Pixel values may be
            in [0, 255] or already scaled to [0, 1].

    Returns:
        Whatever ``process_recognized_text`` returns for the decoded
        predictions (a list of matched date strings in this notebook).
    """
    global _MOBILENET_MODEL
    if _MOBILENET_MODEL is None:
        _MOBILENET_MODEL = MobileNetV2(weights='imagenet')  # Load pre-trained model once

    pixels = np.asarray(image.img_to_array(img), dtype="float32")

    # preprocess_input maps 0-255 pixels to [-1, 1]. Input that was already
    # divided by 255 would be squashed into a tiny range around -1, so bring
    # it back to 0-255 first.
    if pixels.size and pixels.min() >= 0.0 and pixels.max() <= 1.0:
        pixels = pixels * 255.0

    pixels = preprocess_input(pixels)
    pixels = tf.image.resize(pixels, (224, 224))  # Resize to MobileNetV2 input size
    batch = tf.expand_dims(pixels, axis=0)  # Add batch dimension

    predictions = _MOBILENET_MODEL.predict(batch)

    # Turn raw scores into (class_id, class_name, score) tuples, then look for
    # date-like substrings in the best class name.
    recognized_text = decode_predictions(predictions)
    extracted_date = process_recognized_text(recognized_text)

    return extracted_date

def process_recognized_text(recognized_text):
    """Scan the top-ranked prediction label for date strings.

    Args:
        recognized_text: Decoded predictions, indexed as
            ``recognized_text[sample][rank]`` with each entry being a
            ``(class, description, probability)`` tuple. May be empty.

    Returns:
        The result of ``extract_date_from_text`` applied to the description
        of the first sample's best prediction, or to ``""`` when
        ``recognized_text`` is empty.
    """
    if recognized_text:
        # First sample -> highest-ranked entry -> its description field
        top_label = recognized_text[0][0][1]
    else:
        top_label = ""

    return extract_date_from_text(top_label)

def extract_date_from_text(text, patterns=None):
    """Find date-like substrings in ``text``.

    Args:
        text: String to search. ``None`` or an empty string yields ``[]``.
        patterns: Optional iterable of regular expressions to try, in order.
            Defaults to the module-level ``date_regex_pattern`` list.

    Returns:
        list: Unique matches in first-seen order (pattern order, then position
        within the text). Several patterns can match the same substring
        (e.g. ``01-31-2024``), so duplicates are removed.
    """
    if not text:
        return []
    if patterns is None:
        patterns = date_regex_pattern

    seen = set()
    extracted_dates = []
    for pattern in patterns:
        for match in re.findall(pattern, text):
            if match not in seen:
                seen.add(match)
                extracted_dates.append(match)

    return extracted_dates

In [59]:
def _extract_time_of_day(text):
    """Return the first ``H:MM`` time in ``text`` (with AM/PM if present), else ``""``."""
    # Hour, minute and the optional AM/PM marker must be adjacent so that
    # unrelated digits elsewhere in the text (e.g. the day of the month) are
    # not mistaken for the hour.
    match = re.search(r'\b(\d{1,2}):(\d{2})(?:\s*(AM|PM)\b)?', text)
    if not match:
        return ""
    hour, minute, ampm = match.groups()
    time_str = f"{hour}:{minute}"
    return f"{time_str} {ampm}" if ampm else time_str


def get_date(img):
    """Extract a date (and time of day, if any) from an image via OCR.

    The OCR text is searched with every pattern in ``date_regex_pattern``;
    candidates that ``dateutil`` can parse are kept and the last one wins.
    If OCR yields no parseable date, the image is passed through
    ``preprocess_image`` and ``detect_date_using_ml`` as a fallback.

    Args:
        img: Image path, or any image object accepted by
            ``pytesseract.image_to_string``.

    Returns:
        str: ``"DD Mon YYYY"``, followed by ``" H:MM"`` and ``" AM"``/``" PM"``
        when a time is found in the OCR text. If only the fallback produced a
        result, that result is returned unchanged (a list of matched strings
        in this notebook). ``""`` when nothing was found.
    """
    ocr_text = pytesseract.image_to_string(img)

    parsed_dates = []
    for regex in date_regex_pattern:
        for date_str in re.findall(regex, ocr_text):
            try:
                parsed_dates.append(parser.parse(date_str, fuzzy=True))
            except (ValueError, OverflowError):
                # The regex hit is not a real calendar date (e.g. 99/99/9999)
                # or overflows the platform's integer range; ignore it.
                continue

    if parsed_dates:
        # Keep the last parseable candidate, as before.
        final_date = parsed_dates[-1].strftime("%d %b %Y")
        time_of_day = _extract_time_of_day(ocr_text)
        return f"{final_date} {time_of_day}" if time_of_day else final_date

    # OCR produced no usable date: fall back to the ML path. The image is only
    # opened here, where it is needed, and the file handle is closed again.
    if isinstance(img, (str, os.PathLike)):
        with Image.open(img) as pil_img:
            # convert("RGB") flattens palette / RGBA / 1-bit images into the
            # 3-channel layout the model expects.
            img_preprocessed = preprocess_image(pil_img.convert("RGB"))
    else:
        img_preprocessed = preprocess_image(img)

    extracted_date = detect_date_using_ml(img_preprocessed)
    return extracted_date if extracted_date else ""


In [60]:
# Path to the folder containing images
folder_path = "./sample"

# Create or open a text file to write the results
output_file_path = "./output/extracted_dates.txt"

# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(output_file_path) or ".", exist_ok=True)

# Extensions are compared case-insensitively so files such as "IMG_01.JPG"
# are not silently skipped.
image_extensions = (".jpg", ".jpeg", ".png")

# utf-8 keeps non-ASCII file names writable regardless of the platform default.
with open(output_file_path, "w", encoding="utf-8") as output_file:
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    files = sorted(os.listdir(folder_path))

    for file in files:
        if not file.lower().endswith(image_extensions):
            continue
        image_path = os.path.join(folder_path, file)
        date = get_date(image_path)
        if date is None or date == "":
            output_file.write(f"Image: {file}, Date: Image quality low\n")
        else:
            output_file.write(f"Image: {file}, Date: {date}\n")

print("Extraction completed. Results written to:", output_file_path)

Extraction completed. Results written to: ./output/extracted_dates.txt
