# Date Extraction

## Prerequisites

Install the dependencies listed in the requirements file: `pip install -r ./requirements.txt`

## Usage

```python
date_extractor = DateExtractor()
results = date_extractor.get_dates('./data/output2/')
print(results)
```

## Model

In [1]:
import cv2
import pytesseract
import os
import re
from dateutil.parser import parse, ParserError
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import warnings
import logging

# Ignore warnings whose message starts with the transformers notice about
# the model-agnostic default `max_length`
warnings.filterwarnings('ignore', message='Using the model-agnostic default `max_length`')

# Configure root logging: INFO and above, formatted as "timestamp - level - message"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class DateExtractor:
    """Extract a date from each image in a folder.

    Every image is first read with Tesseract (via ``pytesseract``). When no
    date is found in that text, the TrOCR model is run on the original image
    as a fallback. Dates are returned as ``MM-DD-YYYY`` strings.
    """

    # Lower-case file extensions that ``get_dates`` treats as images.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    # Compiled once at class creation. Patterns are tried in this order and
    # the first match that dateutil can parse wins.
    _DATE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
            r'\b\d{1,2}-\d{1,2}-\d{2,4}\b',
            r'\b\d{2,4}-\d{1,2}-\d{1,2}\b',
            r'\b\d{1,2} \d{1,2} \d{4}\b',
            r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
            r'\b\d{1,2}(st|nd|rd|th)? (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\b',
        )
    )

    def __init__(self, model_path=None):
        """Load the TrOCR model and processor.

        Args:
            model_path: Optional directory holding a saved model/processor.
                When omitted (or not an existing directory) the pretrained
                ``microsoft/trocr-base-handwritten`` checkpoint is used.
        """
        self.model, self.processor = self.load_model(model_path)

    @staticmethod
    def load_model(model_path=None):
        """Return ``(model, processor)`` for TrOCR.

        Args:
            model_path: Optional directory to load from. Falls back to the
                ``microsoft/trocr-base-handwritten`` checkpoint when it is
                falsy or not an existing directory.

        Returns:
            A ``(VisionEncoderDecoderModel, TrOCRProcessor)`` tuple.
        """
        if model_path and os.path.isdir(model_path):
            source = model_path
        else:
            source = "microsoft/trocr-base-handwritten"
        model = VisionEncoderDecoderModel.from_pretrained(source)
        processor = TrOCRProcessor.from_pretrained(source)
        return model, processor

    @staticmethod
    def preprocess_image(image_path):
        """Read an image and prepare it for Tesseract.

        The image is converted to grayscale, binarised and upscaled 2x.

        Args:
            image_path: Path of the image file.

        Returns:
            The processed image array, or ``None`` when OpenCV cannot read
            the file.
        """
        img = cv2.imread(image_path)
        if img is None:
            return None
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # With THRESH_OTSU the threshold is chosen automatically by OpenCV;
        # the 150 passed here is ignored.
        _, img = cv2.threshold(img, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)
        return img

    @classmethod
    def _search_date(cls, text):
        """Return the first pattern match in *text* that parses as a date."""
        for pattern in cls._DATE_PATTERNS:
            match = pattern.search(text)
            if not match:
                continue
            try:
                parsed_date = parse(match.group(), fuzzy=True)
            except (ParserError, OverflowError):
                # The text looked like a date but is not a valid one
                # (e.g. month 13); try the next pattern.
                continue
            return parsed_date.strftime('%m-%d-%Y')
        return None

    @staticmethod
    def find_date_in_text(text):
        """Find the first recognisable date in *text*.

        Ambiguous numeric dates are interpreted by ``dateutil.parser.parse``
        with its default settings (month before day).

        Args:
            text: OCR output to search. ``None`` or an empty string yields
                ``None``.

        Returns:
            The date formatted as ``MM-DD-YYYY``, or ``None`` when no pattern
            matches or no match can be parsed.
        """
        if not text:
            return None
        return DateExtractor._search_date(text)

    def recognize_system_date_from_images(self, preprocessed_image):
        """Run Tesseract on a preprocessed image and extract a date.

        Args:
            preprocessed_image: Image as returned by ``preprocess_image``;
                may be ``None``.

        Returns:
            The date as ``MM-DD-YYYY``, or ``None`` when the image is
            ``None`` or no date is found.
        """
        if preprocessed_image is None:
            return None
        extracted_system_text = pytesseract.image_to_string(
            preprocessed_image, config='--oem 3 --psm 6'
        )
        return self.find_date_in_text(extracted_system_text)

    def recognize_handwritten_date_from_images(self, image_path):
        """Run the TrOCR model on the original image and extract a date.

        Args:
            image_path: Path of the image file.

        Returns:
            The date as ``MM-DD-YYYY``, or ``None`` when no date is found.
        """
        # Close the underlying file as soon as the RGB copy exists instead of
        # leaving the handle open until garbage collection.
        with Image.open(image_path) as source_image:
            rgb_image = source_image.convert("RGB")
        pixel_values = self.processor(images=rgb_image, return_tensors="pt").pixel_values
        generated_ids = self.model.generate(pixel_values)
        extracted_handwritten_text = self.processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]
        return self.find_date_in_text(extracted_handwritten_text)

    def get_dates(self, folder_path):
        """Extract a date from every image in *folder_path*.

        Files are visited in sorted name order. Files whose extension is not
        ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive) are ignored.
        Images that cannot be read or that raise during processing are
        logged and skipped; they do not appear in the result.

        Args:
            folder_path: Directory containing the images.

        Returns:
            A dict with two parallel lists: ``"file_names"`` and
            ``"predictions"`` (``MM-DD-YYYY`` string, or ``None`` when no
            date was found in that image).
        """
        predictions = []
        file_names = []

        # os.listdir returns entries in arbitrary order; sort so that results
        # are reproducible across runs and platforms.
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.lower().endswith(self._IMAGE_EXTENSIONS):
                continue

            full_file_path = os.path.join(folder_path, file_name)
            try:
                preprocessed_image = self.preprocess_image(full_file_path)
                if preprocessed_image is None:
                    logging.warning("Could not read image: %s", full_file_path)
                    continue

                # Tesseract result first; TrOCR only when it found nothing.
                predicted_date = (
                    self.recognize_system_date_from_images(preprocessed_image)
                    or self.recognize_handwritten_date_from_images(full_file_path)
                )
            except Exception:
                # Best effort: one bad file must not abort the whole batch,
                # but the failure must be visible in the log.
                logging.exception("Error processing %s", full_file_path)
                continue

            predictions.append(predicted_date)
            file_names.append(file_name)
            logging.info(f"File: {full_file_path} -> Extracted Date: {predicted_date}")

        return {
            "file_names": file_names,
            "predictions": predictions
        }

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# No model_path given, so the pretrained "microsoft/trocr-base-handwritten" checkpoint is loaded
date_extractor = DateExtractor()

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Extract a date from every image in the folder and print the collected results
results = date_extractor.get_dates('./data/output2/')
print(results)

2024-05-25 12:15:55,786 - INFO - File: ./data/output2/103box_1.jpg -> Extracted Date: 05-31-2023
2024-05-25 12:15:55,922 - INFO - File: ./data/output2/107box_1.jpg -> Extracted Date: 06-01-2023
2024-05-25 12:15:56,049 - INFO - File: ./data/output2/107box_2.jpg -> Extracted Date: 06-01-2023
2024-05-25 12:15:58,190 - INFO - File: ./data/output2/107box_3.jpg -> Extracted Date: None
2024-05-25 12:15:58,351 - INFO - File: ./data/output2/114box_1.jpg -> Extracted Date: 06-06-2023
2024-05-25 12:15:58,468 - INFO - File: ./data/output2/114box_2.jpg -> Extracted Date: 06-06-2023
2024-05-25 12:15:58,601 - INFO - File: ./data/output2/117box_1.jpg -> Extracted Date: 08-24-2023
2024-05-25 12:15:58,737 - INFO - File: ./data/output2/117box_2.jpg -> Extracted Date: 08-24-2023
2024-05-25 12:15:58,862 - INFO - File: ./data/output2/117box_3.jpg -> Extracted Date: 08-24-2023
2024-05-25 12:16:04,010 - INFO - File: ./data/output2/122box_1.jpg -> Extracted Date: None
2024-05-25 12:16:04,207 - INFO - File: ./d

{'file_names': ['103box_1.jpg', '107box_1.jpg', '107box_2.jpg', '107box_3.jpg', '114box_1.jpg', '114box_2.jpg', '117box_1.jpg', '117box_2.jpg', '117box_3.jpg', '122box_1.jpg', '122box_2.jpg', '127box_1.jpg', '127box_2.jpg', '12box_1.jpg', '130box_1.jpg', '130box_2.jpg', '134box_1.jpg', '134box_2.jpg', '137box_1.jpg', '137box_2.jpg', '139box_1.jpg', '139box_2.jpg', '152box_1.jpg', '152box_2.jpg', '156box_1.jpg', '158box_1.jpg', '158box_2.jpg', '15box_1.jpg', '15box_2.jpg', '15box_3.jpg', '169box_1.jpg', '169box_2.jpg', '169box_3.jpg', '16box_1.jpg', '16box_2.jpg', '173box_1.jpg', '183box_1.jpg', '189box_1.jpg', '193box_1.jpg', '197box_1.jpg', '202box_1.jpg', '213box_1.jpg', '213box_2.jpg', '218box_1.jpg', '218box_2.jpg', '21box_1.jpg', '21box_2.jpg', '21box_3.jpg', '223box_1.jpg', '236box_1.jpg', '236box_2.jpg', '239box_1.jpg', '239box_2.jpg', '242box_1.jpg', '242box_2.jpg', '247box_1.jpg', '250box_1.jpg', '250box_2.jpg', '250box_3.jpg', '259box_1.jpg', '259box_2.jpg', '25box_1.jpg', '2