# Image Processing & Optical Character Recognition (OCR):

The goal of this project is to develop a Python script using the OpenCV and Tesseract libraries to extract relevant information from a single image containing four cheques stacked on top of each other. 
- The extracted information includes the bank name, cheque date, and cheque amount for each of the four cheques. 
- The structured data will then be saved to an Excel spreadsheet, and individual cheques will be separated and saved as image files in an output folder, with the bank name used as the cheque name.

## Importing the Libraries:

In [1]:
import cv2                # OpenCV: Library for computer vision tasks like image processing.
import pytesseract        # PyTesseract: Python wrapper for Tesseract OCR engine, used for text extraction from images.
import pandas as pd       # Pandas: Library for data manipulation and analysis, used for structuring the extracted data.
import os                 # OS: Library for interacting with the operating system, used for file handling operations.
from PIL import Image     # PIL: Python Imaging Library, used for opening and manipulating images.
import re                 # Regular Expressions: Library for pattern matching and text processing.
import warnings           # Warnings: Python module for handling warning messages.
warnings.filterwarnings('ignore')  # Ignore warnings to prevent cluttering the output.

## Process & Save Each Individual Cheque:

In [2]:
# Function to process each individual cheque and save it to the Output folder
def process_and_save_individual_cheques(image_path, output_folder, num_cheques=4):
    """Split a stacked-cheque image into horizontal strips and save each strip.

    The cheques are assumed to be evenly spaced vertically, so the image is
    cut into ``num_cheques`` strips of equal height. Each strip is written to
    ``output_folder`` as ``Cheque_<n>.png`` (numbered from 1, top to bottom).

    Args:
        image_path: Path of the image containing the stacked cheques.
        output_folder: Existing directory that receives the cropped images.
        num_cheques: Number of cheques stacked in the image (default 4).

    Raises:
        ValueError: If ``num_cheques`` is smaller than 1.
        FileNotFoundError: If the input image cannot be read.
        OSError: If a cropped cheque cannot be written.
    """
    if num_cheques < 1:
        raise ValueError("num_cheques must be at least 1")

    image = cv2.imread(image_path)
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so fail here with a clear message.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    total_height = image.shape[0]
    height_per_cheque = total_height // num_cheques

    for i in range(num_cheques):
        start_y = i * height_per_cheque
        # The last strip runs to the bottom edge so that rows left over by
        # the integer division are not silently dropped.
        if i == num_cheques - 1:
            end_y = total_height
        else:
            end_y = start_y + height_per_cheque

        cheque_image = image[start_y:end_y, :]

        output_file = os.path.join(output_folder, f"Cheque_{i + 1}.png")
        # cv2.imwrite returns False when the file could not be written
        if not cv2.imwrite(output_file, cheque_image):
            raise OSError(f"Could not write image: {output_file}")

In [3]:
# Replace the image_path variable with the path to your input image containing the cheques
image_path = "cheque-sample.jpg"

# Set the output folder where individual cheques will be saved
output_folder = "Output"

# Create the output folder if it is missing; exist_ok avoids the
# check-then-create race of testing for the folder first
os.makedirs(output_folder, exist_ok=True)

# Call the function to process and save individual cheques from the input image
process_and_save_individual_cheques(image_path, output_folder)

## Creating the Excel File:
### Name of the Bank:

In [4]:
# Function to preprocess the image (optional but may improve OCR accuracy)
def preprocess_image(image_path):
    """Load an image with PIL and return a grayscale ("L" mode) copy.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A new PIL image in grayscale mode.
    """
    # Image.open is lazy and holds on to the underlying file. The context
    # manager releases it as soon as the grayscale copy exists, so the file
    # can safely be renamed afterwards.
    with Image.open(image_path) as cheque_image:
        return cheque_image.convert("L")

# Function to extract text using Tesseract OCR
def extract_text_from_image(image):
    """Run Tesseract OCR on ``image`` and return the recognised text."""
    return pytesseract.image_to_string(image)

# Define a function to extract the bank name from the text
def extract_bank_name(text):
    """Return the word that precedes "BANK" in ``text``, upper-cased.

    The match is case-insensitive, so "Axis Bank" and "AXIS BANK" both
    yield "AXIS".

    Args:
        text: OCR text of a cheque.

    Returns:
        The upper-cased word before "BANK", or the string
        "Bank name not found" when no such word exists.
    """
    # Pass the flag as an argument: an inline "(?i)" that is not at the very
    # start of the pattern is rejected by Python 3.11 and later.
    match = re.search(r"(\w+)\s+BANK\b", text, flags=re.IGNORECASE)

    if match:
        return match.group(1).upper()

    return "Bank name not found"

# Directory path where the cheque images are stored
output_folder_path = "Output"

# os.listdir returns entries in arbitrary order; sort them so the rows of
# the resulting DataFrame are reproducible between runs
cheque_files = sorted(os.listdir(output_folder_path))

# Initialize an empty list to store the extracted bank names
extracted_bank_names_list = []

# Loop through each file in the "Output" folder
for file in cheque_files:
    # Skip anything that is not an image
    if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    # Construct the full path to the image
    image_path = os.path.join(output_folder_path, file)

    # Preprocess the image and run OCR on it
    preprocessed_image = preprocess_image(image_path)
    extracted_text = extract_text_from_image(preprocessed_image)

    # Extract the bank name from the text and record it
    bank_name = extract_bank_name(extracted_text) + " BANK"
    extracted_bank_names_list.append(bank_name)

    # Keep the file's real extension: os.rename only changes the name and
    # does not re-encode the image, so a PNG must not be relabelled ".jpg"
    extension = os.path.splitext(file)[1]
    new_file_path = os.path.join(output_folder_path, f"{bank_name}{extension}")

    # Rename the file after its bank
    os.rename(image_path, new_file_path)

# Create a DataFrame from the list of extracted bank names
df = pd.DataFrame({'Bank Name': extracted_bank_names_list})
df

Unnamed: 0,Bank Name
0,SYNDICATE BANK
1,ICICI BANK
2,CANARA BANK
3,AXIS BANK


### Date of the Cheque:

In [5]:
# Function to preprocess the image
def preprocess_image(image_path):
    """Load an image with OpenCV and return an inverted, binarized version.

    The image is converted to grayscale, smoothed with a 5x5 Gaussian blur
    and then thresholded with Otsu's method (inverted binary output).

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image as returned by cv2.threshold.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None for a missing or unreadable file; without this
    # check the failure surfaces later as an opaque cvtColor error.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply Gaussian blur to reduce noise before thresholding
    blur = cv2.GaussianBlur(gray, (5, 5), 0)

    # With THRESH_OTSU the cut-off is computed by Otsu's method, so the 180
    # passed here is not used as the actual threshold value.
    thresh = cv2.threshold(blur, 180, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    return thresh

# Function to extract text using Tesseract OCR
def extract_text_from_image(image_path):
    """Run Tesseract OCR with ``--oem 1 --psm 3`` and return the text.

    Despite its name, ``image_path`` is handed straight to
    ``pytesseract.image_to_string``; callers in this notebook pass both a
    preprocessed image array and a file path.
    """
    return pytesseract.image_to_string(image_path, config=r'--oem 1 --psm 3')

# This function uses Tesseract OCR to extract text from the provided
# image using a custom configuration. The custom configuration
# (custom_config) is set to --oem 1 --psm 3, which selects OCR Engine
# Mode 1 (the LSTM neural-network engine) and Page Segmentation Mode 3
# (fully automatic page segmentation, without orientation/script detection).

# Function to parse the extracted text and extract the date using regular expressions
def extract_date_from_text(extracted_text):
    """Return the first date-like substring found in OCR text.

    The patterns are tried in the order listed, and the first pattern that
    matches anywhere in the text wins, so the order of the list matters.

    Args:
        extracted_text: Text produced by OCR; may be empty or None.

    Returns:
        The matched date substring exactly as it appears in the text, or
        None when no pattern matches.
    """
    if not extracted_text:
        return None

    # Regular expression patterns for different date formats
    date_patterns = [
        r"\b\d{1,2}-\d{1,2}-\d{2,4}\b",        # dd-mm-yyyy
        r"\b\d{1,2}/\d{1,2}/\d{2,4}\b",        # dd/mm/yyyy
        r"\b\d{4}-\d{2}-\d{2}\b",              # yyyy-mm-dd
        r"\b\d{1,2}-\w{3}-\d{2,4}\b",          # dd-MMM-yyyy (e.g., 05-Sep-2019)
        r"\b\d{1,2}[.-]\w+[.-]\d{2,4}\b",        # dd.month.yyyy or dd-month-yyyy
        r"\b\d{1,2}-\w+-\d{2,4}\b",            # dd-MMM-yyyy (e.g., 5-Sep-2019)
        r"\b\d{1,2}[.-]\d{1,2}[.-]\d{2,4}\b",  # dd.mm.yyyy or dd-mm-yyyy (e.g., 06.09.2019 or 06-09-2019)
        # The hyphen sits last in the class so it is a literal; written as
        # "[.-/]" it forms a range that covers only "." and "/".
        r"\b\d{1,2}[./-]\w+[./-]\d{2,4}\b",      # dd-MMM-yyyy or dd/Sep/2019 (e.g., 5-Sep-2019 or 5/Sep/2019)
        r"\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4}\b",  # dd.mm.yyyy or dd-mm-yyyy or dd/mm/yyyy (e.g., 5.09.2019 or 5-09-2019 or 5/09/2019)
        r"\b\d{1,2}\s+\w+\s+\d{2,4}\b",        # dd month yyyy (e.g., 31 December 2023)
        # Add more date patterns as needed for your specific case
    ]

    # Return the match of the first pattern that finds anything
    for pattern in date_patterns:
        match = re.search(pattern, extracted_text)
        if match:
            return match.group(0)

    return None

def convert_date_to_ddmmyyyy(date_str):
    """Normalise a date string to "dd-mm-yyyy" form.

    The input is split on spaces, hyphens, slashes and dots. A month given
    by name is mapped to its number using its first three letters; day and
    month are zero-padded to two digits. Input that starts with a four-digit
    year (yyyy-mm-dd) is reordered so the day comes first. The year is
    emitted as given, so a two-digit year stays two digits.

    Args:
        date_str: Date text such as "5-Sep-2019", "06.09.2019" or
            "31 December 2023"; may be None or empty.

    Returns:
        The reformatted date, or None when ``date_str`` is None, empty, or
        has fewer than three components. An unrecognised month name yields
        "00" in the month position.
    """
    # Nothing to convert when no date was found upstream
    if not date_str:
        return None

    # Map month-name prefixes to their numeric representation
    month_names = {
        'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
        'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
        'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'
    }

    # Turn every separator into a space in one pass, then split
    parts = date_str.lower().translate(str.maketrans('-/.', '   ')).split()
    if len(parts) < 3:
        return None

    day, month, year = parts[:3]

    # Year-first input (yyyy-mm-dd): swap so the day leads
    if len(day) == 4 and day.isdigit():
        day, year = year, day

    # Convert month name to numeric representation if necessary
    if month.isalpha():
        month = month_names.get(month[:3], '')

    return f"{day.zfill(2)}-{month.zfill(2)}-{year}"


# Function to process each cheque image and extract the date
def process_cheque_image(image_path):
    """Extract the cheque date from an image file.

    Args:
        image_path: Path of the cheque image.

    Returns:
        The date reformatted by convert_date_to_ddmmyyyy, or None when no
        date pattern is found in the OCR text.
    """
    # Preprocess the image, then run OCR on the result
    preprocessed_image = preprocess_image(image_path)
    extracted_text = extract_text_from_image(preprocessed_image)

    # Find the first date-like substring via the regex patterns
    date = extract_date_from_text(extracted_text)

    # No date on this cheque: report None instead of passing it on to the
    # converter, which expects a string
    if date is None:
        return None

    # Convert the extracted date to "dd-mm-yyyy" format
    return convert_date_to_ddmmyyyy(date)

# Folder that holds the individual cheque images
output_folder_path = "Output"

# Every entry currently present in that folder
cheque_files = os.listdir(output_folder_path)

# Collects one (bank name, date) tuple per cheque image
extracted_data_list = []

for file in cheque_files:
    # Anything that is not an image is skipped
    if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    # The file name without its extension is the bank name
    bank_name = os.path.splitext(file)[0].upper()

    # Run the date extraction on the image at its full path
    image_path = os.path.join(output_folder_path, file)
    extracted_date = process_cheque_image(image_path)

    extracted_data_list.append((bank_name, extracted_date))

# Tabulate the collected bank names and dates
df = pd.DataFrame(extracted_data_list, columns=['Bank Name', 'Date'])
df

Unnamed: 0,Bank Name,Date
0,AXIS BANK,06-09-2019
1,CANARA BANK,03-09-2019
2,ICICI BANK,05-09-2019
3,SYNDICATE BANK,30-01-2019


### Amount of the Cheque:

In [6]:
# Function to extract the amount value from the text
def extract_amount_from_text(text):
    """Return the first comma-grouped amount with two decimals in ``text``.

    Matches amounts such as "5,407.50", "1,234,567.89" and the Indian
    grouping "1,30,354.70". The amount is returned as it appears in the
    text, with its separators.

    Args:
        text: OCR text of a cheque; may be empty or None.

    Returns:
        The matched amount string, or None when nothing matches.
    """
    if not text:
        return None

    # One leading group of 1-3 digits, any number of further 2-3 digit
    # groups, then a final 3-digit group and two decimals. Allowing several
    # groups stops amounts above 999,999.99 from being cut down to their
    # last two groups.
    pattern = r"\b\d{1,3}(?:,\d{2,3})*,\d{3}\.\d{2}\b"

    match = re.search(pattern, text)
    return match.group(0) if match else None

# Function to process each cheque image and extract the amount
def process_cheque_image(image_path):
    """Run OCR on the image at ``image_path`` and return its amount.

    The result is whatever extract_amount_from_text finds in the OCR text.
    """
    return extract_amount_from_text(extract_text_from_image(image_path))

# Collects one (bank name, amount) tuple per cheque image
extracted_data_list = []

for file in cheque_files:
    # Anything that is not an image is skipped
    if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    # The file name without its extension is the bank name
    bank_name = os.path.splitext(file)[0].upper()

    # Run the amount extraction on the image at its full path
    image_path = os.path.join(output_folder_path, file)
    extracted_amount = process_cheque_image(image_path)

    extracted_data_list.append((bank_name, extracted_amount))

# Tabulate the collected bank names and amounts
df1 = pd.DataFrame(extracted_data_list, columns=['Bank Name', 'Amount'])

# Strip the thousands separators so the amounts can be stored as floats
df1['Amount'] = df1['Amount'].str.replace(',', '').astype(float)
df1

Unnamed: 0,Bank Name,Amount
0,AXIS BANK,130354.7
1,CANARA BANK,126888.0
2,ICICI BANK,56476.0
3,SYNDICATE BANK,5407.5


### Finally, the Excel File:

In [7]:
# Join amounts and dates on the bank name, then give the columns their
# final names
merged_df = pd.merge(df1, df, on='Bank Name').rename(
    columns={'Date': 'Cheque Date', 'Amount': 'Cheque Amount'}
)

# Put the columns in the order required for the spreadsheet
merged_df = merged_df[['Bank Name', 'Cheque Date', 'Cheque Amount']]

# Write the result next to the cheque images
output_folder_path = "Output"
output_file_path = os.path.join(output_folder_path, "Output File.xlsx")
merged_df.to_excel(output_file_path, index=False)

print("Data exported to:", output_file_path)

Data exported to: Output\Output File.xlsx
