# **1. Library Installation**

In [None]:
# System package: the Tesseract OCR engine (pytesseract is used for OCR in the cells below).
!sudo apt-get install tesseract-ocr
# Python packages: OCR wrapper, PDF-to-image conversion, and random name generation.
# NOTE(review): pdf2image presumably also needs the poppler utilities on the system — confirm for your environment.
!pip install pytesseract
!pip install pdf2image
!pip install faker

# **2. Fake ID Generator**

In [None]:
import pytesseract
import re
from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path
from faker import Faker
import random

# Define regex patterns
# Name: an uppercase word, a single uppercase initial followed by a period, then another
# uppercase word, each separated by one whitespace character (e.g. "DOE J. SMITH").
name_pattern = r'\b[A-Z]+\s[A-Z]\.\s[A-Z]+\b'
# Expiry: the literal "EXP:" label, one whitespace character, then a date written as
# two digits / two digits / four digits.
expiry_pattern = r'EXP:\s\d{2}/\d{2}/\d{4}'

def extract_information(image_path):
    """OCR an ID image and pull out the holder name and the expiry date.

    Args:
        image_path: Path of the image file to run through Tesseract.

    Returns:
        A ``(name, expiry_date)`` tuple. ``name`` is the first substring matching
        ``name_pattern`` and ``expiry_date`` is the date portion (without the
        ``EXP:`` label) of the first substring matching ``expiry_pattern``.
        Either element is ``None`` when its pattern is not found.
    """
    # Use a context manager so the image file handle is released once OCR is done.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)

    # Extract name and expiration date
    name_match = re.search(name_pattern, text)
    expiry_match = re.search(expiry_pattern, text)

    name = name_match.group() if name_match else None

    # expiry_pattern accepts any whitespace character after "EXP:" (tab, newline, ...),
    # so remove the label by position and strip instead of replacing the literal
    # "EXP: ", which would leave the label in place for non-space separators.
    expiry_date = expiry_match.group()[len('EXP:'):].strip() if expiry_match else None

    return name, expiry_date

# Convert PDF to Image
# The source PDF and the rendered image must be two different files, and the image
# path needs a file extension so that PIL can infer the output format when saving.
pdf_path = 'path/to/your/file.pdf'
img = convert_from_path(pdf_path)[0]  # Convert the first page to image
image_path = 'path/to/your/file.png'
img.save(image_path)

# Extract current name and expiry date
current_name, current_expiry_date = extract_information(image_path)
print(f"Current Name: {current_name}")
print(f"Current Expiry Date: {current_expiry_date}")

# Initialize Faker
# Shared generator instance used by generate_random_name() below.
# NOTE(review): no seed is set here, so generated names differ between runs.
fake = Faker()

def generate_random_name():
    """Return a random uppercase name shaped like ``"WORD X. WORD"``.

    The outer words are Faker last names and the middle initial is the first
    character of an upper-cased Faker first name.
    """
    leading = fake.last_name().upper()
    initial = fake.first_name().upper()[0]
    trailing = fake.last_name().upper()
    return f"{leading} {initial}. {trailing}"

def generate_random_expiry_date():
    """Return a random date string formatted as ``DD/MM/YYYY``.

    The day is drawn from 1-28 (valid in every month), the month from 1-12 and
    the year from 2024-2035, in that order.
    """
    day = random.randint(1, 28)
    month = random.randint(1, 12)
    year = random.randint(2024, 2035)
    return "{:02}/{:02}/{}".format(day, month, year)

def _text_extent(draw, text, font):
    """Return (width, height): how far `text` reaches right of and below its anchor."""
    if hasattr(draw, 'textbbox'):
        # Measured from an anchor of (0, 0), the right/bottom edges of the bounding
        # box are the distance the glyphs extend from the drawing position.
        _, _, right, bottom = draw.textbbox((0, 0), text, font=font)
        # The name sample ends in padding spaces; take the larger of the box width
        # and the advance length so that padding is always counted.
        return max(right, draw.textlength(text, font=font)), bottom
    # Older Pillow releases only offer textsize.
    return draw.textsize(text, font=font)


def overlay_text_on_image(image_path, name, expiry_date, output_path, font_path,
                          font_size=90, name_position=(100, 1480),
                          expiry_position=(270, 2000), text_color='blue'):
    """Cover the name and expiry fields of an ID image and draw new values over them.

    Args:
        image_path: Path of the template image to read.
        name: Text drawn at ``name_position``.
        expiry_date: Date text; drawn as ``"EXP: <expiry_date>"`` at ``expiry_position``.
        output_path: Path the modified image is saved to.
        font_path: Path of the TrueType font file used for measuring and drawing.
        font_size: Font size passed to ``ImageFont.truetype`` (default 90).
        name_position: ``(x, y)`` top-left corner of the name field
            (adjust to the ID layout).
        expiry_position: ``(x, y)`` top-left corner of the expiry field
            (adjust to the ID layout).
        text_color: Fill colour of the new text.

    Returns:
        None. The result is written to ``output_path``.
    """
    with Image.open(image_path) as img:
        draw = ImageDraw.Draw(img)

        # Load a TrueType font with the requested size
        font = ImageFont.truetype(font_path, size=font_size)

        # Color for the white rectangles that cover the old text
        white = (255, 255, 255)

        # Size the cover rectangles from sample strings. ImageDraw.textsize is not
        # available on recent Pillow versions, so measurement goes through
        # _text_extent, which prefers textbbox/textlength.
        name_width, name_height = _text_extent(draw, 'EXAMPLE NAME                      ', font)
        expiry_width, expiry_height = _text_extent(draw, 'EXP: 12/12/2025', font)

        name_x, name_y = name_position
        expiry_x, expiry_y = expiry_position

        # Draw white rectangles to cover old text (extra pixels of height as margin)
        draw.rectangle(
            [(name_x, name_y), (name_x + name_width, name_y + name_height + 25)],
            fill=white,
        )
        draw.rectangle(
            [(expiry_x, expiry_y), (expiry_x + expiry_width, expiry_y + expiry_height + 15)],
            fill=white,
        )

        # The new text is drawn at the same positions as the covered fields
        draw.text((name_x, name_y), name, fill=text_color, font=font)
        draw.text((expiry_x, expiry_y), f"EXP: {expiry_date}", fill=text_color, font=font)

        img.save(output_path)

def save_image_as_pdf(image_path, pdf_path):
    """Read the image at ``image_path`` and write it to ``pdf_path`` as a PDF.

    The image is converted to RGB mode before it is saved.
    """
    source = Image.open(image_path)
    rgb_copy = source.convert('RGB')
    rgb_copy.save(pdf_path, 'PDF')

# Example usage: draw a fresh random name and expiry date on the template image,
# then save each result as a PNG and as a PDF.
font_path = '/content/Arial.ttf'  # Path to the uploaded font file
for i in range(10):  # Generate 10 examples
    new_name = generate_random_name()
    new_expiry_date = generate_random_expiry_date()
    # Output files are numbered from 1
    output_image_path = f'/content/generated_id_{i+1}.png'
    output_pdf_path = f'/content/generated_id_{i+1}.pdf'
    overlay_text_on_image(image_path, new_name, new_expiry_date, output_image_path, font_path)
    save_image_as_pdf(output_image_path, output_pdf_path)
    print(f'Generated ID saved at: {output_pdf_path}')

# **3. Extraction and Matching**

In [None]:
import pytesseract
from pdf2image import convert_from_path
import re
from datetime import datetime, timedelta
import cv2
import numpy as np  # for handling potential image formats

def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the text of all pages concatenated.

    Each page is converted to grayscale, binarised with Otsu thresholding and
    passed to Tesseract with ``--psm 6``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the OCR output of every page, in page order.
    """
    page_texts = []
    for page in convert_from_path(pdf_path):
        if isinstance(page, np.ndarray):
            # Arrays are treated as OpenCV-style BGR data, as in the original code.
            # NOTE(review): pdf2image is documented to return PIL images, so this
            # branch is presumably never taken — confirm before removing.
            gray = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
        else:
            # PIL images use RGB channel order, so the RGB conversion code is
            # required; BGR2GRAY would swap the red and blue weights. Normalising
            # to RGB first also covers grayscale or RGBA pages.
            rgb = np.array(page.convert('RGB'))
            gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        # Page layout for single block
        page_texts.append(pytesseract.image_to_string(thresh, config='--psm 6'))
    return ''.join(page_texts)

def extract_employee_info(text):
    """Find the holder name and the labelled expiry date in OCR text.

    Args:
        text: Text to search.

    Returns:
        ``(full_name, expiry_date)`` where ``expiry_date`` keeps its ``EXP:``
        label, or ``(None, None)`` unless both fields are present.
    """
    found_name = re.compile(r'\b[A-Z]+\s[A-Z]\.\s[A-Z]+\b').search(text)
    found_expiry = re.compile(r'EXP:\s\d{2}/\d{2}/\d{4}').search(text)

    # Both fields are required; a partial hit is reported as no hit at all.
    if found_name is None or found_expiry is None:
        return None, None

    return found_name.group(0).strip(), found_expiry.group(0).strip()

def julian_to_gregorian(julian_date_str):
    """Convert a ``YYDDD`` string into a labelled ``EXP: DD/MM/YYYY`` string.

    The first two characters are the year within the 2000s; the remaining
    characters are the 1-based day of that year.
    """
    yy, ddd = julian_date_str[:2], julian_date_str[2:]

    # Day 1 is 1 January, so the offset from the start of the year is ddd - 1.
    offset = timedelta(days=int(ddd) - 1)
    gregorian = datetime(int('20' + yy), 1, 1) + offset
    return gregorian.strftime('EXP: %d/%m/%Y')

def main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry):
    """Check whether the ID in a PDF matches the name and expiry from a ticket.

    Prints the OCR text, the extracted fields and the expected ticket values
    along the way.

    Returns:
        ``"Extraction Failed"`` when the name or expiry could not be extracted,
        otherwise ``"Match Found"`` or ``"No Match Found"``.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    print(pdf_text)

    full_name, expiry_date = extract_employee_info(pdf_text)
    print(full_name, expiry_date)

    # Nothing to compare against if either field is missing.
    if not (full_name and expiry_date):
        return "Extraction Failed"

    # Ticket values rebuilt in the same textual form as the extracted fields.
    expected_name = "{} {}. {}".format(ticket_first_name, ticket_father_initial, ticket_last_name)
    expected_expiry = julian_to_gregorian(ticket_julian_expiry)
    print(expected_name, expected_expiry)

    is_match = full_name == expected_name and expiry_date == expected_expiry
    return "Match Found" if is_match else "No Match Found"

# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
# Ticket name parts; main() joins them as "<first> <initial>. <last>" before comparing.
ticket_first_name = "RAYAN"
ticket_father_initial = "N"
ticket_last_name = "BESHAWRI"
# YYDDD expiry: day 221 of 2024, which julian_to_gregorian renders as "EXP: 08/08/2024".
ticket_julian_expiry = "24221"

# main() returns "Match Found", "No Match Found" or "Extraction Failed".
result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 1**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "MAHONEY"
ticket_father_initial = "J"
ticket_last_name = "REID"
# YYDDD expiry: day 109 of 2024, which julian_to_gregorian renders as "EXP: 18/04/2024".
ticket_julian_expiry = "24109"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 2**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "PEREZ"
ticket_father_initial = "J"
ticket_last_name = "ORTEGA"
# YYDDD expiry: day 81 of 2028, which julian_to_gregorian renders as "EXP: 21/03/2028".
ticket_julian_expiry = "28081"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 3**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "VILLEGAS"
ticket_father_initial = "M"
ticket_last_name = "EDWARDS"
# YYDDD expiry: day 126 of 2031, which julian_to_gregorian renders as "EXP: 06/05/2031".
ticket_julian_expiry = "31126"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 4**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "MANN"
ticket_father_initial = "J"
ticket_last_name = "PERRY"
# YYDDD expiry: day 318 of 2034, which julian_to_gregorian renders as "EXP: 14/11/2034".
ticket_julian_expiry = "34318"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 5**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "CALHOUN"
ticket_father_initial = "M"
ticket_last_name = "SMITH"
# YYDDD expiry: day 112 of 2026, which julian_to_gregorian renders as "EXP: 22/04/2026".
ticket_julian_expiry = "26112"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 6**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "NEAL"
ticket_father_initial = "N"
ticket_last_name = "STEWART"
# YYDDD expiry: day 112 of 2033, which julian_to_gregorian renders as "EXP: 22/04/2033".
ticket_julian_expiry = "33112"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 7**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "DELEON"
ticket_father_initial = "R"
ticket_last_name = "MOORE"
# YYDDD expiry: day 141 of 2028, which julian_to_gregorian renders as "EXP: 20/05/2028".
ticket_julian_expiry = "28141"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 8**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "HARPER"
ticket_father_initial = "S"
ticket_last_name = "JOHNSON"
# YYDDD expiry: day 320 of 2029, which julian_to_gregorian renders as "EXP: 16/11/2029".
ticket_julian_expiry = "29320"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 9**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "WHITE"
ticket_father_initial = "A"
ticket_last_name = "GRAVES"
# YYDDD expiry: day 276 of 2026, which julian_to_gregorian renders as "EXP: 03/10/2026".
ticket_julian_expiry = "26276"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)

# **Test 10**

In [None]:
# Update this path with the actual path to your PDF file
pdf_path = "path/to/your/file.pdf"
ticket_first_name = "DAVIS"
ticket_father_initial = "P"
ticket_last_name = "ROSS"
# YYDDD expiry: day 23 of 2027, which julian_to_gregorian renders as "EXP: 23/01/2027".
ticket_julian_expiry = "27023"

result = main(pdf_path, ticket_first_name, ticket_father_initial, ticket_last_name, ticket_julian_expiry)
print(result)