In [1]:
import re

def extract_aadhar_details(text):
    """Pull the name, date of birth, gender and Aadhaar number out of OCR text.

    Each field is located with its own regular expression and the first
    match (capture group 1) is reported.

    Args:
        text: OCR output of an Aadhaar card as a single string.

    Returns:
        dict with the keys "Name", "Date of Birth", "Gender" and "Aadhaar".
        A field whose pattern does not match is reported as None.
    """
    # Field label -> pattern. Every pattern exposes the wanted value as group 1.
    field_patterns = {
        # Two or more consecutive Capitalised words.
        "Name": r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)',
        # A DOB label (English or Hindi), any non-digits, then DD/MM/YYYY or DD-MM-YYYY.
        "Date of Birth": r'(?:DOB|Date of Birth|जन्म तिथि)[^\d]*(\d{2}[/-]\d{2}[/-]\d{4})',
        # Upper-case English or Hindi gender word (matching is case-sensitive).
        "Gender": r'(MALE|FEMALE|TRANSGENDER|पुरुष|महिला|ट्रांसजेंडर)',
        # Three groups of four digits, or the fully masked placeholder.
        "Aadhaar": r'(\d{4}\s\d{4}\s\d{4}|XXXX\sXXXX\sXXXX)',
    }

    details = {}
    for field, pattern in field_patterns.items():
        hit = re.search(pattern, text)
        details[field] = hit.group(1) if hit else None
    return details


In [79]:
import easyocr

def extract_text_from_image(image, min_confidence=0.1, reader=None):
    """Run OCR on an image and return the recognised text as one string.

    Args:
        image: Whatever ``reader.readtext`` accepts; the notebook passes the
            array returned by ``preprocess_image``.
        min_confidence: Detections whose confidence is not strictly greater
            than this value are dropped. Defaults to 0.1, the cutoff that was
            previously hard-coded.
        reader: Optional pre-built OCR reader exposing ``readtext(image)``.
            When omitted, a new ``easyocr.Reader(['en', 'hi'])`` is created
            for this call, which matches the previous behaviour. Pass a
            shared reader when processing several images so it is not
            rebuilt every time.

    Returns:
        The kept text fragments joined with single spaces, in the order the
        reader returned them. Empty string if nothing passes the cutoff.
    """
    if reader is None:
        reader = easyocr.Reader(['en', 'hi'])  # Aadhaar uses both
    detections = reader.readtext(image)
    # Each detection is indexed as [1] = recognised text, [2] = confidence.
    kept_fragments = [
        detection[1] for detection in detections if detection[2] > min_confidence
    ]
    return " ".join(kept_fragments)


In [80]:
def preprocess_image(image_path):
    """Load an image as grayscale, upscale it 2x and binarise it.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded image produced by ``cv2.threshold``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns None for ``image_path``.
    """
    import cv2

    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        raise FileNotFoundError(f"Could not load image at path: {image_path}")

    # Double both dimensions with cubic interpolation before thresholding.
    upscaled = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # NOTE(review): with THRESH_OTSU set, OpenCV is documented to pick the
    # threshold itself, so the literal 150 is presumably ignored - confirm.
    binarized = cv2.threshold(
        upscaled, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]
    return binarized


def normalize_digits(text):
    """Return ``text`` with Devanagari digits replaced by ASCII digits 0-9.

    All other characters are left untouched.
    """
    # Code point of each Devanagari digit -> the matching ASCII digit.
    digit_map = {
        ord(hindi): ascii_digit
        for hindi, ascii_digit in zip('०१२३४५६७८९', '0123456789')
    }
    return text.translate(digit_map)


In [81]:
import re

def extract_aadhaar_fields(raw_text):
    """Extract name, DOB, gender and Aadhaar number from raw OCR text.

    Devanagari digits are first converted to ASCII with ``normalize_digits``
    so the numeric patterns can match either script.

    Args:
        raw_text: OCR output of an Aadhaar card as a single string.

    Returns:
        dict with the keys "Name", "DOB", "Gender" and "Aadhaar". A field
        that cannot be found is None. "Gender" is "MALE", "FEMALE" or None.
    """
    text = normalize_digits(raw_text)

    # DOB: first DD/MM/YYYY-shaped token anywhere in the text.
    dob_match = re.search(r'\d{2}/\d{2}/\d{4}', text)
    dob = dob_match.group() if dob_match else None

    # Aadhaar number: three whitespace-separated groups of four digits.
    aadhaar_match = re.search(r'\d{4}\s\d{4}\s\d{4}', text)
    aadhaar = aadhaar_match.group() if aadhaar_match else None

    # Gender: the female markers must be tested first because the string
    # "FEMALE" contains "MALE"; checking "MALE" first labelled every
    # English-only female card as MALE.
    upper_text = text.upper()
    if "FEMALE" in upper_text or "महिला" in text:
        gender = "FEMALE"
    elif "MALE" in upper_text or "पुरुष" in text:
        gender = "MALE"
    else:
        gender = None

    # Name: first run of two or more Capitalised Latin-script words.
    name_match = re.search(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)', text)
    name = name_match.group() if name_match else None

    return {
        "Name": name,
        "DOB": dob,
        "Gender": gender,
        "Aadhaar": aadhaar
    }


# Smoke-test the extractor on a hand-pasted OCR string mixing Hindi and English.
# NOTE(review): this sample looks like it contains a real person's name, date of
# birth and Aadhaar number - consider replacing it with synthetic data before sharing.
raw = "भारत सरकार Coiarit d Iida आधार ऋषभ रंजन Rishav Ranjan जन्म तिथि 008: 16/10/2005 पुरुष / MALE 3016 7523 3916 मेरा आधार , मेरी पहचान"
data = extract_aadhaar_fields(raw)
print(data)

{'Name': 'Rishav Ranjan', 'DOB': '16/10/2005', 'Gender': 'MALE', 'Aadhaar': '3016 7523 3916'}


In [90]:
# End-to-end run on a sample photo: binarise the image, then OCR it.
# The path is relative, so the file must sit in the notebook's working directory.
image_path = "WhatsApp Image 2025-04-18 at 1.36.20 PM.jpeg"
pre_img = preprocess_image(image_path)
text = extract_text_from_image(pre_img)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [92]:
# Parse the OCR text into fields, then show the raw OCR text for inspection.
# NOTE(review): `data` is computed here but only `text` is printed - confirm
# whether displaying `data` was intended as well.
data = extract_aadhaar_fields(text)
print(text)

m्म रिपारी Suslwm Tiwar पुएम/ MALE 2008 9464 5177 मेरा आधार, मेरी पहचान


In [105]:
import re

def normalize_digits(text):
    """Convert any Devanagari (Hindi) digits in ``text`` to ASCII 0-9."""
    # NOTE(review): an identical normalize_digits is already defined in an
    # earlier cell; this redefinition shadows it.
    hindi_to_ascii = str.maketrans('०१२३४५६७८९', '0123456789')
    normalized = text.translate(hindi_to_ascii)
    return normalized

def extract_clean_fields(raw_text):
    """Extract name, DOB, gender and Aadhaar number from OCR text.

    Devanagari digits are converted to ASCII with ``normalize_digits`` before
    any pattern is applied.

    Args:
        raw_text: OCR output of an Aadhaar card as a single string.

    Returns:
        dict with the keys 'Name', 'DOB', 'Gender' and 'Aadhaar'. A field
        that cannot be found is None. 'Aadhaar' is always formatted as
        three space-separated groups of four digits.
    """
    text = normalize_digits(raw_text)

    # Name: first run of two or more Capitalised Latin-script words.
    name_match = re.search(r'\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)\b', text)
    name = name_match.group(1) if name_match else None

    # DOB: first DD/MM/YYYY-shaped token; other separators or orders are not handled.
    dob_match = re.search(r'\d{2}/\d{2}/\d{4}', text)
    dob = dob_match.group() if dob_match else None

    # Gender: 'FEMALE' contains 'MALE', so it must be tested first; with the
    # checks the other way round the FEMALE branch could never be reached.
    # NOTE(review): matching is case-sensitive, so 'Female'/'Male' are not
    # detected - confirm whether that is intended.
    if 'FEMALE' in text:
        gender = 'FEMALE'
    elif 'MALE' in text:
        gender = 'MALE'
    else:
        gender = None

    # Aadhaar number: 12 digits in three groups of four, optionally separated
    # by one whitespace character, '-' or '.'. Re-joining the captured groups
    # gives the canonical 'xxxx xxxx xxxx' form whatever separator (or none)
    # appeared in the input.
    aadhaar_match = re.search(r'\b(\d{4})[\s\-\.]?(\d{4})[\s\-\.]?(\d{4})\b', text)
    aadhaar = ' '.join(aadhaar_match.groups()) if aadhaar_match else None

    return {
        'Name': name,
        'DOB': dob,
        'Gender': gender,
        'Aadhaar': aadhaar
    }

def truncate_from_vid(text):
    """Return ``text`` cut off at the first "VID", with outer whitespace stripped.

    If "VID" does not occur, the whole text is returned (also stripped).
    The match is a plain, case-sensitive substring search.
    """
    # partition() yields the full string as the head when the marker is absent,
    # so a single code path covers both cases.
    before_vid, _marker, _rest = text.partition("VID")
    return before_vid.strip()



In [119]:
# Build a pipe-delimited sample record and run it through the cleaning helpers.
# It contains no Devanagari digits and no "VID" marker, so both helpers
# return it unchanged apart from stripping outer whitespace.
text = "|Priyanka Kumari|DOB: 1995-17-06|Female|9147385602|3Te|"
text = normalize_digits(text)
text = truncate_from_vid(text)
text

'|Priyanka Kumari|DOB: 1995-17-06|Female|9147385602|3Te|'

In [120]:
# Parse the sample record. Only the name is recovered: the date is hyphenated
# rather than DD/MM/YYYY, 'Female' is not upper-case, and the number has
# 10 digits rather than 12, so none of those patterns match.
extract_clean_fields(text)

{'Name': 'Priyanka Kumari', 'DOB': None, 'Gender': None, 'Aadhaar': None}

In [113]:
from datetime import datetime
# Convert a DD/MM/YYYY date string to ISO YYYY-MM-DD.
# NOTE(review): this reuses the global name `text`, overwriting whatever an
# earlier cell stored there - confirm that is acceptable.
text = "07/02/2005"
date_obj = datetime.strptime(text, "%d/%m/%Y")
date_obj.strftime('%Y-%m-%d')

'2005-02-07'

In [123]:
# A 128-value float vector pasted as text in NumPy's printed form: values are
# separated by whitespace and newlines with no commas, so the string is not a
# valid Python list literal as it stands.
# NOTE(review): presumably an embedding copied from another tool's output -
# confirm the source.
arr = '''[-0.20647378  0.11742667  0.08624212 -0.02662629 -0.02220517 -0.0058797
  0.0152042  -0.05426366  0.15940809 -0.06140548  0.23382628 -0.01395828
 -0.15439057 -0.17287491 -0.05324142  0.06056122 -0.11947733 -0.18890727
  0.01155647 -0.10165109  0.12306701  0.03594084 -0.01264772  0.06696057
 -0.2272352  -0.33907688 -0.0233891  -0.16224524  0.06986827 -0.12414453
 -0.03294193  0.03045478 -0.18189214  0.00584281 -0.06458424  0.13756827
  0.05002759  0.02797477  0.13048871  0.03760564 -0.14098807 -0.02747709
  0.00310716  0.31195199  0.16016361  0.01340836  0.01808142 -0.02546342
  0.05598837 -0.20404287  0.02328693  0.15806101  0.14099808  0.00469697
  0.10017949 -0.10682328 -0.03508019 -0.03065503 -0.16978313  0.06344254
 -0.01457671 -0.05493565 -0.05154479 -0.03924793  0.14494345  0.02093075
 -0.07811058 -0.08003723  0.16888157 -0.15775311 -0.03300582  0.11579999
 -0.02659567 -0.14969641 -0.20441915  0.08405729  0.42419243  0.14291857
 -0.17689601  0.08765633 -0.1613453  -0.05762089  0.04526284 -0.01771237
 -0.08877673  0.05154691 -0.17859621  0.06384777  0.14225556  0.00367226
  0.03374977  0.15089396  0.00720143  0.03013624  0.03696548  0.05425546
 -0.13831723  0.0964893  -0.03328162 -0.02715365  0.07312605 -0.11696307
  0.01829132  0.0389063  -0.21186319  0.12293202  0.03941235 -0.06246315
  0.00533076  0.0618738  -0.11821048 -0.03742075  0.11439694 -0.28637981
  0.13183531  0.17933178 -0.0095038   0.19305216  0.08872113  0.0557092
 -0.00841733 -0.06017147 -0.08541488 -0.08862396 -0.0031976  -0.06499568
  0.0075346   0.03265548]'''

In [126]:
# Turn the printed vector into a valid list literal: drop the outer brackets,
# split on any whitespace (including newlines), and re-join with commas.
arr_fixed = "[" + ", ".join(arr.strip("[]").split()) + "]"

In [131]:
import ast
import numpy as np
# Parse the comma-separated literal with ast.literal_eval (literals only, no
# arbitrary code execution) and wrap the resulting list in a NumPy array.
result = np.array(ast.literal_eval(arr_fixed))
print(type(result))
print(result)

<class 'numpy.ndarray'>
[-0.20647378  0.11742667  0.08624212 -0.02662629 -0.02220517 -0.0058797
  0.0152042  -0.05426366  0.15940809 -0.06140548  0.23382628 -0.01395828
 -0.15439057 -0.17287491 -0.05324142  0.06056122 -0.11947733 -0.18890727
  0.01155647 -0.10165109  0.12306701  0.03594084 -0.01264772  0.06696057
 -0.2272352  -0.33907688 -0.0233891  -0.16224524  0.06986827 -0.12414453
 -0.03294193  0.03045478 -0.18189214  0.00584281 -0.06458424  0.13756827
  0.05002759  0.02797477  0.13048871  0.03760564 -0.14098807 -0.02747709
  0.00310716  0.31195199  0.16016361  0.01340836  0.01808142 -0.02546342
  0.05598837 -0.20404287  0.02328693  0.15806101  0.14099808  0.00469697
  0.10017949 -0.10682328 -0.03508019 -0.03065503 -0.16978313  0.06344254
 -0.01457671 -0.05493565 -0.05154479 -0.03924793  0.14494345  0.02093075
 -0.07811058 -0.08003723  0.16888157 -0.15775311 -0.03300582  0.11579999
 -0.02659567 -0.14969641 -0.20441915  0.08405729  0.42419243  0.14291857
 -0.17689601  0.08765633 -0.