In [11]:
!pip install pytesseract



In [12]:
!pip install mysql-connector-python



In [10]:
# Step 1: Install required libraries
!pip install pytesseract
!pip install pillow
!pip install opencv-python-headless
!apt-get install -y tesseract-ocr

# Step 2: Import necessary libraries
import cv2
import pytesseract
import re
from google.colab import files

# Define the Aadhar_OCR class
class Aadhar_OCR:
    def __init__(self, img_path):
        self.user_aadhar_no = str()
        self.user_gender = str()
        self.user_dob = str()
        self.user_name = str()

        self.img_name = img_path

    def extract_data(self):
        # Reading the image, extracting text from it, and storing the text into a list.
        img = cv2.imread(self.img_name)
        text = pytesseract.image_to_string(img)
        all_text_list = re.split(r'[\n]', text)

        # Process the text list to remove all whitespace elements in the list.
        text_list = [i for i in all_text_list if not re.match(r'^(\s)+$', i) and i != '']

        # Extracting all the necessary details from the pruned text list.
        # 1) Aadhar Card No.
        aadhar_no_pat = r'^[0-9]{4}\s[0-9]{4}\s[0-9]{4}$'
        for i in text_list:
            if re.match(aadhar_no_pat, i):
                self.user_aadhar_no = i
                break

        # 2) Gender
        aadhar_male_pat = r'(Male|MALE|male)$'
        aadhar_female_pat = r'(Female|FEMALE|female)$'
        for i in text_list:
            if re.search(aadhar_male_pat, i):
                self.user_gender = 'MALE'
                break
            elif re.search(aadhar_female_pat, i):
                self.user_gender = 'FEMALE'
                break

        # 3) DOB
        aadhar_dob_pat = r'(Year|Birth|irth|YoB|YOB:|DOB:|DOB)'
        date_ele = str()
        dob_idx = None
        for idx, i in enumerate(text_list):
            if re.search(aadhar_dob_pat, i):
                index = re.search(aadhar_dob_pat, i).span()[1]
                date_ele = i
                dob_idx = idx
                break

        date_str = ''
        if dob_idx:
            for i in date_ele[index:]:
                if re.match(r'\d', i) or re.match(r'/', i):
                    date_str += i
            self.user_dob = date_str

            # 4) Name
            self.user_name = text_list[dob_idx-1]

        return [self.user_aadhar_no, self.user_gender, self.user_dob, self.user_name]


# Step 3: Upload the Aadhaar card image
uploaded = files.upload()

# Assuming only one file is uploaded, get the file name
image_path = list(uploaded.keys())[0]

# Step 4: Create an instance of Aadhar_OCR and extract data
aadhar_ocr = Aadhar_OCR(image_path)
extracted_data = aadhar_ocr.extract_data()

# Step 5: Print the extracted data
print("Extracted Data:")
print(f"Aadhar No: {extracted_data[0]}")
print(f"Gender: {extracted_data[1]}")
print(f"DOB: {extracted_data[2]}")
print(f"Name: {extracted_data[3]}")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


Saving suraj_Aadhar_front.jpg to suraj_Aadhar_front.jpg
Extracted Data:
Aadhar No: 4948 7529 3377
Gender: MALE
DOB: 31/08/1997
Name: Suraj Harishankar Gupta


In [13]:
# Step 1: Install required libraries
!pip install pytesseract
!pip install pillow
!pip install opencv-python-headless
!apt-get install -y tesseract-ocr

# Step 2: Import necessary libraries
import cv2
import pytesseract
import re
from google.colab import files

# Define the Aadhar_OCR class
class Aadhar_OCR:
    def __init__(self, img_front_path, img_back_path=None):
        self.user_aadhar_no = str()
        self.user_gender = str()
        self.user_dob = str()
        self.user_name = str()
        self.user_address = str()

        self.img_front_name = img_front_path
        self.img_back_name = img_back_path

    def extract_data(self):
        # Reading the front image, extracting text from it, and storing the text into a list.
        img = cv2.imread(self.img_front_name)
        text = pytesseract.image_to_string(img)
        all_text_list = re.split(r'[\n]', text)

        # Process the text list to remove all whitespace elements in the list.
        text_list = [i for i in all_text_list if not re.match(r'^(\s)+$', i) and i != '']

        # Extracting all the necessary details from the pruned text list.
        # 1) Aadhar Card No.
        aadhar_no_pat = r'^[0-9]{4}\s[0-9]{4}\s[0-9]{4}$'
        for i in text_list:
            if re.match(aadhar_no_pat, i):
                self.user_aadhar_no = i
                break

        # 2) Gender
        aadhar_male_pat = r'(Male|MALE|male)$'
        aadhar_female_pat = r'(Female|FEMALE|female)$'
        for i in text_list:
            if re.search(aadhar_male_pat, i):
                self.user_gender = 'MALE'
                break
            elif re.search(aadhar_female_pat, i):
                self.user_gender = 'FEMALE'
                break

        # 3) DOB
        aadhar_dob_pat = r'(Year|Birth|irth|YoB|YOB:|DOB:|DOB)'
        date_ele = str()
        dob_idx = None
        for idx, i in enumerate(text_list):
            if re.search(aadhar_dob_pat, i):
                index = re.search(aadhar_dob_pat, i).span()[1]
                date_ele = i
                dob_idx = idx
                break

        date_str = ''
        if dob_idx:
            for i in date_ele[index:]:
                if re.match(r'\d', i) or re.match(r'/', i):
                    date_str += i
            self.user_dob = date_str

            # 4) Name
            self.user_name = text_list[dob_idx-1]

        # If back image is provided, extract the address
        if self.img_back_name:
            img_back = cv2.imread(self.img_back_name)
            text_back = pytesseract.image_to_string(img_back)
            address_pat = r'(Address|ADDRESS|address)'
            addr_idx = None
            all_text_back_list = re.split(r'[\n]', text_back)
            text_back_list = [i for i in all_text_back_list if not re.match(r'^(\s)+$', i) and i != '']

            # Find the index where the address starts
            for idx, i in enumerate(text_back_list):
                if re.search(address_pat, i):
                    addr_idx = idx
                    break

            # Assuming address spans multiple lines, capture it
            if addr_idx is not None:
                self.user_address = " ".join(text_back_list[addr_idx+1:addr_idx+6])

        return [self.user_aadhar_no, self.user_gender, self.user_dob, self.user_name, self.user_address]


# Step 3: Upload the Aadhaar card images
print("Please upload the front image of Aadhaar card")
uploaded_front = files.upload()

# Assuming only one file is uploaded, get the file name
image_front_path = list(uploaded_front.keys())[0]

print("Please upload the back image of Aadhaar card (Optional)")
uploaded_back = files.upload()

# Check if a back image is provided
if uploaded_back:
    image_back_path = list(uploaded_back.keys())[0]
else:
    image_back_path = None

# Step 4: Create an instance of Aadhar_OCR and extract data
aadhar_ocr = Aadhar_OCR(image_front_path, image_back_path)
extracted_data = aadhar_ocr.extract_data()

# Step 5: Print the extracted data
print("Extracted Data:")
print(f"Aadhar No: {extracted_data[0]}")
print(f"Gender: {extracted_data[1]}")
print(f"DOB: {extracted_data[2]}")
print(f"Name: {extracted_data[3]}")
if extracted_data[4]:
    print(f"Address: {extracted_data[4]}")
else:
    print("Address: Not provided or not found")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Please upload the front image of Aadhaar card


Saving suraj_Aadhar_front.jpg to suraj_Aadhar_front (1).jpg
Please upload the back image of Aadhaar card (Optional)


Saving suraj_Aadhar_back.jpg to suraj_Aadhar_back.jpg
Extracted Data:
Aadhar No: 4948 7529 3377
Gender: MALE
DOB: 31/08/1997
Name: Suraj Harishankar Gupta
Address: Ganpat Patil Nagar, New Link Road, L.C.Colony, Mandapeshwar, Mumbai, Maharashtra, 400103 4948 7529 3377 = 1947 ><] help@uidai.gov.in @ www.uidai.gov.in
