In [154]:
from PIL import Image, ImageOps, ImageFilter
import pytesseract
import re
import os
import cv2
import numpy as np
import pandas as pd


In [None]:
def load_image(image_path):
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Image not found at {image_path}")
    return img

def crop_profile(image):
    height, width, _ = image.shape
    
    # name image
    name_top_crop = int(height * 0.07) 
    name_bottom_crop = int(height * 0.12)
    name_left_crop = int(width * 0.22)
    name_right_crop = int(width * 0.78)
    name_img = image[
        name_top_crop:name_bottom_crop, 
        name_left_crop:name_right_crop
    ]

    # follower image
    follower_top_crop = int(height * 0.255) 
    follower_bottom_crop = int(height * 0.34) #0.34
    follower_left_crop = int(width * 0.1)
    follower_right_crop = int(width * 0.9)
    follower_img = image[
        follower_top_crop:follower_bottom_crop,
        follower_left_crop:follower_right_crop
    ]
    
    # bio image
    bio_top_crop = int(height * 0.40)
    bio_bottom_crop = int(height * 0.55)
    bio_left_crop = int(width * 0.1)
    bio_right_crop = int(width * 0.9)
    bio_img = image[
        bio_top_crop:bio_bottom_crop,
        bio_left_crop:bio_right_crop
    ]
    return name_img, follower_img, bio_img

def to_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

def denoise_image(image):
    # Applies median blurring for noise reduction
    # Kernel size 5 is common, adjust if needed
    # Tried 5, 3
    return cv2.medianBlur(image, 3)

def binarize_image(image):
    # Applies Otsu's thresholding for binarization
    # Use THRESH_OTSU for automatic threshold calculation
    # _, binary_img = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    _, binary_img = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)
    return binary_img

def upscale_image(image, scale_factor=2):
    # Upscales an image using linear interpolation
    return cv2.resize(image, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)

def binarize_image_adaptive(image):
    # ADAPTIVE_THRESH_GAUSSIAN_C or ADAPTIVE_THRESH_MEAN_C
    # blockSize: Size of a pixel neighborhood that is used to calculate a threshold value
    # C: Constant subtracted from the mean or weighted mean
    binary_img = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY, 11, 2) # Adjust 11 and 2
    return binary_img

def denoise_image_gaussian(image):
    return cv2.GaussianBlur(image, (3, 3), 0) # Small kernel, 0 sigma for automatic

def preprocess_pipeline(img):

    upscaled_img = upscale_image(img, scale_factor=2)
    gray_img = to_grayscale(upscaled_img)
    denoised_img = denoise_image_gaussian(gray_img)
    binary_img = binarize_image_adaptive(denoised_img)

    return binary_img

def image_extraction(name_img, follower_img, bio_img):
    extracted_information = {
        'name': None,
        'username': None,
        'follower_count': None,
        'email': None,
        'website': None,
        'bio': None,
    }
    name_text = pytesseract.image_to_string(name_img, lang='eng')
    follower_text = pytesseract.image_to_string(follower_img, lang = 'eng')
    # config=' -c tessedit_char_whitelist=0123456789,.KM'
    print(follower_text)
    bio_text = pytesseract.image_to_string(bio_img, lang='eng')

    name_pattern = re.compile(r'^(.*)')
    username_pattern = re.compile(r"@([a-zA-Z0-9._]+)")
    follower_count_pattern = re.compile(r'@[\w\.]+\s+[\d,|= -]+?\s+([\doOKkMm\.]+)\s+Following Followers Likes')
    email_pattern = re.compile(r"\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b")
    website_pattern = re.compile(r"(https?://[^\s]+|www\.[^\s]+)")  
    

    name_match = name_pattern.search(name_text)
    extracted_information['bio'] = bio_text.strip()
    if name_match:
        extracted_information['name'] = name_match.group(1).strip()
    username_match = username_pattern.search(follower_text)
    if username_match:
        extracted_information["username"] = username_match.group(1)
    follower_count_match = follower_count_pattern.search(follower_text)
    if follower_count_match:
        extracted_information["follower_count"] = follower_count_match.group(1)
    email_match = email_pattern.search(bio_text)
    if email_match:
        extracted_information["email"] = email_match.group(1)
    website_match = website_pattern.search(bio_text)
    if website_match:
        extracted_information["website"] = website_match.group(1)

    return extracted_information
    
def preprocessing_image(image_path):

    img = load_image(image_path)
    name_img, follower_img , bio_img = crop_profile(img)

    name_img = preprocess_pipeline(name_img)
    follower_img = preprocess_pipeline(follower_img)
    bio_img = preprocess_pipeline(bio_img)

    # cv2.imwrite("preprocessed_name.png", name_img)
    # cv2.imwrite('preprocessed_follower.png', follower_img)
    # cv2.imwrite("preprocessed_bio.png", bio_img)

    # return image_extraction(name_img, bio_img)

    # cv2.imshow('name', name_img)
    # cv2.imshow('bio', bio_img)
    # name_text = pytesseract.image_to_string(name_img, lang='eng')
    # follower_text = pytesseract.image_to_string(follower_img, config='--psm 6 -c tessedit_char_whitelist=0123456789,.KM')
    # bio_text = pytesseract.image_to_string(bio_img, lang='eng')
    return name_img, follower_img, bio_img

def entire_folder(folder_path):
    df = pd.DataFrame(columns = ['name', 'username', 'follower_count', 'email', 'website', 'bio'])
    for file in os.listdir(folder_path)[0:10]:
        name, follower, bio = preprocessing_image(f'{folder_path}/{file}')
        extracted_info = image_extraction(name, follower, bio)
        df.loc[len(df)] = extracted_info
    return df

In [176]:
df = entire_folder('photos_sample')
df

@kb_houston
439 | 1,147 647.8iK

Following Followers Likes

@honeybloomcompany

1,290 | 963 8,355

Following Followers Likes

@tiny.cabin.montessori

21 | 4,816 274K

Following Followers Likes

@victoria.lynagh
130 6,888 | 1.1

Following Followers Likes

@brittdonner
673 | 1,582 15.2K

Following Followers Likes

@hamilylife
1,865 | 15.6K 656.71K

Following Followers Likes

@saf_edwards_firsttimemum

580 | 6,413 313.2K

Following Followers Likes

@mama_nurse_tina

668 | o48K 25.2M

Following Followers Likes

@mummy.to.karterjames

2,698 | 5,273 | 162.9K

Following Followers Likes

@thebabypt
28 = -378.6K | 5.91

Following Followers Likes



Unnamed: 0,name,username,follower_count,email,website,bio
0,Karen,kb_houston,,kbhouston32@gmail.com,,"mostly my cat, sometimes my life\nMl\nbad kbho..."
1,HoneyBloom Company,honeybloomcompany,,,www.thehoneybloomco.com,\Woman-owned\nFor the mothers)\nBump Friendly ...
2,Tiny Cabin Montessori,tiny.cabin.montessori,274K,brianna@tinycabinmontessori.com,,"Maria Montessori Approved\n® Seattle, WA\nbX) ..."
3,Victoria | First Time Mom,victoria.lynagh,1.1,bXivictoriabaronov@gmail.com,https://linktr.ee/victorialynagh,BFOS\n\nST PETE (yee\nunaesthetic motherhood j...
4,Brittany | Toddler Mom x2,brittdonner,15.2K,bibrittdonner18@gmail.com,https://linktr.ee/brittanydonner,SAHM to 2 Toddlers\nmotherhood & making things...
5,ne | Pregnancy After Lo...,hamilylife,,HamilyLife@outlook.com,https://gleam.io/6aGdq/leanne-x-naitre-gi...,*+-The Hammonds *s- |\nO5 The reality of my pr...
6,aft_edwards_firsttimemum,saf_edwards_firsttimemum,313.2K,,,Boy mum() dy &\nExpecting baby number 2) Two u...
7,‘Mama Nurse Tina*;-,mama_nurse_tina,,bumptolatch@gmail.com,https://www.bumptolatch.com,Childbirth + Pregnancy\nPersonal Thoughts\n\nC...
8,1a| Mummy to Karter&Bu...,mummy.to.karterjames,162.9K,Rceavecollabs@outlook.com,,2 under 3 pending &)Q)\n\nRceavecollabs@outloo...
9,"Dr. Olivia Reyes PT, DPT",thebabypt,,,,Developmental Physical Therapist\nDM “crawl” t...


In [153]:
path = 'photos_sample/IMG_8364.JPG'
# img = load_image(path)
# name, bio = crop_profile(img)
# cv2.imwrite("name.png", name)
# cv2.imwrite("bio.png", bio)

name_text, follower_text, bio_text = preprocessing_image(path)
print(name_text)
print('_' * 50)
print(follower_text)
print('_' * 50)
print(bio_text)

# entire_folder('photos_sample')

TraciDoula: Birth Mentor
she/her

__________________________________________________
4
1,134 174.K 4.7

__________________________________________________
Empowering you!
Keep your power ®:Real Talk on Resilience
PODCAST

> https://beacons.ai/tracidoula?fbclid=PAQ...

Instagram | &) Showcase | g? Subscription

