In [7]:
from PIL import Image, ImageOps, ImageFilter
import pytesseract
import re
import os
import cv2
import numpy as np
import pandas as pd

# Notebook display setting: remove the row limit so displayed DataFrames are not truncated.
pd.set_option('display.max_rows', None)

In [None]:
def load_image(image_path):
    """Read the image stored at ``image_path`` using ``cv2.imread``.

    Args:
        image_path: Path handed straight to ``cv2.imread``.

    Returns:
        The object returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
    """
    loaded = cv2.imread(image_path)
    if loaded is not None:
        return loaded
    raise FileNotFoundError(f"Image not found at {image_path}")

def _crop_fraction(image, top, bottom, left, right):
    """Slice ``image`` to the box given as fractions of its height and width."""
    # shape[:2] works for both (H, W) and (H, W, C) arrays.
    height, width = image.shape[:2]
    return image[
        int(height * top):int(height * bottom),
        int(width * left):int(width * right),
    ]


def crop_profile(image):
    """Cut the name, follower and bio regions out of a profile screenshot.

    The regions are fixed fractions of the image size, so the crops scale
    with the screenshot resolution. Both colour (H, W, C) and grayscale
    (H, W) arrays are accepted.

    Args:
        image: Array-like image exposing ``.shape`` and 2-D slicing.

    Returns:
        Tuple ``(name_img, follower_img, bio_img)`` of slices of ``image``.
    """
    # Name banner: 7%-12% of the height, 10%-78% of the width.
    name_img = _crop_fraction(image, 0.07, 0.12, 0.10, 0.78)
    # Follower row: 25.5%-34% of the height, 10%-90% of the width.
    follower_img = _crop_fraction(image, 0.255, 0.34, 0.1, 0.9)
    # Bio block: 40%-55% of the height, 10%-90% of the width.
    bio_img = _crop_fraction(image, 0.40, 0.55, 0.1, 0.9)
    return name_img, follower_img, bio_img

def to_grayscale(image):
    """Convert ``image`` with ``cv2.cvtColor`` using the BGR-to-gray conversion code."""
    conversion_code = cv2.COLOR_BGR2GRAY
    gray = cv2.cvtColor(image, conversion_code)
    return gray

def denoise_image(image):
    """Reduce noise in ``image`` with a median blur.

    Kernel sizes 5 and 3 were both tried; 3 is the value currently in use.
    """
    kernel_size = 3
    return cv2.medianBlur(image, kernel_size)

def binarize_image(image, threshold=150):
    """Binarize ``image`` with a fixed global threshold.

    This is a plain ``cv2.THRESH_BINARY`` threshold, not Otsu's method; the
    cutoff is not computed automatically. An Otsu variant
    (``cv2.THRESH_BINARY + cv2.THRESH_OTSU`` with a threshold of 0) was
    tried earlier and replaced by the fixed cutoff.

    Args:
        image: Image passed to ``cv2.threshold``.
        threshold: Cutoff value passed to ``cv2.threshold``. Defaults to
            150, the value that was previously hard-coded.

    Returns:
        The thresholded image (second element of ``cv2.threshold``'s result),
        produced with a maximum value of 255.
    """
    _, binary_img = cv2.threshold(image, threshold, 255, cv2.THRESH_BINARY)
    return binary_img

def upscale_image(image, scale_factor=2):
    """Resize ``image`` by ``scale_factor`` on both axes with linear interpolation."""
    resize_options = {
        "fx": scale_factor,
        "fy": scale_factor,
        "interpolation": cv2.INTER_LINEAR,
    }
    # dsize is None so the output size is derived from fx / fy.
    return cv2.resize(image, None, **resize_options)

def binarize_image_adaptive(image):
    """Binarize ``image`` with a Gaussian-weighted adaptive threshold.

    Tunables (``cv2.ADAPTIVE_THRESH_MEAN_C`` is the alternative method):
      * block size 11 - size of the pixel neighbourhood used to compute
        each local threshold;
      * C = 2 - constant subtracted from the (weighted) mean.
    """
    max_value = 255
    block_size = 11
    subtracted_constant = 2
    return cv2.adaptiveThreshold(
        image,
        max_value,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        subtracted_constant,
    )

def denoise_image_gaussian(image):
    """Smooth ``image`` with a small (3x3) Gaussian blur; sigma 0 means automatic."""
    kernel_size = (3, 3)
    sigma = 0
    return cv2.GaussianBlur(image, kernel_size, sigma)

def preprocess_pipeline(img):
    """Prepare a crop for OCR.

    Stages, applied in order: 2x upscale, grayscale conversion,
    Gaussian denoise, adaptive binarization. Returns the binarized image.
    """
    stages = (
        lambda frame: upscale_image(frame, scale_factor=2),
        to_grayscale,
        denoise_image_gaussian,
        binarize_image_adaptive,
    )
    result = img
    for stage in stages:
        result = stage(result)
    return result

# Compiled once at definition time instead of on every call.
_USERNAME_PATTERN = re.compile(r"@([a-zA-Z0-9._]+)")
# NOTE(review): the saved notebook output shows follower_count empty for every
# displayed row, so this pattern may not match real OCR text. It is left
# unchanged here - verify against sample follower text before tuning.
_FOLLOWER_COUNT_PATTERN = re.compile(r'@[\w\.]+\s+[\d,|= -]+?\s+([\doOKkMm\.]+)\s+Following Followers Likes')
_EMAIL_PATTERN = re.compile(r"\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b")
_WEBSITE_PATTERN = re.compile(r"(https?://[^\s]+|www\.[^\s]+)")


def _first_nonblank_line(text):
    """Return the first line of ``text`` with visible characters (stripped), or None."""
    for line in text.split('\n'):
        stripped = line.strip()
        if stripped:
            return stripped
    return None


def _first_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or None."""
    match = pattern.search(text)
    return match.group(1) if match else None


def image_extraction(name_img, follower_img, bio_img):
    """OCR the three profile crops and pull structured fields out of the text.

    Args:
        name_img: Crop containing the display name.
        follower_img: Crop containing the @username and the counts row.
        bio_img: Crop containing the biography text.

    Returns:
        dict with keys ``name``, ``username``, ``follower_count``, ``email``,
        ``website``, ``bio`` and ``IMG_num``. A field that cannot be found is
        ``None``; ``bio`` is always the stripped OCR text; ``IMG_num`` is
        always ``None`` here and is meant to be filled in by the caller.
    """
    name_text = pytesseract.image_to_string(name_img, lang='eng')
    follower_text = pytesseract.image_to_string(follower_img, lang='eng')
    bio_text = pytesseract.image_to_string(bio_img, lang='eng')

    return {
        # First non-blank line, so leading blank lines in the OCR output no
        # longer produce an empty name; None (not '') when nothing was read.
        'name': _first_nonblank_line(name_text),
        'username': _first_group(_USERNAME_PATTERN, follower_text),
        'follower_count': _first_group(_FOLLOWER_COUNT_PATTERN, follower_text),
        'email': _first_group(_EMAIL_PATTERN, bio_text),
        'website': _first_group(_WEBSITE_PATTERN, bio_text),
        'bio': bio_text.strip(),
        'IMG_num': None,
    }
    
def preprocessing_image(image_path):
    """Load a screenshot, crop its three regions and preprocess each for OCR.

    As a side effect the three processed crops are written to
    ``preprocessed_name.png``, ``preprocessed_follower.png`` and
    ``preprocessed_bio.png`` in the working directory (overwritten per call).

    Returns:
        Tuple ``(name_img, follower_img, bio_img)`` of preprocessed crops.
    """
    crops = crop_profile(load_image(image_path))
    name_img, follower_img, bio_img = (preprocess_pipeline(crop) for crop in crops)

    debug_outputs = (
        ("preprocessed_name.png", name_img),
        ("preprocessed_follower.png", follower_img),
        ("preprocessed_bio.png", bio_img),
    )
    for filename, processed in debug_outputs:
        cv2.imwrite(filename, processed)

    return name_img, follower_img, bio_img

def entire_folder(folder_path):
    """Run the OCR extraction over every file in ``folder_path``.

    Entries are processed in sorted name order so the row order is
    reproducible, and anything that is not a regular file (for example a
    sub-directory) is skipped. Each processed file name is printed as a
    progress indicator.

    Args:
        folder_path: Directory containing the profile screenshots.

    Returns:
        pandas.DataFrame with one row per processed file and the columns
        ``name``, ``username``, ``follower_count``, ``email``, ``website``,
        ``bio``, ``IMG_num`` (``IMG_num`` holds the file name).
    """
    columns = ['name', 'username', 'follower_count', 'email', 'website', 'bio', 'IMG_num']
    records = []
    for file in sorted(os.listdir(folder_path)):
        image_path = os.path.join(folder_path, file)
        if not os.path.isfile(image_path):
            continue
        name, follower, bio = preprocessing_image(image_path)
        extracted_info = image_extraction(name, follower, bio)
        extracted_info['IMG_num'] = file
        records.append(extracted_info)
        print(file)
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)



In [22]:
# Run the OCR extraction over every file in 'photos_full' (file names are
# printed as progress) and display the resulting table.
df = entire_folder('photos_full')
df

IMG_0075.JPG
IMG_0076.JPG
IMG_0081.JPG
IMG_0084.JPG
IMG_0344.JPG
IMG_0346.JPG
IMG_0421.JPG
IMG_0423.JPG
IMG_0424.JPG
IMG_0437.JPG
IMG_0439.JPG
IMG_0440.JPG
IMG_0441.JPG
IMG_0442.JPG
IMG_0444.JPG
IMG_0446.JPG
IMG_0447.JPG
IMG_0448.JPG
IMG_0449.JPG
IMG_0450.JPG
IMG_0452.JPG
IMG_0590.JPG
IMG_0907.JPG
IMG_0932.JPG
IMG_0933.JPG
IMG_0934.JPG
IMG_0935.JPG
IMG_0936.JPG
IMG_0939.JPG
IMG_0940.JPG
IMG_0944.JPG
IMG_0946.JPG
IMG_0947.JPG
IMG_0951.JPG
IMG_0952.JPG
IMG_0954.JPG
IMG_0955.JPG
IMG_0961.JPG
IMG_0962.JPG
IMG_0980.JPG
IMG_0981.JPG
IMG_0998.JPG
IMG_1002.JPG
IMG_1003.JPG
IMG_1005.JPG
IMG_1007.JPG
IMG_1096.JPG
IMG_1097.JPG
IMG_1098.JPG
IMG_1249.JPG
IMG_1273.JPG
IMG_1275.JPG
IMG_1279.JPG
IMG_1321.JPG
IMG_1347.JPG
IMG_1375.JPG
IMG_1376.JPG
IMG_1377.JPG
IMG_1892.JPG
IMG_1896.JPG
IMG_1990.JPG
IMG_1994.JPG
IMG_1995.JPG
IMG_1999.JPG
IMG_2053.JPG
IMG_2068.JPG
IMG_2070.JPG
IMG_2075.JPG
IMG_2077.JPG
IMG_2080.JPG
IMG_2081.JPG
IMG_2085.JPG
IMG_2086.JPG
IMG_2104.JPG
IMG_2107.JPG
IMG_2110.JPG
IMG_2116.JPG

Unnamed: 0,name,username,follower_count,email,website,bio,IMG_num
0,TenishaCelesteC),pieface.8,,,https://tenisha47.gumroad.com/I/HowlGot...,Mrs. Haze) |\nHow | Got a Talent Agent for my ...,IMG_0075.JPG
1,pen,ben.trinh,,,,basically a daddy diary\nfirst time parent stu...,IMG_0076.JPG
2,Maple and Lark,mapleandlark,,,www.mapleandlark.com,Original Bliss Bins and Home Organization\n100...,IMG_0081.JPG
3,Alexa,alexa_weathers,,alexaclofine@gmail.com,https://linktr.ee/Alexa_weathers,"Wife, SAHM\nLifestyle, shopping, & cute baby c...",IMG_0084.JPG
4,Hypeancdvice,hypeandvice,,,https://www.hypeandvice.com,"| ee eee eee || a ne, se in\nNL a oon a\n\nCol...",IMG_0344.JPG
5,Michaela | IVF | SAHM | FTM,michaela_motherhood,,,https://msha.ke/michaela_motherhood,IVF MAMA | BOY MOM\nTikTok shop finds\nUGC cre...,IMG_0346.JPG
6,Korean mommy | elim &,elim.1,,reachforthemars@gmail.com,https://youtu.be/4_MqNRCoo8k?feature=...,Living life with Jesus Christ\n\nbX: reachfort...,IMG_0421.JPG
7,A Thousand Roses Store,athousandroses.store,,,,Supporting & Loving Mums & Bubs\n100% Natural ...,IMG_0423.JPG
8,Adi Kehl,adigraceeeee,,adigracee@thesociablesociety.com,https://stan.store/Adigrace,collabs/pr: adigracee@thesociablesociety.com\n...,IMG_0424.JPG
9,NIKKI BEAUTY,nikkibeautystudio,,,,Natural lash extension specialist\nLONG ISLAND...,IMG_0437.JPG


In [19]:
# Single-image smoke test of the extraction pipeline.
path = 'photos_sample/IMG_8351.JPG'
# Earlier manual crop check, kept for reference. It is stale as written:
# crop_profile returns three crops (name, follower, bio), not two.
# img = load_image(path)
# name, bio = crop_profile(img)
# cv2.imwrite("name.png", name)
# cv2.imwrite("bio.png", bio)

# NOTE: despite the *_text names, preprocessing_image returns the three
# preprocessed image crops; the OCR itself happens inside image_extraction.
name_text, follower_text, bio_text = preprocessing_image(path)
extracted_info = image_extraction(name_text, follower_text, bio_text)
print(extracted_info['name'])
print()
print(extracted_info['username'])
print(extracted_info['email'])
print(extracted_info['website'])
print(extracted_info['bio'])


# entire_folder('photos_sample')

Leanne | Pregnancy After Lo...

hamilylife
HamilyLife@outlook.com
https://gleam.io/6aGdq/leanne-x-naitre-gi...
*+-The Hammonds *s- |
O5 The reality of my pregnancy journey &
HamilyLife@outlook.com
@ https://gleam.io/6aGdq/leanne-x-naitre-gi...

(=) Showcase


In [24]:
# Persist the extracted table. With the default arguments the DataFrame index
# is written too, as an unnamed first column.
df.to_csv('full_dataset.csv')