# Imports

In [59]:
from numpy.random import seed
seed(42)
import tensorflow as tf
tf.random.set_seed(42)

import html
from bs4 import BeautifulSoup
import requests
from requests.exceptions import Timeout
import pandas as pd
import numpy as np
import cupy as cp
from helium import * # if helium gives you chrome webdriver error, download latest chrome driver, and move it to here: .conda\Lib\site-packages\helium\_impl\webdrivers\windows ... source: https://github.com/mherrmann/selenium-python-helium/issues/55
import cv2
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import ChromeOptions

import string
import logging
from pprint import pprint
import regex as re
from multiprocessing import Process
import pickle
import os
import glob
import base64
from sys import getsizeof
import shutil
import json
import time
from datetime import datetime

from IPython.display import display, HTML
import ipywidgets
import heapq
import matplotlib.pyplot  as plt
from PIL import Image, ImageFile, ImageOps, ImageDraw
from PIL.ExifTags import TAGS
from pathlib import Path
import imagesize
import imghdr
import exifread
from matplotlib.widgets import LassoSelector
from matplotlib.path import Path as mplPath
import image_to_numpy
import pytesseract

import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from plotly.offline import init_notebook_mode

from jupyter_dash import JupyterDash
from dash import dcc, html, Input, Output

import unicodedata2
import lzma
from tqdm.notebook import tqdm
from collections import Counter
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from functools import partial

from tensorflow import keras
from keras import optimizers, datasets, layers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.python.keras import backend as K
import tensorflow_addons as tfa
from sklearn.metrics import classification_report 

# import easyocr
from retinaface import RetinaFace
from paddleocr import PaddleOCR, draw_ocr

import wordsegment
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import qalsadi.lemmatizer

# import download_imgs # used in getAsyncImgFunctions()

In [60]:
def pklSave(contentToBeSaved, fullPath):
    """
    Pickle `contentToBeSaved` into the file at `fullPath`.

    The file is opened in binary write mode, so an existing file at that
    path is replaced. Returns None.
    """
    with open(fullPath, mode='wb') as sink:
        pickle.Pickler(sink).dump(contentToBeSaved)

def pklLoad(fullPath):
    """
    Read the file at `fullPath` in binary mode and return the unpickled object.

    Errors from open() or pickle.load() propagate to the caller.
    NOTE: unpickling can execute arbitrary code, so only use this on files you trust.
    """
    with open(fullPath, mode='rb') as source:
        return pickle.load(source)

def pklForceLoad(path, dtype = 'dict'):
    """
    Load the pickle at `path`, falling back to an empty container.

    When pklLoad() raises any Exception, an empty list (if dtype == 'list')
    or an empty dict (for every other dtype value) is written to `path` with
    pklSave() and returned.

    WARNING: the fallback also runs when the file exists but cannot be
    unpickled, in which case the existing file is overwritten.
    """
    try:
        return pklLoad(path)
    except Exception:
        fallback = [] if dtype == 'list' else {}
        pklSave(fallback, path)
        return fallback

# more about naming standards for path components here: https://stackoverflow.com/questions/2235173/what-is-the-naming-standard-for-path-components
def joinPaths(baseDirectory, relativePath):
    """Join `relativePath` onto `baseDirectory` and return the normalized result."""
    combined = os.path.join(baseDirectory, relativePath)
    return os.path.normpath(combined)

def changeSep(path, newSep, oldSep=os.path.sep):
    """
    Normalize `path` and return it with the platform separator replaced by `newSep`.

    The path is first formatted into a string and passed through
    os.path.normpath(); every os.path.sep in the result is then swapped
    for `newSep`.

    NOTE: `oldSep` is accepted but not used by the body; the separator that
    gets replaced is always os.path.sep.
    """
    # f-string formatting turns the argument into a str, source: https://stackoverflow.com/questions/18707338/print-raw-string-from-variable-not-getting-the-answers#:~:text=To%20turn%20a%20variable%20to%20raw%20str%2C%20just%20use
    normalized = os.path.normpath(rf"{path}")
    # split/join on the separator, source for the original replace idea: https://mail.python.org/pipermail/tutor/2011-July/084788.html
    return newSep.join(normalized.split(os.path.sep))

def pillowOpenOriented(path):
    """
    Open the image at `path` and apply its EXIF orientation, if it has one.

    When the EXIF data contains tag 274 (the EXIF "Orientation" tag) the
    result of ImageOps.exif_transpose() is returned; otherwise the image is
    returned exactly as Image.open() produced it.
    """
    orientation_tag = 274
    picture = Image.open(path)
    if orientation_tag not in picture.getexif().keys():
        return picture
    return ImageOps.exif_transpose(picture)

# Separating Screenshots From Photos Using get_highest_color_counts()

In [61]:
def combine_color_channels(R, G, B):
    """Pack three channel values into one number: R shifted left 16 bits, G shifted left 8 bits, plus B."""
    red_part = R << 16
    green_part = G << 8
    return red_part + green_part + B

# draft (currently not used):
def extract_color_channels(color):
    """
    Split a packed color number into its (R, G, B) tuple.

    Each channel is the byte found after shifting `color` right by
    16, 8 and 0 bits respectively and masking with 0xFF.
    """
    return tuple((color >> shift) & 0xFF for shift in (16, 8, 0))

def get_highest_color_counts(img, top_n=3):
    """
    Find the most frequent pixel colors of an image and the share of the image each one covers.

    Parameters:
    img: a PIL image (the function uses its `mode`, `size`, `convert()`, `getcolors()` and `resize()`)
    top_n: how many of the most frequent colors to report

    Return Values' Notes:
    [(color_from_rgb_to_24_bits, color_to_image_ratio), ..., ..., until top_n most frequent colors]
    - color_from_rgb_to_24_bits is produced by combine_color_channels(r, g, b)
    - color_to_image_ratio is count / (width * height), rounded to 5 decimals
    - the list is ordered from the most frequent color downwards
    - [] when the image reports no colors at all
    - [(-1, 0)] when counting the colors runs out of memory
    """
    max_colors = 10000000

    # to do: remove this code if you found gpu alternative
    try:
        # getcolors() only yields (r, g, b) tuples for RGB images; other modes give
        # ints or 4-tuples that cannot be unpacked into three channels below
        if img.mode != 'RGB':
            img = img.convert('RGB')

        try:
            unique_colors = img.getcolors(max_colors)
        except OverflowError:
            # retry with a smaller limit, and keep using it for the fallback below
            max_colors = 1000000
            unique_colors = img.getcolors(max_colors)

        if unique_colors is None:
            # More distinct colors than the limit: shrink the image so that its pixel
            # count (999 * 999) is below either limit, then count again
            img = img.resize((1000-1, 1000-1))
            unique_colors = img.getcolors(max_colors)
    except MemoryError:
        print("Memory is full... returning (-1,0)")
        # Same list-of-tuples shape as the normal result, so callers can still index [0][1]
        return [(-1, 0)]

    if not unique_colors:
        return []

    w, h = img.size
    total_pixels = w * h

    # Find the top n colors with the highest count
    # Note that heapq.nlargest() is more efficient than sorting every color just to keep the first top_n
    top_colors = heapq.nlargest(top_n, unique_colors, key=lambda x: x[0])

    return [
        (combine_color_channels(r, g, b), round(count / total_pixels, 5))
        for count, (r, g, b) in top_colors
    ]


In [63]:
# draft, debugging: measure how much of each image is covered by its single most frequent color, to find a
# good threshold to split screenshots from photos taken by a camera (this and next cells)
path_to_folder = '../dataset/70. eGreetingAndMisc/'
max_color_counts = []
# Maps filename -> {'max_color_count': ratio of the most frequent color, 'image': path to the file}
image_dict = {}

for filename in os.listdir(path_to_folder):
    # Only process image files (matched by extension, case-sensitive)
    if not filename.endswith(('.jpg', '.png', '.jpeg')):
        continue

    img_path = os.path.join(path_to_folder, filename)
    # convert() returns a separate in-memory image, so the file can be closed right away
    with Image.open(img_path) as opened_img:
        img = opened_img.convert('RGB')

    print(filename) # progress output

    # One (color, ratio) entry for the most frequent color of the image
    max_color_count = get_highest_color_counts(img, top_n=1)

    # Keep the ratio of the most frequent color together with the image path
    image_dict[filename] = {'max_color_count': max_color_count[0][1], 'image': img_path}

eGreetingAndMisc0000000.jpg
eGreetingAndMisc0000001.jpg
eGreetingAndMisc0000002.jpg
eGreetingAndMisc0000003.jpg
eGreetingAndMisc0000004.jpg
eGreetingAndMisc0000005.jpg
eGreetingAndMisc0000006.jpg
eGreetingAndMisc0000007.jpg
eGreetingAndMisc0000008.jpg
eGreetingAndMisc0000009.jpg
eGreetingAndMisc0000010.jpg
eGreetingAndMisc0000011.jpg
eGreetingAndMisc0000012.jpg
eGreetingAndMisc0000013.jpg
eGreetingAndMisc0000014.jpg
eGreetingAndMisc0000015.jpg
eGreetingAndMisc0000016.jpg
eGreetingAndMisc0000017.jpg
eGreetingAndMisc0000018.jpg
eGreetingAndMisc0000019.jpg
eGreetingAndMisc0000020.jpg
eGreetingAndMisc0000021.jpg
eGreetingAndMisc0000022.jpg
eGreetingAndMisc0000023.jpg
eGreetingAndMisc0000024.jpg
eGreetingAndMisc0000025.jpg
eGreetingAndMisc0000026.jpg
eGreetingAndMisc0000027.jpg
eGreetingAndMisc0000028.jpg
eGreetingAndMisc0000029.jpg
eGreetingAndMisc0000030.jpg
eGreetingAndMisc0000031.jpg
eGreetingAndMisc0000032.jpg
eGreetingAndMisc0000033.jpg
eGreetingAndMisc0000034.jpg
eGreetingAndMisc0000

In [64]:
# Rank the (filename, info) pairs of image_dict by 'max_color_count' (the ratio of the most frequent color), in descending order
image_list = sorted(image_dict.items(), key=lambda x: x[1]['max_color_count'], reverse=True)

In [65]:
# Echo the ranked (filename, info) pairs as the cell output
image_list

[('eGreetingAndMisc0005930.jpg',
  {'max_color_count': 0.97936,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0005930.jpg'}),
 ('eGreetingAndMisc0011471.jpg',
  {'max_color_count': 0.97057,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0011471.jpg'}),
 ('eGreetingAndMisc0007819.jpg',
  {'max_color_count': 0.96795,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0007819.jpg'}),
 ('eGreetingAndMisc0009124.jpg',
  {'max_color_count': 0.9671,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0009124.jpg'}),
 ('eGreetingAndMisc0012874.jpg',
  {'max_color_count': 0.96023,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0012874.jpg'}),
 ('eGreetingAndMisc0013086.jpg',
  {'max_color_count': 0.96023,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0013086.jpg'}),
 ('eGreetingAndMisc0004596.jpg',
  {'max_color_count': 0.95625,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0004596.jpg'}),
 ('eGreetingAn

In [66]:
# Number of ranked images
len(image_list)

16205

In [67]:
# Selection handler that displays a preview of the chosen image
def on_hover(change):
    """
    Display a 50x50 preview of the image whose entry was selected in the filenames list.

    change: the change dict handed over by the widget's observe(); change['new'] is
            expected to be one of the list options, formatted as '<rank>_<filename>'.

    Values that are not an option built from image_list are ignored.
    """
    selected = change['new']
    if not isinstance(selected, str):
        return

    # Options look like '<rank>_<filename>'; the filename itself may contain more underscores
    rank_text, sep, original_name = selected.partition('_')
    if not sep or not rank_text.isdecimal():
        return

    # Check the option against its own slot instead of rebuilding every option string on each event
    rank = int(rank_text)
    if rank >= len(image_list) or image_list[rank][0] != original_name:
        return

    # resize() returns a separate image, so the file can be closed once it has been displayed
    with Image.open(image_dict[original_name]['image']) as image:
        display(image.resize((50,50)))

# Selection list with one '<rank>_<filename>' option per entry of image_list
list_widget = ipywidgets.Select(
    options=[f'{rank}_{name}' for rank, (name, _info) in enumerate(image_list)],
    description='Filenames:',
    layout=ipywidgets.Layout(width='50%'),
)

# Call on_hover whenever the selected value of the list changes
list_widget.observe(on_hover, names='value')

# Put the list inside a vertical box and render it
container = ipywidgets.VBox([list_widget])
display(container)

VBox(children=(Select(description='Filenames:', layout=Layout(width='50%'), options=('0_eGreetingAndMisc000593…

In [75]:
# debugging:
# by manually checking the image's displays, we see that most of the images until index 936 are screenshots.
# Prefix every file with its rank in image_list ('<rank>_<filename>') so the files can be told apart by rank.
for i, filename in enumerate(name for name, _ in image_list):
    # Manual cut-off: replace -1 with the first rank that should NOT be renamed; -1 never matches, so every file is renamed
    if i == -1:
        break
    src_path = joinPaths('../dataset/70. eGreetingAndMisc/', filename)
    dest_path = joinPaths('../dataset/70. eGreetingAndMisc/', f'{i}_{filename}')
    try:
        shutil.move(src_path, dest_path)
    except OSError as e:
        # Report and continue: one missing/locked file should not leave the folder half-renamed
        print(e)

# Separating Text Messages, ESocialMediaPosts, And EMemes Using Text Properties

In [68]:
ImageFile.LOAD_TRUNCATED_IMAGES = True # let Pillow load truncated image files instead of raising an "image file is truncated" error
os.environ["CUDA_VISIBLE_DEVICES"]="0" # set before the OCR models are created: CUDA reads this variable when it is initialised, later changes are ignored
# NOTE(review): this English instance is replaced by the Arabic one right below; it is kept only in case its model download is relied on elsewhere - confirm and drop
paddle_ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True) # need to run only once to download and load model into memory
# NOTE(review): confirm that PaddleOCR actually honours `text_recognizer_cfg` - TODO verify against the installed version
paddle_ocr = PaddleOCR(use_angle_cls=True, use_gpu=True, lang="ar", text_recognizer_cfg= {
                            'TextBoxBuilder': {
                                # 'min_size': 16,
                                'score_thresh': 0.8,
                                # 'nms_thresh': 0.2
                            }
                       }) # need to run only once to download and load model into memory
logging.getLogger('ppocr').setLevel(logging.WARNING) # to disable ppocr debug messages at each image input (e.g., [2023/03/14 06:41:22] ppocr DEBUG: dt_boxes num : 0, elapse : 0.014979839324951172)
tf.config.list_physical_devices('GPU') # echo the GPUs TensorFlow can see as the cell output

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [69]:
def get_text_boxes_num(img):
    """
    Count the text boxes PaddleOCR detects in an image.

    The image is converted to a numpy array and passed to the global
    `paddle_ocr` with detection on and recognition off; the number of entries
    in the first element of the result is returned.

    Returns 0 when the result is empty or its first element is None, and also
    when the OCR call fails (the failure is logged instead of being hidden).
    """
    try:
        detections = paddle_ocr.ocr(np.array(img), det=True, rec=False, cls=True)
        first_page = detections[0] if detections else None
        return len(first_page) if first_page is not None else 0
    except Exception:
        # Best-effort: one bad image must not stop a whole scan. Unlike a bare `except:`,
        # this lets KeyboardInterrupt / SystemExit through so a long scan can be stopped.
        logging.getLogger(__name__).warning(
            "text detection failed, counting 0 text boxes", exc_info=True
        )
        return 0

In [70]:
# draft, debugging: count the detected text boxes of every image in the folder, to look for a
# threshold that separates the images by how much text they contain (this and next cells)
path_to_folder = '../dataset/70. eGreetingAndMisc/'
max_color_counts = []
# Maps filename -> {'text_boxes_num': number of detected text boxes, 'image': path to the file}
image_dict_2 = {}

for filename in os.listdir(path_to_folder):
    # Only process image files (matched by extension, case-sensitive)
    if not filename.endswith(('.jpg', '.png', '.jpeg')):
        continue

    img_path = os.path.join(path_to_folder, filename)
    # convert() returns a separate in-memory image, so the file can be closed right away
    with Image.open(img_path) as opened_img:
        img = opened_img.convert('RGB')

    print(filename) # progress output

    text_boxes_num = get_text_boxes_num(img)

    # Keep the text box count together with the image path
    image_dict_2[filename] = {'text_boxes_num': text_boxes_num, 'image': img_path}

eGreetingAndMisc0000000.jpg
eGreetingAndMisc0000001.jpg
eGreetingAndMisc0000002.jpg
eGreetingAndMisc0000003.jpg
eGreetingAndMisc0000004.jpg
eGreetingAndMisc0000005.jpg
eGreetingAndMisc0000006.jpg
eGreetingAndMisc0000007.jpg
eGreetingAndMisc0000008.jpg
eGreetingAndMisc0000009.jpg
eGreetingAndMisc0000010.jpg
eGreetingAndMisc0000011.jpg
eGreetingAndMisc0000012.jpg
eGreetingAndMisc0000013.jpg
eGreetingAndMisc0000014.jpg
eGreetingAndMisc0000015.jpg
eGreetingAndMisc0000016.jpg
eGreetingAndMisc0000017.jpg
eGreetingAndMisc0000018.jpg
eGreetingAndMisc0000019.jpg
eGreetingAndMisc0000020.jpg
eGreetingAndMisc0000021.jpg
eGreetingAndMisc0000022.jpg
eGreetingAndMisc0000023.jpg
eGreetingAndMisc0000024.jpg
eGreetingAndMisc0000025.jpg
eGreetingAndMisc0000026.jpg
eGreetingAndMisc0000027.jpg
eGreetingAndMisc0000028.jpg
eGreetingAndMisc0000029.jpg
eGreetingAndMisc0000030.jpg
eGreetingAndMisc0000031.jpg
eGreetingAndMisc0000032.jpg
eGreetingAndMisc0000033.jpg
eGreetingAndMisc0000034.jpg
eGreetingAndMisc0000

In [71]:
# Rank the (filename, info) pairs of image_dict_2 by 'text_boxes_num', in descending order
image_list_2 = sorted(image_dict_2.items(), key=lambda x: x[1]['text_boxes_num'], reverse=True)

In [72]:
# Echo the ranked (filename, info) pairs as the cell output
image_list_2

[('eGreetingAndMisc0013440.jpg',
  {'text_boxes_num': 75,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0013440.jpg'}),
 ('eGreetingAndMisc0004829.jpg',
  {'text_boxes_num': 61,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0004829.jpg'}),
 ('eGreetingAndMisc0013228.jpg',
  {'text_boxes_num': 57,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0013228.jpg'}),
 ('eGreetingAndMisc0013266.jpg',
  {'text_boxes_num': 57,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0013266.jpg'}),
 ('eGreetingAndMisc0012832.jpg',
  {'text_boxes_num': 51,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0012832.jpg'}),
 ('eGreetingAndMisc0012289.jpg',
  {'text_boxes_num': 50,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0012289.jpg'}),
 ('eGreetingAndMisc0002153.jpg',
  {'text_boxes_num': 49,
   'image': '../dataset/70. eGreetingAndMisc/eGreetingAndMisc0002153.jpg'}),
 ('eGreetingAndMisc0010600.jpg',
  {'text_boxes_num': 4

In [73]:
# Number of ranked images
len(image_list_2)

16205

In [74]:
# Selection handler that displays a preview of the chosen image
def on_hover(change):
    """
    Display a 70x70 preview of the image whose entry was selected in the filenames list.

    change: the change dict handed over by the widget's observe(); change['new'] is
            expected to be one of the list options, formatted as '<rank>_<filename>'.

    Values that are not an option built from image_list_2 are ignored.
    """
    selected = change['new']
    if not isinstance(selected, str):
        return

    # Options look like '<rank>_<filename>'; the filename itself may contain more underscores
    rank_text, sep, original_name = selected.partition('_')
    if not sep or not rank_text.isdecimal():
        return

    # Check the option against its own slot instead of rebuilding every option string on each event
    rank = int(rank_text)
    if rank >= len(image_list_2) or image_list_2[rank][0] != original_name:
        return

    # resize() returns a separate image, so the file can be closed once it has been displayed
    with Image.open(image_dict_2[original_name]['image']) as image:
        display(image.resize((70,70)))

# Selection list with one '<rank>_<filename>' option per entry of image_list_2
list_widget = ipywidgets.Select(
    options=[f'{rank}_{name}' for rank, (name, _info) in enumerate(image_list_2)],
    description='Filenames:',
    layout=ipywidgets.Layout(width='50%'),
)

# Call on_hover whenever the selected value of the list changes
list_widget.observe(on_hover, names='value')

# Put the list inside a vertical box and render it
container = ipywidgets.VBox([list_widget])
display(container)

VBox(children=(Select(description='Filenames:', layout=Layout(width='50%'), options=('0_eGreetingAndMisc001344…

In [76]:
# debugging:
# Rename every image to '<rank>_<filename>', where rank is its position in image_list_2
# (sorted by 'text_boxes_num', highest first); files that cannot be moved are printed and skipped
for i, filename in enumerate(pair[0] for pair in image_list_2):
    if i == -1: # manual cut-off; -1 is never produced by enumerate(), so the loop is not cut short
        break
    dest_path = joinPaths('../dataset/70. eGreetingAndMisc/', f'{i}_{filename}')
    src_path = joinPaths('../dataset/70. eGreetingAndMisc/', filename)
    try:
        shutil.move(src_path, dest_path)
    except Exception as e:
        print(e)

[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0013440.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0012832.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0012289.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0002629.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0002616.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0015643.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0002554.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0015154.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreetingAndMisc0002668.jpg'
[Errno 2] No such file or directory: '..\\dataset\\70. eGreetingAndMisc\\eGreeting