part 3 is for exploratory data analysis (EDA) of classes which have less scores than other respective classes & adjusting these classes' images respectively

# Imports

In [80]:
from numpy.random import seed
seed(42)
import tensorflow as tf
tf.random.set_seed(42)

import pandas as pd
import numpy as np

import sys
import string
import logging
import regex as re
import pickle
import os
import glob
import base64

from IPython.display import display
import matplotlib.pyplot  as plt
from PIL import Image, ImageFile, ImageOps, ImageDraw
from pathlib import Path

import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from plotly.offline import init_notebook_mode

from tqdm.notebook import tqdm
from collections import Counter
from keras_preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from functools import partial

import fastdup

# Global Variables & Functions

In [10]:
dataDir = 'dataset/'
dataReDir = 'dataset_related/'
imgTypes = list(filter(lambda v: re.match('\d{1,3}\. ', v), os.listdir(dataDir)))
imgTypes

['00. selfies',
 '10. fmemes',
 '20. ememes',
 '30. fSocialMedia',
 '40. eSocialMedia',
 '50. fTxtMssgs',
 '70. eGreetingAndMisc',
 '81. academicPhotos',
 '82. academicDigital']

In [11]:
def pklSave(contentToBeSaved, fullPath):
    with open(fullPath, 'wb') as f:
        pickle.dump(contentToBeSaved, f)

def pklLoad(fullPath):
    with open(fullPath, 'rb') as f:
        content = pickle.load(f)
    return content

def pklForceLoad(path, dtype = 'dict'):
    try:
        content = pklLoad(path)
        return content
    except Exception as e:
        if dtype == 'list':
            pklSave([], path)
            return []
        else:
            pklSave({}, path)
            return {}

# more about naming standards for path components here: https://stackoverflow.com/questions/2235173/what-is-the-naming-standard-for-path-components
def joinPaths(baseDirectory, relativePath):
    return os.path.normpath(os.path.join(baseDirectory, relativePath))

def changeSep(path, newSep, oldSep=os.path.sep): 
    # source: https://stackoverflow.com/questions/18707338/print-raw-string-from-variable-not-getting-the-answers#:~:text=To%20turn%20a%20variable%20to%20raw%20str%2C%20just%20use
    return os.path.normpath(rf"{path}").replace(os.path.sep, newSep) # source: https://mail.python.org/pipermail/tutor/2011-July/084788.html

def pillowOpenOriented(path):
    img = Image.open(path)
    return ImageOps.exif_transpose(img)

## Setting up images/Labels' dictionaries and the metadata csv file

In [12]:
imgTypeToNum = pklLoad('./dataset_related./imgTypeToNum.pickle')
numToImgType = {}
for  i, type in enumerate(imgTypes):
    imgTypeToNum[type] = i
    numToImgType[i] = type
imgTypeToNum

{'00. selfies': 0,
 '10. fmemes': 1,
 '20. ememes': 2,
 '30. fSocialMedia': 3,
 '40. eSocialMedia': 4,
 '50. fTxtMssgs': 5,
 '70. eGreetingAndMisc': 6,
 '81. academicPhotos': 7,
 '82. academicDigital': 8}

In [13]:
imgs_metadata_v2 = pd.read_csv('./dataset_related/imgsPropsv2.csv')

def combine_paths(row):
    return joinPaths(row['relative_path'], row['file_name'])
imgs_metadata_v2['full_path'] = imgs_metadata_v2.apply(combine_paths, axis=1)
imgs_metadata_v2.set_index('full_path', inplace=True)

imgs_metadata_v2.sample(5)

# side note: the "mixed types" warnings are for 
# Index(['ar_words_original', 'ar_words_lemmatized', 'ar_words_including_separators'], dtype='object')
# columns, which currently don't interest us, so we'll neglect this warning for now.


Columns (25,28,30) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0_level_0,file_name,relative_path,aspect_ratio,area,width,height,dominant_color_1,color_to_image_ratio_1,dominant_color_2,color_to_image_ratio_2,...,en_words_segmented_and_lemmatized,ar_words_lemmatized,en_words_including_separators,ar_words_including_separators,en_words_to_boxes,ar_words_to_boxes,en_avg_score,ar_avg_score,math_digits_or_symbols_count,class
full_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dataset\40. eSocialMedia\eSocialMedia0000875.jpg,eSocialMedia0000875.jpg,dataset/40. eSocialMedia,2.31,297252,828,359,16777215,0.70621,16711422,0.05507,...,muhammed usseirdtnooxgonboysert,اتجوزيني وهبقى أزمة انا معنديش أزمة هعملك وهبق...,Muhammed usseir || 2d ||| TnooX || Gonboysert,اتجوزيني وهبقى وقت الأزمات || بس انا معنديش أز...,"{'Muhammed': [[[123.0, 62.0], [286.0, 62.0], [...","{'الأزمات': [[[117.0, 147.0], [243.0, 142.0], ...",0.72223,0.98994,0,40. eSocialMedia
dataset\10. fmemes\fmemes0010267.jpg,fmemes0010267.jpg,dataset/10. fmemes,1.0,250000,500,500,16777215,0.04882,0,0.0279,...,ira lpa belloni de ouemadosreirtedeellosne gen...,,IR ALPABELLONIDEOUEMADOS ||| REIRTEDEELLOS || ...,,"{'IR ALPABELLONIDEOUEMADOS': [[[17.0, 11.0], [...",{},0.95616,0.0,0,10. fmemes
dataset\10. fmemes\fmemes0009228.jpg,fmemes0009228.jpg,dataset/10. fmemes,1.5,166500,500,333,16777215,0.06508,0,0.03053,...,dotvneoneoesehsiamasloha gas gratis,,DOTVNEONEOESEHS ||| IAMAS LO HAGAS GRATIS,,"{'DOTVNEONEOESEHS': [[[42.0, 11.0], [457.0, 13...",{},0.79763,0.0,0,10. fmemes
dataset\00. selfies\selfies0022023.jpg,selfies0022023.jpg,dataset/00. selfies,1.0,93636,306,306,16777215,0.37274,16711679,0.00578,...,,,,,{},{},0.0,0.0,0,00. selfies
dataset\70. eGreetingAndMisc\eGreetingAndMisc0006165.jpg,eGreetingAndMisc0006165.jpg,dataset/70. eGreetingAndMisc,2.1,50530,326,155,3876,0.3565,3878,0.00671,...,,مبار محتوى عيدكه عام خير,,مبارك محتويات || عيدكه || وكل عام وأنتم بخير,{},"{'محتويات': [[[281.0, 2.0], [318.0, 2.0], [318...",0.0,0.98155,0,70. eGreetingAndMisc


# Further Refinement of `academicDigital` and `fSocialMedia` Datasets

As we can see from the best model scores table in `comparing_models_results.ipynb` (before introducing models v3.x):

<img src='./project_media/best_scores_table_before_v3.png' width=400 />

the two most classes that we should aim to improve are `academicDigital` and `fSocialMedia` classes

## Functions to Measure & Order Images 

In [14]:
def get_image_ratio(img_or_path):
    if isinstance(img_or_path, str):
        img = Image.open(img_or_path)
    else:
        img = img_or_path
    width, height = img.size
    ratio = round(float(width) / float(height), 3)
    return ratio

# draft: use this only if you don't have a csv containing img's metadata 
# (which in my case, got created in dataset_preprocessing_part_2.ipynb)
def get_text_boxes_num(img_or_path):
    if isinstance(img_or_path, str):
        img = Image.open(img_or_path)
    else:
        img = img_or_path

    # import check
    if 'paddleocr' in sys.modules:
        from paddleocr import PaddleOCR
        paddle_ocr = PaddleOCR(use_angle_cls=True, use_gpu=True, lang="en")
        paddle_ocr = PaddleOCR(use_angle_cls=True, use_gpu=True, lang="ar")

    try:
        return len(paddle_ocr.ocr(np.array(img), det=True, rec=False, cls=True)[0])
    except:
        # then '[0]' didn't work as no boxes were found, so return 0
        return 0

def num_detected_words(img_path):
    global imgs_metadata_v2
    return imgs_metadata_v2.loc[img_path, 'total_en_boxes'] + imgs_metadata_v2.loc[img_path, 'total_ar_boxes']

## Functions to Manage Images' Removal

In [148]:
from typing import Callable, Union

def _format_num(x):
    return format(x, '07d')

def prefix_imgs_by_score(dir:str, scoring_func:Callable[[Union[str, Image.Image]], float], asc=False):
    '''
    in images inside `dir`, their names will be like:
    0_img004.jpg, 1_img007.jpg, 2_img002.jpg, ...
    of course, the `img004`/etc part is not exact, but depends on your original 
    naming structure of the images 
    '''
    # Get a list of image filenames in the directory
    filenames = sorted(os.listdir(dir))
    # Calculate scores for each image using the scoring function
    scores = [(filename, scoring_func(joinPaths(dir, filename))) for filename in filenames]

    # Sort the filenames based on the scores in descending order
    sorted_old_filenames_with_scores = sorted(scores, key=lambda x: x[1], reverse=(not asc))
    # side note: used _format_num() instead of directly using "i" to maintain structure 
    # (helps when we want to sort list by string)
    sorted_filenames = [f"{_format_num(i)}_{filename}" for i, (filename, _) in enumerate(sorted_old_filenames_with_scores)]

    # map each prefix number (e.g., '0' in '0_', etc) to filename
    # to be possibly used later in rename_imgs()
    prefix_to_filename_with_prefix = {i : filename for i, filename in enumerate(sorted_filenames)}

    # Rename the files with the sorted filenames
    for (old_filename, _), new_filename in zip(sorted_old_filenames_with_scores, sorted_filenames):
        old_path = joinPaths(dir, old_filename)
        new_path = joinPaths(dir, new_filename)
        os.rename(old_path, new_path)
    
    return prefix_to_filename_with_prefix

def _alter_orig_filename_func(filename:str, id:int):
    '''
    Example:
    'asdf/dfklje/eSocialMedia0000007.jpg'
    will be:
    'asdf/dfklje/eSocialMedia0000100.jpg'
    if id = 100, and so on...
    '''
    return re.sub(r'(\d+)(?=\.[^.]+$)', _format_num(id), filename)

def removing_imgs_prefix(dir:str, old_prefix_to_filename:dict, alter_orig_filename=False):
    '''
    this function should be called after you manually/automatically 
    remove the unwanted images from `dir`. it does the following:
    stores the removed filenames (without the prefix, i.e., "0_") and returns that list of names
    Furthermore, in images inside `dir`, the filenames will change from:
    0_img004.jpg, 1_img007.jpg, 2_img002.jpg, ...
    to this if alter_orig_filename=False:
    img004.jpg,   img007.jpg,    img002.jpg, ... IF 
    else:
    img001.jpg,   img002.jpg,    img003.jpg, ... IF alter_orig_filename=True

    of course, the `img004`/etc part is not exact, but depends on your original 
    naming structure of the images
    '''
    # Get a list of image filenames in the directory
    # and the final prefix value that was set by prefix_imgs_by_score() 
    # which should be equal to the number of images in dir - 1 (as 0-based prefix is used)
    filenames = sorted(os.listdir(dir))
    final_prefix = max(old_prefix_to_filename.keys())
    
    old_paths, new_paths = [], []
    removed_filenames = []
    
    # contains new prefix/filename pairs (after removing unwanted images from `dir`)
    updated_prefix_to_filename = {int(filename_with_prefix.split('_', 1)[0]) : filename_with_prefix.split('_', 1)[1]
                                  for filename_with_prefix in filenames}
    
    # `old_prefix_to_filename` contains all prefix/filename pairs (even those that you removed from `dir`)
    for old_prefix in old_prefix_to_filename.keys():
        filename = updated_prefix_to_filename.get(old_prefix)
        if filename is None:
            removed_filenames.append(old_prefix_to_filename[old_prefix])
        else:
            old_paths.append(joinPaths(dir, _format_num(old_prefix) + '_' + filename))
            new_paths.append(joinPaths(dir, filename))

    # removing prefixes from filenames in `dir`
    for i in range(len(new_paths)):
        if alter_orig_filename:
            # changing id of filename to be sequential (check below function's docstring for example)
            new_paths[i] = _alter_orig_filename_func(new_paths[i], i)
        os.rename(old_paths[i], new_paths[i])
    
    return removed_filenames

def update_metadata_df(df, removed_filenames, have_prefix=True, full_path=False, alter_orig_filename=False) -> pd.DataFrame:
    if full_path:
        removed_filenames = [os.path.basename(fn) for fn in removed_filenames]
    if have_prefix:
        removed_filenames_without_prefix = [fn.split('_', 1)[1] for fn in removed_filenames]
    else:
        removed_filenames_without_prefix = removed_filenames
    # Removing rows based on condition on column 'filename'
    condition = ~df['file_name'].isin(removed_filenames_without_prefix)
    df = df[condition]
    if alter_orig_filename:
        # TODO: adjust df if filenames (including id) have been altered (out of current scope)
        pass
    return df 

def remove_imgs(filepaths_to_remove:list[str]):
    for fp in filepaths_to_remove:
        os.remove(fp)
    return filepaths_to_remove

## Filtering `academicDigital` Dataset

### By Ratio

In [56]:
class_path = joinPaths(dataDir, '82. academicDigital/')

prefix_to_filename_with_prefix = prefix_imgs_by_score(class_path, get_image_ratio, asc=False)

In [64]:
# removed_filenames are the unwanted images that you manually removed from `dir`
removed_filenames = removing_imgs_prefix(class_path, prefix_to_filename_with_prefix)
removed_filenames

['0000000_academicDigital0000045.jpg',
 '0000001_academicDigital0000416.jpg',
 '0000002_academicDigital0001023.jpg',
 '0000003_academicDigital0000641.jpg',
 '0000004_academicDigital0000071.jpg',
 '0000005_academicDigital0000073.jpg',
 '0000006_academicDigital0000082.jpg',
 '0000007_academicDigital0000106.jpg',
 '0000008_academicDigital0000081.jpg',
 '0000009_academicDigital0000084.jpg',
 '0000010_academicDigital0000673.jpg',
 '0000011_academicDigital0000367.jpg',
 '0000012_academicDigital0000368.jpg',
 '0000013_academicDigital0000143.jpg',
 '0000014_academicDigital0000046.jpg',
 '0000015_academicDigital0000792.jpg',
 '0000016_academicDigital0001010.jpg',
 '0000017_academicDigital0001516.jpg',
 '0000018_academicDigital0001358.jpg',
 '0000019_academicDigital0000313.jpg',
 '0000020_academicDigital0000154.jpg',
 '0000021_academicDigital0001196.jpg',
 '0000022_academicDigital0000720.jpg',
 '0000023_academicDigital0001586.jpg',
 '0000024_academicDigital0000356.jpg',
 '0000025_academicDigital

In [76]:
imgs_metadata_v3_ratio = update_metadata_df(imgs_metadata_v2, removed_filenames)
imgs_metadata_v3_ratio[imgs_metadata_v3_ratio['class'] == '82. academicDigital'].shape

(1626, 37)

Before: 1979 --> after: 1626

sample of images with high ratio removed (total: 293):

<img src='./project_media/academicDigital high ratio.png' width=1100 />


sample of images with low ratio removed (total: 60):

<img src='./project_media/academicDigital low ratio.png' width=1100 />

### By #words

In [79]:
prefix_to_filename_with_prefix = prefix_imgs_by_score(class_path, num_detected_words, asc=False)

In [82]:
# removed_filenames are the unwanted images that you manually removed from `dir`
removed_filenames = removing_imgs_prefix(class_path, prefix_to_filename_with_prefix)
removed_filenames

['0000729_academicDigital0001444.jpg',
 '0000883_academicDigital0001478.jpg',
 '0000925_academicDigital0000582.jpg',
 '0000927_academicDigital0000978.jpg',
 '0000947_academicDigital0000583.jpg',
 '0000959_academicDigital0000469.jpg',
 '0000979_academicDigital0000563.jpg',
 '0001013_academicDigital0001107.jpg',
 '0001030_academicDigital0001060.jpg',
 '0001044_academicDigital0000809.jpg',
 '0001179_academicDigital0001325.jpg',
 '0001186_academicDigital0000670.jpg',
 '0001332_academicDigital0000472.jpg',
 '0001344_academicDigital0000980.jpg',
 '0001345_academicDigital0001106.jpg',
 '0001423_academicDigital0000119.jpg',
 '0001424_academicDigital0000744.jpg',
 '0001425_academicDigital0000868.jpg',
 '0001427_academicDigital0000910.jpg',
 '0001428_academicDigital0000913.jpg',
 '0001429_academicDigital0000991.jpg',
 '0001434_academicDigital0000095.jpg',
 '0001437_academicDigital0000537.jpg',
 '0001439_academicDigital0000912.jpg',
 '0001442_academicDigital0001189.jpg',
 '0001446_academicDigital

In [84]:
imgs_metadata_v3_ratio_texts = update_metadata_df(imgs_metadata_v3_ratio, removed_filenames)
imgs_metadata_v3_ratio_texts[imgs_metadata_v3_ratio_texts['class'] == '82. academicDigital'].shape

(1422, 37)

Before: 1626 --> after: 1422

sample of images with few words (total: 204):

<img src='./project_media/academicDigital few words.png' width=1100 />


### By `fastdup` Functions

In [135]:
work_dir = './dataset_related/fastdup_work_dir_academicDigital'
input_dir = joinPaths(dataDir, '82. academicDigital/')

In [137]:
fd = fastdup.create(work_dir, input_dir)
fd.run()

FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.

 ########################################################################################

Dataset Analysis Summary: 

    Dataset contains 1422 images
    Valid images are 100.00% (1,422) of the data, invalid are 0.00% (0) of the data
    Similarity:  20.82% (296) belong to 19 similarity clusters (components).
    79.18% (1,126) images do not belong to any similarity cluster.
    Largest cluster has 554 (38.96%) images.
    For a detailed analysis, use `.connected_components()`
(similarity threshold used is 0.9, connected component threshold used is 0.96).

    Outliers: 5.84% (83) of images are possible outliers, and fall in the bottom 5.00% of similarity values.
    For a detailed list of outliers, use `.outliers()`.


In [139]:
fastdup.run_kmeans(input_dir, work_dir, num_clusters=10, nearest_neighbors_k=1, high_accuracy=True)

FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.


0

In [140]:
fastdup.create_kmeans_clusters_gallery(work_dir, save_path='./dataset_related/fastdup_academicDigital_kmeans')

100%|██████████| 10/10 [00:07<00:00,  1.34it/s]


Finished OK. Components are stored as image files ./dataset_related/fastdup_academicDigital_kmeans/components_[index].jpg
Stored kmeans clusters visual view in  ./dataset_related/fastdup_academicDigital_kmeans\components.html
Execution time in seconds 8.9


0

In [141]:
kmeans_df = pd.read_csv(joinPaths(work_dir, 'kmeans_assignments.csv'))

In [144]:
kmeans_cluster_filenames = kmeans_df.groupby('cluster').agg({'filename': list, 'distance': max})
cluster_to_filenames = {}
for cluster_idx in kmeans_cluster_filenames.index:
    cluster_to_filenames[cluster_idx] = kmeans_cluster_filenames.loc[cluster_idx, 'filename']

total = 0
for cluster_idx, filenames in cluster_to_filenames.items():
    print(cluster_idx, len(filenames))
    total += len(filenames)
print(total)

0 128
1 234
2 222
3 90
4 113
5 252
6 48
7 87
8 242
9 6
1422


Before: 1422 --> after: 1368

Checking out cluster 6 and 9's images from `___/components.html`, 
we see that these images are actually taken by mobile camera, not digitally, so they'll be removed:

<img src='./project_media/academicDigital should be academicPhotos.png' width=1000 />

In [151]:
filepaths_to_remove = cluster_to_filenames[6] + cluster_to_filenames[9]
len(filepaths_to_remove)

54

In [152]:
removed_filepaths = remove_imgs(cluster_to_filenames[6] + cluster_to_filenames[9])

In [153]:
imgs_metadata_v3_acdig = update_metadata_df(imgs_metadata_v3_ratio_texts, removed_filepaths, have_prefix=False, full_path=True)
imgs_metadata_v3_acdig[imgs_metadata_v3_acdig['class'] == '82. academicDigital'].shape

(1368, 37)

## TODO: Filtering `fSocialMedia` Dataset

### By Ratio

In [None]:
class_path = joinPaths(dataDir, '30. fSocialMedia/')

prefix_to_filename_with_prefisx = prefix_imgs_by_score(class_path, get_image_ratio, asc=False)

In [None]:
# removed_filenames are the unwanted images that you manually removed from `dir`
removed_filenames = removing_imgs_prefix(class_path, prefix_to_filename_with_prefix)
removed_filenames

['0000000_academicDigital0000045.jpg',
 '0000001_academicDigital0000416.jpg',
 '0000002_academicDigital0001023.jpg',
 '0000003_academicDigital0000641.jpg',
 '0000004_academicDigital0000071.jpg',
 '0000005_academicDigital0000073.jpg',
 '0000006_academicDigital0000082.jpg',
 '0000007_academicDigital0000106.jpg',
 '0000008_academicDigital0000081.jpg',
 '0000009_academicDigital0000084.jpg',
 '0000010_academicDigital0000673.jpg',
 '0000011_academicDigital0000367.jpg',
 '0000012_academicDigital0000368.jpg',
 '0000013_academicDigital0000143.jpg',
 '0000014_academicDigital0000046.jpg',
 '0000015_academicDigital0000792.jpg',
 '0000016_academicDigital0001010.jpg',
 '0000017_academicDigital0001516.jpg',
 '0000018_academicDigital0001358.jpg',
 '0000019_academicDigital0000313.jpg',
 '0000020_academicDigital0000154.jpg',
 '0000021_academicDigital0001196.jpg',
 '0000022_academicDigital0000720.jpg',
 '0000023_academicDigital0001586.jpg',
 '0000024_academicDigital0000356.jpg',
 '0000025_academicDigital

In [None]:
imgs_metadata_v3_ratio_2 = update_metadata_df(imgs_metadata_v3_acdig, removed_filenames)
imgs_metadata_v3_ratio_2[imgs_metadata_v3_ratio_2['class'] == '82. academicDigital'].shape

(1626, 37)

Before: 1979 --> after: 1626

sample of images with high ratio removed (total: 293):

<img src='./project_media/.png' width=1100 />


sample of images with low ratio removed (total: 60):

<img src='./project_media/.png' width=1100 />

### By #words

In [None]:
prefix_to_filename_with_prefix = prefix_imgs_by_score(class_path, num_detected_words, asc=False)

In [None]:
# removed_filenames are the unwanted images that you manually removed from `dir`
removed_filenames = removing_imgs_prefix(class_path, prefix_to_filename_with_prefix)
removed_filenames

['0000729_academicDigital0001444.jpg',
 '0000883_academicDigital0001478.jpg',
 '0000925_academicDigital0000582.jpg',
 '0000927_academicDigital0000978.jpg',
 '0000947_academicDigital0000583.jpg',
 '0000959_academicDigital0000469.jpg',
 '0000979_academicDigital0000563.jpg',
 '0001013_academicDigital0001107.jpg',
 '0001030_academicDigital0001060.jpg',
 '0001044_academicDigital0000809.jpg',
 '0001179_academicDigital0001325.jpg',
 '0001186_academicDigital0000670.jpg',
 '0001332_academicDigital0000472.jpg',
 '0001344_academicDigital0000980.jpg',
 '0001345_academicDigital0001106.jpg',
 '0001423_academicDigital0000119.jpg',
 '0001424_academicDigital0000744.jpg',
 '0001425_academicDigital0000868.jpg',
 '0001427_academicDigital0000910.jpg',
 '0001428_academicDigital0000913.jpg',
 '0001429_academicDigital0000991.jpg',
 '0001434_academicDigital0000095.jpg',
 '0001437_academicDigital0000537.jpg',
 '0001439_academicDigital0000912.jpg',
 '0001442_academicDigital0001189.jpg',
 '0001446_academicDigital

In [None]:
imgs_metadata_v3_ratio_texts_2 = update_metadata_df(imgs_metadata_v3_ratio_2, removed_filenames)
imgs_metadata_v3_ratio_texts_2[imgs_metadata_v3_ratio_texts_2['class'] == '30. fSocialMedia'].shape

(1422, 37)

Before: 1626 --> after: 1422

sample of images with few words (total: 204):

<img src='./project_media/.png' width=1100 />


### By `fastdup` Functions

In [None]:
work_dir = './dataset_related/fastdup_work_dir_fSocialMedia'
input_dir = joinPaths(dataDir, '30. fSocialMedia/')

In [None]:
fd = fastdup.create(work_dir, input_dir)
fd.run()

FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.

 ########################################################################################

Dataset Analysis Summary: 

    Dataset contains 1422 images
    Valid images are 100.00% (1,422) of the data, invalid are 0.00% (0) of the data
    Similarity:  20.82% (296) belong to 19 similarity clusters (components).
    79.18% (1,126) images do not belong to any similarity cluster.
    Largest cluster has 554 (38.96%) images.
    For a detailed analysis, use `.connected_components()`
(similarity threshold used is 0.9, connected component threshold used is 0.96).

    Outliers: 5.84% (83) of images are possible outliers, and fall in the bottom 5.00% of similarity values.
    For a detailed list of outliers, use `.outliers()`.


In [None]:
fastdup.run_kmeans(input_dir, work_dir, num_clusters=10, nearest_neighbors_k=1, high_accuracy=True)

FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.


0

In [None]:
fastdup.create_kmeans_clusters_gallery(work_dir, save_path='./dataset_related/fastdup_fSocialMedia_kmeans')

100%|██████████| 10/10 [00:07<00:00,  1.34it/s]


Finished OK. Components are stored as image files ./dataset_related/fastdup_academicDigital_kmeans/components_[index].jpg
Stored kmeans clusters visual view in  ./dataset_related/fastdup_academicDigital_kmeans\components.html
Execution time in seconds 8.9


0

In [None]:
kmeans_df = pd.read_csv(joinPaths(work_dir, 'kmeans_assignments.csv'))

In [None]:
kmeans_cluster_filenames = kmeans_df.groupby('cluster').agg({'filename': list, 'distance': max})
cluster_to_filenames = {}
for cluster_idx in kmeans_cluster_filenames.index:
    cluster_to_filenames[cluster_idx] = kmeans_cluster_filenames.loc[cluster_idx, 'filename']

total = 0
for cluster_idx, filenames in cluster_to_filenames.items():
    print(cluster_idx, len(filenames))
    total += len(filenames)
print(total)

0 128
1 234
2 222
3 90
4 113
5 252
6 48
7 87
8 242
9 6
1422


Before: 1422 --> after: 1368

Checking out cluster 6 and 9's images from `___/components.html`, 
we see that these images are actually taken by mobile camera, not digitally, so they'll be removed:

<img src='./project_media/.png' width=1000 />

In [None]:
filepaths_to_remove = cluster_to_filenames[6] + cluster_to_filenames[9]
len(filepaths_to_remove)

54

In [None]:
removed_filepaths = remove_imgs(cluster_to_filenames[6] + cluster_to_filenames[9])

In [None]:
imgs_metadata_v3 = update_metadata_df(imgs_metadata_v3_ratio_texts_2, removed_filepaths, have_prefix=False, full_path=True)
imgs_metadata_v3[imgs_metadata_v3['class'] == '30. fSocialMedia'].shape

(1368, 37)

## TODO: Saving New `ImgProps` & `train/val/test` CSVs