In [1]:
import pandas as pd
import seaborn as sns
import os
import sys
import re
import concurrent.futures

from matplotlib import pyplot as plt
from dotenv import load_dotenv
from collections import defaultdict
from dataclasses import dataclass, asdict

sys.path.append("../")

from models.work_met import Work_met

# Load variables from a local .env file (if one exists) into the environment;
# the DPI lookup further down in this cell reads from it.
load_dotenv()
# NOTE(review): sns.color_palette() only *returns* a palette and the result is
# discarded here, so this line does not change the active palette.
# sns.set_palette('colorblind') would apply it -- confirm the intent (the style
# sheet applied below may also define its own color cycle).
sns.color_palette('colorblind')
# Apply the 'Solarize_Light2' matplotlib style sheet to every later figure.
plt.style.use('Solarize_Light2')

# Default DPI for figures: taken from the DPI environment variable (which can
# be provided through the .env file), falling back to 100 when the variable is
# missing, empty, not an integer, or not strictly positive.
try:
    pc_dpi = int(os.getenv('DPI', '100'))
except ValueError:
    # e.g. DPI="" or DPI="high": keep the notebook usable instead of crashing
    pc_dpi = 100
if pc_dpi <= 0:
    pc_dpi = 100


In [2]:
def get_images(dir: str, exclusion_list: list | None = None):
    """Extracts  the 'work names' of the meteorites based on the pictures loaded."""
    images = [file for file in os.listdir(dir) if str(file).endswith(".jpg")]
    if exclusion_list is not None:
        for excluded in exclusion_list:
            images.remove(excluded)

    return images


def get_work_names(image_list) -> dict:
    """Groups image file names by the meteorite 'work name' parsed from each of them.

    The work name is the part of the file name located before the first
    '.', '-', '_' or '+'. When that part ends with a digit followed by two or
    more letters, those trailing letters are dropped; a single trailing letter
    (or none) is kept as is.

    Returns a plain dict mapping each work name to the list of its image names,
    in the order they appear in image_list.
    """
    grouped = {}
    for file_name in image_list:
        # Keep only what precedes the first separator (extension, -, _ or +)
        stem = file_name
        for separator in (".", "-", "_", "+"):
            stem = stem.partition(separator)[0]

        # A digit followed by at most one letter at the end is already a valid work name
        ends_cleanly = re.search(r'\d[a-zA-Z]{0,1}$', stem) is not None
        if not ends_cleanly:
            # Otherwise strip every letter trailing the last digit
            stem = re.sub(r'(\d)[a-zA-Z]*$', r'\1', stem)

        grouped.setdefault(stem, []).append(file_name)

    return grouped



In [3]:
# List the .jpg files of ../imgs/ (despite its name, images_paths holds file
# names as returned by os.listdir, not full paths).
images_paths = get_images(dir="../imgs/")

print(f"{len(images_paths)} images detected")


407 images detected


In [4]:
# Group the image file names by the work name parsed from each file name:
# one dict key per detected meteorite.
work_names = get_work_names(image_list=images_paths)

print(f"{len(work_names)} meteorites detected")


142 meteorites detected


In [5]:
# One Work_met per detected work name, carrying the image files grouped under it.
work_met_list = [
    Work_met(work_name=name, images=files)
    for name, files in work_names.items()
]


In [6]:
def get_types_threads(work_met_list, n_threads = 25):
    """
    Uses multithreading to perform concurrent requests via the method Work_met.request_type()

    Args:
        work_met_list: iterable of Work_met objects; request_type() is called once on each.
        n_threads: number of worker threads, clamped to the range [1, 25].

    Raises:
        TypeError: if any element of work_met_list is not a Work_met instance.

    Returns None. Exceptions raised by individual requests are printed (with the
    work name concerned) and do not stop the remaining requests.
    """
    max_threads = 25
    if n_threads > max_threads:
        print(f"Thread limit of {n_threads} > 25, reducing it to 25 for fair use of the app")
        n_threads = max_threads
    elif n_threads < 1:
        # ThreadPoolExecutor raises ValueError when max_workers <= 0
        n_threads = 1

    # Materialize once so a generator is not exhausted by the validation loop
    work_mets = list(work_met_list)

    for work_met in work_mets:
        if not isinstance(work_met, Work_met):
            raise TypeError(f"At least one of the objects ({work_met}) is not of the class Work_met")

    with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # Remember which object each future belongs to, so failures can be attributed
        future_to_met = {executor.submit(work_met.request_type): work_met for work_met in work_mets}

        # Hopefully no error but this should be useful for debug
        for future in concurrent.futures.as_completed(future_to_met):
            try:
                future.result()
            except Exception as e:
                failed = future_to_met[future]
                print(f"Error occurred for {getattr(failed, 'work_name', failed)}: {e}")


In [7]:
# Calls request_type() on every Work_met using 25 worker threads.
# presumably request_type() stores its result on the object itself (the next
# cells read work_met.mtype) -- confirm in models/work_met.py
get_types_threads(work_met_list=work_met_list, n_threads=25)


In [8]:
# Work names whose type could not be retrieved: the mtype attribute is either
# missing altogether or set to None.
extract_failures = [
    met.work_name
    for met in work_met_list
    if getattr(met, "mtype", None) is None
]


In [9]:
# Number of meteorites for which no type was retrieved
print(len(extract_failures))


21


In [10]:
@dataclass
class Work_met_data:
    """Plain record of the Work_met fields that are exported to the DataFrame."""
    work_name: str  # work name parsed from the image file names
    mtype: str | None  # None when no type was retrieved for this work name
    images: list[str]  # image file names grouped under this work name

# Copy the three exported fields of every Work_met into a plain dataclass record.
data_work_met = [
    Work_met_data(work_name=met.work_name, mtype=met.mtype, images=met.images)
    for met in work_met_list
]


In [11]:
# asdict() turns each record into a dict whose keys become the DataFrame columns.
df_work_met = pd.DataFrame(data=list(map(asdict, data_work_met)))


## Identified unique types :

In [12]:
# Number of meteorites per retrieved type; dropna=False keeps the missing
# types (None) as a row of their own.
display(df_work_met["mtype"].value_counts(dropna=False))


mtype
H5       45
H6       22
None     21
L6       21
LL5      13
H5-6      7
LL6       5
L5        2
L4        1
L3.8      1
CV3       1
LL4       1
L5-6      1
LL5-6     1
Name: count, dtype: int64

21 missing types (`None`) are expected: this matches `len(extract_failures)`. These have to be fixed either by correcting the request or by assigning the type manually. <br>
The remaining types need to be reduced into common classes, or excluded, to avoid a lack of support in classification.

In [13]:
# Rows for which no type was retrieved (mtype is missing).
display(df_work_met[df_work_met["mtype"].isna()])


Unnamed: 0,work_name,mtype,images
1,G015,,[G015.jpg]
36,GB39,,[GB39.jpg]
37,GC002,,"[GC002-2.jpg, GC002-3.jpg, GC002.jpg]"
38,GC004,,"[GC004-2.jpg, GC004-3.jpg, GC004-4.jpg, GC004...."
39,GC005,,[GC005.jpg]
40,GC006,,"[GC006-T6-2.jpg, GC006-T6-3.jpg, GC006-T6-4.jp..."
41,GC007,,[GC007-T6.jpg]
42,GC010,,[GC010.jpg]
43,GC014b,,"[GC014b-T5-2.jpg, GC014b-T5.jpg, GC014b_Odd-PL..."
44,GC014c,,[GC014c_T5.jpg]


### Manual search :
- Missing from the website: G015, GB39, GC: [002, 004, 005, 006, 007, 010, 014 (/, b, c), 015, 019, 021, 024, 029 (/, T5), 067], MA294, MC176, RM0833

# Saving Dataset :

In [14]:
# Persist the dataset twice: CSV for quick inspection, pickle for reloading in Python.
# NOTE: the CSV stores each `images` list as its string representation; the
# pickle keeps them as real Python lists.
os.makedirs("../data", exist_ok=True)  # to_csv/to_pickle do not create missing directories
df_work_met.to_csv(path_or_buf="../data/work_met_img_type_1.csv", index=False)
df_work_met.to_pickle(path="../data/work_met_img_type_1.pkl")
