In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys

from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Add the parent directory to the import path (presumably so that project-local
# modules can be imported from this notebook).
sys.path.append("../")

# Load variables from a .env file, if one is found, into the process environment.
load_dotenv()

# Apply the matplotlib style sheet first, then install the seaborn palette.
# `sns.color_palette(...)` only *returns* a palette; called as a bare statement
# it has no effect. `sns.set_palette(...)` is the call that makes it the default
# colour cycle, and running it after `plt.style.use` prevents the style sheet
# from overriding it.
plt.style.use('Solarize_Light2')
sns.set_palette('colorblind')

# Default figure DPI: taken from the DPI environment variable when it holds a
# usable positive integer, otherwise DEFAULT_DPI.
DEFAULT_DPI = 100

try:
    pc_dpi = int(os.getenv('DPI'))
except (TypeError, ValueError):
    # TypeError: DPI is not set (int(None)).
    # ValueError: DPI is set but is not an integer string (e.g. "" or "high").
    pc_dpi = DEFAULT_DPI

# A zero or negative DPI is not a usable resolution; fall back to the default.
if pc_dpi <= 0:
    pc_dpi = DEFAULT_DPI


In [2]:
# Pickled work/type/images table used as input for this notebook.
# NOTE: pickle files must only be loaded from trusted sources.
mtypes_path = "../data/work_met_img_type_1.pkl"

df = pd.read_pickle(mtypes_path)


In [3]:
# Number of rows per `mtype`, keeping missing labels visible in the tally.
df.loc[:, "mtype"].value_counts(dropna=False)


mtype
H5       84
L6       42
H6       34
LL5      21
None     18
H5-6     10
LL6       8
L5        7
LL5-6     1
L5-6      1
CV3       1
L4        1
L3.8      1
LL4       1
Name: count, dtype: int64

# Purpose :
- Remove the classes whose support is too low for the model to learn anything meaningful from them.

In [4]:
# Rows whose `mtype` is still missing and therefore needs a manual label.
missing_type = df["mtype"].isna()
df.loc[missing_type]


Unnamed: 0,work_name,mtype,images
57,GC029,,"[GC029_T5-2.jpg, GC029_T5.jpg, GC029-T5.jpg]"
71,GC020,,[GC020-TROILITE.jpg]
81,GC234,,"[GC234-3.jpg, GC234-.jpg, GC234.jpg, GC234-chr..."
87,GC015,,"[GC015-3.jpg, GC015-2.jpg]"
88,GC023,,"[GC023-odd-CHONDRULES.jpg, GC023.jpg, GC023-2...."
90,GC021,,"[GC021_T5.jpg, GC021_T5-2.jpg]"
108,GC067,,"[GC067_T5-2.jpg, GC067_T5.jpg]"
124,GC014b,,"[GC014b-T5-2.jpg, GC014b_Odd-PLAGIO-rim.jpg, G..."
125,G209,,[G209.jpg]
133,GC019,,"[GC019.jpg, GC019-T5.jpg]"


## Manually inputting the correct type :

- RMd : GB39, GC002, GC005, GC004, MB154, GC014c, MA294, GC026
- MV : GC0942 to GC094-2

---------

- H5 : GC020, GC234, GC023, MC176, GC010, G209, GC014b, GC021
- H6 : GC024
- LL : (none)
- L5 : GC029, GC019
- L6 : GC015, MV064b, MV064a, G015

- H5-6 : RM0833, RM0182

In [5]:
# Work names whose type was determined by hand, grouped by the type to assign.
mtype_h5 = [
    "GC020",
    "GC234",
    "GC023",
    "MC176",
    "GC010",
    "G209",
    "GC014b",
    "GC021",
]
mtype_h6 = [
    "GC024",
]
mtype_l5 = [
    "GC029",
    "GC019",
]
mtype_l6 = [
    "GC015",
    "MV064b",
    "MV064a",
    "G015",
]
mtype_h56 = [
    "RM0833",
    "RM0182",
]


In [7]:
# Write the manually determined type into `mtype` for every listed work name.
# A single data-driven pass with a vectorised `isin` mask replaces five
# copy-pasted per-name loops. Groups are applied in the same order as before,
# so a name that appeared in two lists would still end up with the later label.
manual_mtypes = {
    "H5": mtype_h5,
    "H6": mtype_h6,
    "L5": mtype_l5,
    "L6": mtype_l6,
    "H5-6": mtype_h56,
}

for mtype_label, work_names in manual_mtypes.items():
    df.loc[df["work_name"].isin(work_names), "mtype"] = mtype_label


In [8]:
# Same tally as before the manual labelling, to check how many labels are still missing.
df.loc[:, "mtype"].value_counts(dropna=False)


mtype
H5       92
L6       46
H6       35
LL5      21
H5-6     12
L5        9
LL6       8
LL5-6     1
L5-6      1
CV3       1
L4        1
None      1
L3.8      1
LL4       1
Name: count, dtype: int64

In [16]:
def get_image_count(df: pd.DataFrame) -> pd.Series:
    """Count how many images each ``mtype`` class has.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``mtype`` column and an ``images`` column holding a
        list-like of image names per row.

    Returns
    -------
    pd.Series
        Number of images per ``mtype``, sorted by decreasing count. Rows with a
        missing ``mtype`` are left out, and a class without any image does not
        appear in the result.
    """
    # One row per image. `explode` turns an empty list into a single NaN row.
    df_exploded = df.explode("images")

    # Drop those NaN placeholders so that a work without images is not counted
    # as if it had one image.
    df_exploded = df_exploded.dropna(subset=["images"])

    return df_exploded["mtype"].value_counts()


In [17]:
# Images available per class (a single row can contribute several images).
get_image_count(df=df)


mtype
H5       254
L6       164
H6       127
LL5       50
H5-6      32
LL6       30
L5        27
L5-6       4
L4         4
LL5-6      3
L3.8       3
CV3        2
LL4        2
Name: count, dtype: int64

## Conclusion : 
- We can remove the classes with very low support (fewer than 27 images, i.e. every class below L5), run augmentations on the classes that have fewer than 100 images, augment the H6 and L6 classes only slightly, and leave H5 alone. The goal is a more uniform dataset with enough data in the minority classes for the model to generalize.
- Let's save the updated dataset :

In [18]:
# Persist the relabelled dataset in two formats. The pickle keeps the Python
# objects as they are (e.g. the lists in `images`); the CSV is a flat text
# export in which such values are written as their string representation.
# `index=False` is the documented way to leave out the row index: `index` is a
# bool, and `None` only worked because it is falsy.
df.to_csv(path_or_buf="../data/work_met_img_type_2.csv", index=False)
df.to_pickle(path="../data/work_met_img_type_2.pkl")
