In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Constants
Set of variables I use throughout the notebook. Not sure if these cause submissions to fail.

In [None]:
# Seed shared by every random operation in the notebook.
RANDOM_STATE = 42
# Dataset root on Kaggle, and the folder of training images beneath it.
SHOPEE_ROOT = "/kaggle/input/shopee-product-matching"
SHOPEE_TRAIN_IMAGES = "/".join((SHOPEE_ROOT, "train_images"))


In [None]:
# Fix NumPy's global random state so the shuffles performed later in the notebook repeat across runs.
np.random.seed(seed=RANDOM_STATE)


# Shopee Dataset EDA
Doing some exploratory data analysis. I will make some hypotheses and then generate data to either prove or disprove each hypothesis.

## Data Cleaning
It's not a good idea to just straightaway use the data set to train and test a model.
Datasets can be dirty:

* They could contain duplicate entries
* They could contain entries that are invalid

So I will look for and remove any two entries that have the same `posting_id`

In [None]:
# Read train.csv from the dataset root into a DataFrame.
train_csv_path = "/".join((SHOPEE_ROOT, "train.csv"))
shopee_train_df = pd.read_csv(train_csv_path)

In [None]:
# (rows, columns) as loaded, before any de-duplication; the baseline for the checks below.
shopee_train_df.shape

In [None]:
# Discard rows whose posting_id repeats one seen earlier in the frame.
shopee_train_df = shopee_train_df.drop_duplicates(subset=["posting_id"])

In [None]:
# (rows, columns) after removing duplicate posting_id rows; compare with the shape as loaded.
shopee_train_df.shape

Nothing dropped

Any entries that share the same `(image, image_phash, title)` tuple can be considered redundant for the purposes of creating a training set (?)

In [None]:
# Discard rows that repeat an (image, image_phash, title) combination seen earlier in the frame.
redundancy_columns = ["image", "image_phash", "title"]
shopee_train_df = shopee_train_df.drop_duplicates(subset=redundancy_columns)

In [None]:
# (rows, columns) after removing rows that repeat an (image, image_phash, title) combination.
shopee_train_df.shape

Nothing dropped yet. The data doesn't have redundant entries ☺️

In [None]:
# Display the de-duplicated training frame for a quick visual inspection.
shopee_train_df

How many unique `label_group` exist in `train.csv`?

In [None]:
# Count the distinct label_group values that remain in the frame and report the total.
unique_label_groups = shopee_train_df["label_group"].unique()
print("Number unique label_groups = {}".format(len(unique_label_groups)))

How many images are there per label group?

In [None]:
# Build two lookups keyed by label_group:
#   hash_of_label_group_images    -> the rows of shopee_train_df carrying that label
#   hash_of_image_count_per_label -> how many rows that is
# A single groupby pass visits each label once, instead of re-filtering the whole
# frame for every row (which is quadratic in the number of rows).
# sort=False yields the groups in order of first appearance, so the dict key order
# is the same as when the labels are visited row by row; that order decides ties
# when the labels are sorted by count afterwards.
hash_of_label_group_images = {}
hash_of_image_count_per_label = {}

for mylabel, label_group_rows in shopee_train_df.groupby("label_group", sort=False):
    hash_of_label_group_images[mylabel] = label_group_rows
    hash_of_image_count_per_label[mylabel] = len(label_group_rows)
    #print("for label_group = {} number of images is {}".format(mylabel, len(label_group_rows)))
    

I leave the `print` line inside the loop commented out. You can uncomment it if you want to see the very verbose printout.

Which are the 10 most populous and 10 least populous `label_groups`?

In [None]:
# Order the label_groups (the dict keys) from fewest to most images, using each
# group's image count (the dict value) as the sort key.
sorted_label_groups = sorted(
    hash_of_image_count_per_label,
    key=lambda label: hash_of_image_count_per_label[label],
)

In [None]:
# Report the 10 label groups with the fewest images and start the list of
# image counts that will be plotted.
label_group_image_counts = []
for my_label in sorted_label_groups[:10]:
    image_count = hash_of_image_count_per_label[my_label]
    label_group_image_counts.append(image_count)
    print("label {} has {} images".format(my_label, image_count))

In [None]:
#10 most populated (by distinct image count) label groups
# sorted_label_groups is in ascending order of image count, so the slice has to run
# through the very last element; stopping at index -1 would leave out the single
# largest label_group.
most_populated_label_groups = sorted_label_groups[-10:]
for my_label in most_populated_label_groups:
    print("label {} has {} images".format(my_label, hash_of_image_count_per_label[my_label]))

#plotting the whole data set is ugly
#will plot 20 label groups: the 10 with the lowest and the 10 with the highest number of images
selected_label_groups = sorted_label_groups[:10] + most_populated_label_groups
# Rebuild the counts from the selected labels themselves so that counts and labels
# always line up one-to-one, and so that re-running this cell cannot grow the list.
label_group_image_counts = [hash_of_image_count_per_label[my_label] for my_label in selected_label_groups]
# String versions of the labels, used as the bar labels in the chart.
label_groups_to_plot = [str(my_label) for my_label in selected_label_groups]
print(label_groups_to_plot)

In [None]:
# Horizontal bar chart: one bar per selected label_group, bar length = log10 of its image count.
fig, ax = plt.subplots(figsize=(40, 40))
ax.grid()
ax.barh(label_groups_to_plot, np.log10(label_group_image_counts))

# Axis labels, title and tick labels are enlarged to stay readable on the 40x40 canvas.
ax.set_ylabel("label_group", fontsize=30)
ax.set_xlabel("log(number_of_images_per_label_group)", fontsize=30)
ax.set_title("The 10 Most and Least Populated label_groups and their Distinct Image Counts", fontsize=35)
ax.tick_params(axis="both", labelsize=20)

plt.show()

There is a little over an order of magnitude difference in the number of images in the largest `label_group` when compared to the smallest `label_group`

Now I'm going to try to validate or better understand the data set by making some hypotheses and then examining the data set to prove/disprove them. These are not 100% conclusive findings, but I feel they give me an OK idea of the nature of the data set. If I am lucky I may find mislabeled data, which will be helpful to know before I start doing long training and cross-validation runs. I'm open to suggestions on more comprehensive ways to validate a data set. I imagine that for data sets where the relationship between features follows some rule, a checker can be coded up to read each row. In this case I can't think of a rule-based checker, so I'm hoping to luck out by randomly picking rows that have certain relationships and hoping to _see_ images that don't fit my expectation.

Hypothesis: All images belonging to the same `label_group` are of the same object

In [None]:
#first just pick a random set of label_groups
# np.array builds a new array from the sorted list, so the in-place shuffle below
# leaves sorted_label_groups itself untouched.
label_groups = np.array(sorted_label_groups)
# Draws from NumPy's global random state, which was seeded with RANDOM_STATE earlier.
np.random.shuffle(label_groups)
# The first five entries of the shuffled array form the random sample.
label_group_subset = label_groups[:5]
print(label_group_subset)

In [None]:
# Show up to two images (with their titles) for each randomly chosen label_group,
# one label_group per row, to eyeball whether the images in a group show the same object.
images_per_group = 2
# One row per sampled label_group, so the grid follows the sample size instead of a
# separate hard-coded 5. squeeze=False keeps `ax` two-dimensional even for one row.
fig, ax = plt.subplots(nrows=len(label_group_subset),ncols=images_per_group,figsize=(30,30),squeeze=False)
for i, mylabel in enumerate(label_group_subset):
    # Filter the frame once per label_group and reuse the rows for file names and titles.
    label_group_rows = shopee_train_df[ shopee_train_df["label_group"] == mylabel ].head(images_per_group)

    for j, (image_name, image_title) in enumerate(zip(label_group_rows["image"], label_group_rows["title"])):
        img = plt.imread(SHOPEE_TRAIN_IMAGES+"/"+image_name)
        ax[i,j].set_title(str(image_title)+"\nlg: "+str(mylabel))
        ax[i,j].imshow(img)


**Finding**: Looks like the objects in the images that belong to the same `label_group` are the same. We can trust the labeling.

Hypothesis: All images with the same phash are of the same object

In [None]:
# Collect every distinct image_phash value present in the frame and report how many there are.
unique_phash = pd.unique(shopee_train_df["image_phash"])
phash_total = len(unique_phash)
print("number of unique_phashes is {}".format(phash_total))

In [None]:
# How many phashes to sample for the visual check.
NUM_OF_PHASHES = 10

# Shuffle a copy of the distinct phashes, then walk it in that random order and keep
# each phash carried by more than one row, stopping once enough have been collected.
unique_phash = np.array(unique_phash)
np.random.shuffle(unique_phash)
unique_phash_subset = []

for myphash in unique_phash:
    rows_with_phash = (shopee_train_df["image_phash"] == myphash).sum()
    #print("phash {} has {} images".format(myphash,rows_with_phash))
    if rows_with_phash > 1:
        unique_phash_subset.append(myphash)

    # The length of the collected list doubles as the counter of accepted phashes.
    if len(unique_phash_subset) >= NUM_OF_PHASHES:
        break

print("unique_phash_subset = {}".format(unique_phash_subset))

In [None]:
# One row per sampled phash, showing up to two of the images that share that phash.
# squeeze=False keeps `ax` two-dimensional even when NUM_OF_PHASHES is 1.
fig, ax = plt.subplots(nrows=NUM_OF_PHASHES,ncols=2,figsize=(20,80),squeeze=False)

for i, myphash in enumerate(unique_phash_subset):
    # Filter the frame once per phash and reuse the rows for file names and titles.
    phash_rows = shopee_train_df[shopee_train_df["image_phash"] == myphash].head(2)
    #print("number of images at phash = {} is {}".format(myphash,len(phash_rows)))

    for j, (myimage, mytitle) in enumerate(zip(phash_rows["image"], phash_rows["title"])):
        # str() on the title keeps the concatenation valid even for a non-string title value.
        ax[i,j].set_title(str(mytitle)+"\n phash: "+ str(myphash)+"\n image: "+str(myimage),fontsize=10)
        img = plt.imread(SHOPEE_TRAIN_IMAGES+"/"+myimage)
        ax[i,j].imshow(img)
        

**Findings**: Images that have the same phash are images of the same object. It seems that image files can have different `image` file names and yet be the same image (AFAICT). I notice that people submit the same thing but call it something different (the `title` values are different). I also notice that two images can differ slightly but still have the same phash (see the "MUKENA DEWASA" product images).
🤔

Hypothesis: Not all images that belong to the same `label_group` have identical `image_phash` values

In [None]:
# For each sampled label_group, show one pair of its images whose image_phash values differ.
# One row per sampled label_group; squeeze=False keeps `ax` two-dimensional for any sample size.
fig, ax = plt.subplots(nrows=len(label_group_subset),ncols=2,figsize=(40,40),squeeze=False)

for i, mylabel in enumerate(label_group_subset):
    # Filter the frame once per label_group and reuse the rows for every column needed.
    label_group_rows = shopee_train_df[ shopee_train_df["label_group"] == mylabel ]
    image_names = label_group_rows["image"]
    image_phashes = label_group_rows["image_phash"].tolist()
    image_titles = label_group_rows["title"]

    # Pick the pair before reading any image from disk. Scanning both positions from
    # the end selects the last differing (j, k) pair in forward order, which is the
    # pair left visible when every differing pair is drawn over the previous one.
    positions = range(len(image_phashes) - 1, -1, -1)
    differing_pair = next(
        ((j, k) for j in positions for k in positions if image_phashes[j] != image_phashes[k]),
        None,
    )
    if differing_pair is None:
        # Every image in this label_group has the same phash; leave this row empty.
        continue

    # Read and draw only the chosen pair: first image in column 0, second in column 1.
    for column, position in enumerate(differing_pair):
        img = plt.imread(SHOPEE_TRAIN_IMAGES + "/" + image_names.iloc[position])
        ax[i,column].set_title(str(image_titles.iloc[position])+"\nphash: "+str(image_phashes[position]) + "\nlg: "+str(mylabel))
        ax[i,column].imshow(img)

**Finding**: Yes there are many images whose `image_phash` are not identical yet these images belong to the same  `label_group`. So sometimes people do submit different pictures of the same product (presumably).

# Conclusions

* There are no duplicates (either by `posting_id` or by `(image, image_phash, title)`)
* There is a pretty large difference in the number of distinct images between the most and least populous `label_group`
* All `label_groups` have at least 2 images. I think this is good for training a CNN.
* Images that belong to the same `label_group` so far appear to be images of the same object. So currently there is no concern about mislabeled entries in `train.csv`. But this is not conclusive
* It seems there are copies of the same image in `train.csv` that have different `image` names.