# What does the Data look like? 🗃
So, the data provided to us in this competition consists of 3 .csv files and 2 image folders (train_images and test_images).

Below is the breakdown of the .csv files and image folders:

* **📄 train.csv** - This is the Training set metadata. Each row contains the data for a single posting. Multiple postings might have the exact same image ID, but with different titles or vice versa.
* **📄 test.csv** - Same as train.csv but without the label_group column. This is the file we will use at inference time. It currently contains only 3 samples, but it will be replaced by a larger private test set at submission time.
* **📄 sample_submission.csv** - The sample submission file, in the format we are expected to follow.
* **📂 train_images/** - Folder with all the training images.
* **📂 test_images/** - Folder with all the testing images (again, only 3 images for now, matching the 3 samples in test.csv, but there will be ~70,000 images during submission).

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
import glob
import random
from wordcloud import WordCloud, STOPWORDS


In [None]:
# Load the training metadata and preview its first five rows.
train_file = pd.read_csv(
    filepath_or_buffer="../input/shopee-product-matching/train.csv",
)
train_file.head(n=5)

In [None]:
# Load the test metadata and preview its first five rows.
test_file = pd.read_csv(
    filepath_or_buffer="../input/shopee-product-matching/test.csv",
)
test_file.head(n=5)

In [None]:
# Collect every .jpg path in the train and test image folders,
# then report how many files each folder holds.
total_train_files, total_test_files = (
    glob.glob("../input/shopee-product-matching/train_images/*.jpg"),
    glob.glob("../input/shopee-product-matching/test_images/*.jpg"),
)

print(
    f"Total Training Images: {len(total_train_files)}",
    f"Total Testing Images: {len(total_test_files)}",
    sep="\n",
)

In [None]:
# Print the number of distinct values in each column as "<column>:<count>".
for col in train_file.columns:
    print(col, len(train_file[col].unique()), sep=":")

In [None]:
# Print a concise summary of the training DataFrame via pandas' DataFrame.info().
train_file.info()

In [None]:
# Per-column distinct-value counts, rendered as a single-column table
# shaded with the "plasma" colour map.
(
    train_file.nunique()
    .to_frame(name="Unique Values")
    .style.background_gradient(cmap="plasma")
)

In [None]:
# Count the missing (NaN/None) entries in every column of the training data.
train_file.isna().sum()

In [None]:
# Label groups ranked by number of images (top 15).
# value_counts() already sorts by descending count, so compute it once and
# keep the first 15 rows.
label_counts = train_file['label_group'].value_counts().head(15)
# NOTE: the "top10_" names are kept for compatibility with any later cell
# that may reference them; they actually hold the top 15 entries.
top10_names = label_counts.index.tolist()
top10_values = label_counts.tolist()

plt.figure(figsize=(20, 10))
# label_group IDs are numeric, and seaborn sorts numeric category levels
# ascending by default. Passing `order` keeps the bars in descending-count
# order so the chart really shows a ranking.
sns.barplot(x=top10_names, y=top10_values, order=top10_names)
plt.xticks(rotation=45)
plt.xlabel("Label Group")
plt.ylabel("Image Count")
plt.title("Top-15 Label Groups by Image Count")
plt.show()

In [None]:
# Build a word cloud from all product titles joined into one string,
# ignoring the stop words shipped with the wordcloud package.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    width=1000,
    height=500,
    background_color='white',
    min_font_size=10,
    stopwords=stopwords,
)
wordcloud = wordcloud.generate(' '.join(train_file['title']))

# Render the word cloud without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
def plot(num):
    """Show `num` randomly chosen training images in a square grid.

    Each image is titled with its file name.

    Args:
        num: Number of images to display. Must be a positive perfect square
            (1, 4, 9, 16, ...) so that the images exactly fill a square grid.

    Raises:
        ValueError: If `num` is not a positive perfect square, or if the
            image folder contains fewer than `num` files.
    """
    IMG_PATHS = "../input/shopee-product-matching/train_images/"

    # Validate with an explicit raise: `assert` is stripped under `python -O`.
    sq_num = int(round(float(np.sqrt(num)))) if num > 0 else 0
    if sq_num < 1 or sq_num * sq_num != num:
        raise ValueError("Number of Images must be a perfect Square!")

    # Draw only the images we need instead of shuffling the whole listing.
    chosen = random.sample(os.listdir(IMG_PATHS), sq_num * sq_num)

    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid, where
    # matplotlib would otherwise return a single Axes object.
    _fig, ax = plt.subplots(
        nrows=sq_num, ncols=sq_num, figsize=(10, 10), squeeze=False
    )

    # ax.flat walks the grid row by row, matching index i * sq_num + j.
    for cell, image_id in zip(ax.flat, chosen):
        cell.axis('off')
        cell.set_title(f'{image_id}', fontsize=6.5)
        img = cv2.imread(os.path.join(IMG_PATHS, image_id))
        if img is None:
            # cv2.imread returns None for unreadable files; leave the cell
            # blank (title only) rather than crash on the slice below.
            continue
        # OpenCV loads images as BGR; reverse the channel axis for matplotlib.
        cell.imshow(img[:, :, ::-1])

    plt.show()
    
    
def plot_from_label(group):
    """Display every training image that belongs to one label group.

    The images are laid out row by row in a near-square grid that is large
    enough to hold all of them; unused grid cells are left blank.

    Args:
        group: Value of the `label_group` column to select rows by.

    Raises:
        ValueError: If no row of `train_file` has this label group.
    """
    IMG_PATHS = "../input/shopee-product-matching/train_images/"
    matches = train_file[train_file['label_group'] == group]
    image_list = matches['image'].tolist()
    num = len(image_list)
    if num == 0:
        raise ValueError(f"No images found for label group {group!r}")

    # Round the grid size up so no image is dropped when `num` is not a
    # perfect square (truncating would silently hide the remainder).
    ncols = int(np.ceil(np.sqrt(num)))
    nrows = int(np.ceil(num / ncols))

    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid.
    _fig, ax = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(10, 10), squeeze=False
    )

    # Hide every axis up front so unused cells are blank too.
    for cell in ax.flat:
        cell.axis('off')

    for cell, image_name in zip(ax.flat, image_list):
        img = cv2.imread(os.path.join(IMG_PATHS, image_name))
        if img is None:
            # cv2.imread returns None for unreadable files; skip the cell.
            continue
        # OpenCV loads images as BGR; reverse the channel axis for matplotlib.
        cell.imshow(img[:, :, ::-1])

    plt.show()

def plot_from_title(title):
    """Display every training image whose posting title equals `title`.

    The images are laid out row by row in a near-square grid that is large
    enough to hold all of them; unused grid cells are left blank. The figure
    is captioned with the product title.

    Args:
        title: Exact value of the `title` column to select rows by.

    Raises:
        ValueError: If no row of `train_file` has this title.
    """
    IMG_PATHS = "../input/shopee-product-matching/train_images/"
    matches = train_file[train_file['title'] == title]
    image_list = matches['image'].tolist()
    num = len(image_list)
    if num == 0:
        raise ValueError(f"No images found for title {title!r}")

    # Round the grid size up so no image is dropped when `num` is not a
    # perfect square (truncating would silently hide the remainder).
    ncols = int(np.ceil(np.sqrt(num)))
    nrows = int(np.ceil(num / ncols))

    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid.
    fig, ax = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(10, 10), squeeze=False
    )
    fig.suptitle(f"Product Name: {title}")

    # Hide every axis up front so unused cells are blank too.
    for cell in ax.flat:
        cell.axis('off')

    for cell, image_name in zip(ax.flat, image_list):
        img = cv2.imread(os.path.join(IMG_PATHS, image_name))
        if img is None:
            # cv2.imread returns None for unreadable files; skip the cell.
            continue
        # OpenCV loads images as BGR; reverse the channel axis for matplotlib.
        cell.imshow(img[:, :, ::-1])

    plt.show()

In [None]:
plot(16)

In [None]:
plot_from_label(994676122)

In [None]:
plot_from_title("Koko syubbanul muslimin koko azzahir koko baju")

In [None]:
plot_from_title("Monde Boromon Cookies 1 tahun+ 120gr")