<div>
    <img src="https://i.imgur.com/kQdrSYV.png">
    </div>
    
<center><h1>Introduction 📝</h1></center>

> 🎯Goal: To build a model that predicts which items are the same products
>
> Matching postings of the same product can be done automatically with the help of machine learning, and that is the goal of this competition. The data is provided by **Shopee**, the leading e-commerce platform in Southeast Asia and Taiwan.

<center><h1>Diving into the Data 🤿 </h1></center>

> **train/test.csv** - Each row contains the data for a single posting. 
> 
> - posting_id : the ID code for the posting
> - image : the image id/md5sum
> - image_phash : a perceptual hash of the image
> - title : the product description for the posting
> - label_group : ID code for all postings that map to the same product. Not provided for the test set
> - matches : **space-delimited** list of all posting IDs that match a particular posting. 
> 
> 📌Posts always self-match. 
> 
> 📌**Group sizes were capped at 50**, so we need not predict more than 50 matches for a posting.

<center><h2>Evaluation metric:</h2> <b><h4>F1-score 🧪</h4></b> </center>

> The evaluation metric for this competition is F1-Score or F-Score.
> 
> <img src="https://www.gstatic.com/education/formulas2/355397047/en/f1_score.svg">
> 
>  The F1-score is the harmonic mean of precision and recall, so it balances the two.
>  <img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d37e557b5bfc8de22afa8aad1c187a357ac81bdb">
>  <img src="https://miro.medium.com/max/560/1*AEV3TE67ahMn3NVpU0ov4g.png" height=10>
>  
>  where-
>  - TP = True Positive
>  - FP = False Positive
>  - TN = True Negative
>  - FN = False Negative

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
import cv2
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from colorama import Fore,Back,Style
!pip install wordcloud
from wordcloud import WordCloud,STOPWORDS


In [None]:
# Short aliases for the colorama foreground colours used in print calls.
y_, r_, g_, b_, m_ = (
    Fore.YELLOW,
    Fore.RED,
    Fore.GREEN,
    Fore.BLUE,
    Fore.MAGENTA,
)

In [None]:
# Load the competition's train and test posting tables.
Tr_df = pd.read_csv("../input/shopee-product-matching/train.csv")
Te_df = pd.read_csv("../input/shopee-product-matching/test.csv")

In [None]:
Tr_df.head()

In [None]:
Te_df.head()

In [None]:
Te_df

In [None]:
# Directories that hold the train and test image files.
Tr_jpg ='../input/shopee-product-matching/train_images'
Te_jpg = '../input/shopee-product-matching/test_images'

In [None]:
def getImagePaths(path):
    """Return the full path of every file found under `path`, recursively.

    Paths appear in the order `os.walk` yields them. The list is empty when
    no files are found.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(path)
        for name in names
    ]

In [None]:
# Collect the file paths of all train and test images.
Tr_img_path = getImagePaths(Tr_jpg)
Te_img_path = getImagePaths(Te_jpg)

In [None]:
# Report how many train and test image files were found, colour-coded with
# the colorama aliases (label in yellow, count in green).
print(f"{y_}Number of train images :  {g_} {len(Tr_img_path)}\n")
print(f"{y_}Number of test images :  {g_} {len(Te_img_path)}\n")

In [None]:
def getShape(images_paths):
    """Report whether every image in `images_paths` has the same shape.

    The first image defines the reference shape and every following image is
    compared against it. The earlier version returned from inside the first
    loop iteration, so only the first image was ever inspected. Note that
    this now reads every file until a mismatch is found.

    Parameters
    ----------
    images_paths : sequence of str
        Paths of the image files to inspect.

    Returns
    -------
    str
        'Same Image shape : <shape>' when all images agree,
        'Different Images shape : ' as soon as a mismatch is found, or
        'No images found' when `images_paths` is empty.

    Raises
    ------
    ValueError
        If a file cannot be decoded (`cv2.imread` returned None).
    """
    if not images_paths:
        return 'No images found'

    shape = None
    for image_path in images_paths:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f'Could not read image: {image_path}')
        if shape is None:
            # First image: remember its shape as the reference.
            shape = image.shape
        elif image.shape != shape:
            return 'Different Images shape : '
    return 'Same Image shape : ' + str(shape)

In [None]:
getShape(Tr_img_path)

In [None]:
getShape(Te_img_path)

<center><h3>Displaying Images </h3></center>

In [None]:
def display_multi_img(images_paths,rows,cols):
    """Show the images at `images_paths` on a `rows` x `cols` grid.

    Images fill the grid row by row. Images beyond `rows * cols` are
    ignored, and grid cells without an image are left as empty axes.
    Files that cannot be decoded are skipped, leaving their cell empty.

    Parameters
    ----------
    images_paths : iterable of str
        Paths of the image files to display.
    rows, cols : int
        Grid dimensions passed to `plt.subplots`.
    """
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 or single-row
    # grid, so it can always be flattened.
    figure, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(17, 9), squeeze=False)

    # zip stops at the shorter input, so surplus images are dropped without
    # relying on a caught IndexError.
    for cell, image_path in zip(axes.ravel(), images_paths):
        image = cv2.imread(image_path)
        if image is None:
            # Unreadable file: keep going with the remaining images.
            continue
        cell.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        cell.set_axis_off()
    plt.tight_layout()
    plt.show()


In [None]:
display_multi_img(Tr_img_path[0:25],5,5)

In [None]:
display_multi_img(Te_img_path,1,3)

In [None]:
def styling():
    """Remove the frame and the tick marks from the current axes.

    Hides every spine of `plt.gca()` and clears the x and y ticks.
    """
    for spine in plt.gca().spines.values():
        spine.set_visible(False)
    # Clearing the ticks once is enough; it used to be repeated per spine.
    plt.xticks([])
    plt.yticks([])

In [None]:
def hist(image_path):
    """Plot an image next to the histograms of its colour channels.

    Draws five panels on one row: the image itself, one 64-bin histogram
    for each of the red, green and blue channels, and a 128-bin histogram
    over all channel values together.

    Parameters
    ----------
    image_path : str
        Path of the image file to read with `cv2.imread`.

    Raises
    ------
    ValueError
        If the file cannot be decoded (`cv2.imread` returned None).
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f'Could not read image: {image_path}')
    # OpenCV loads BGR; convert so channel 0/1/2 is red/green/blue.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(16, 4))
    plt.subplot(1, 5, 1)
    plt.imshow(img)
    styling()

    custom_colors = ["#ef233d",'#76dd71','#26677f','#aec3d0']
    labels = ['Red Channel','Green Channel','Blue Channel','Total']

    # Panels 2-4: one histogram per colour channel. The loop used to stop
    # after the green channel, so the blue channel was never plotted.
    for channel in range(3):
        plt.subplot(1, 5, channel + 2)
        plt.hist(img[:, :, channel].reshape(-1), bins=64,
                 color=custom_colors[channel], alpha=0.6)
        plt.xlabel(labels[channel], fontsize=10)
        styling()

    # Panel 5: all channel values pooled together.
    plt.subplot(1, 5, 5)
    plt.hist(img.reshape(-1), bins=128, color=custom_colors[3], alpha=0.6)
    plt.xlabel(labels[3], fontsize=10)
    styling()
    plt.show()

In [None]:
def display_hist(images_paths, max_images=6):
    """Draw the channel histograms (via `hist`) for the first few images.

    Parameters
    ----------
    images_paths : iterable of str
        Paths of the image files to plot.
    max_images : int, default 6
        Upper bound on the number of images plotted. The default matches
        the limit that was previously hard-coded.
    """
    from itertools import islice

    # islice stops after `max_images` items instead of walking the whole
    # input only to skip the remainder.
    for image_path in islice(images_paths, max_images):
        hist(image_path)

In [None]:
display_hist(Tr_img_path[5:10])

In [None]:
display_hist(Te_img_path)

In [None]:
Tr_df['label_group'].nunique()

In [None]:
# Size of every label group, then the groups tied for the largest and the
# smallest size.
tr_labels_count = Tr_df['label_group'].value_counts()
most_freq = tr_labels_count[tr_labels_count == tr_labels_count.max()]
less_freq = tr_labels_count[tr_labels_count == tr_labels_count.min()]

# The index of a value_counts result already holds the unique group ids, so
# sorting it gives the same array as re-scanning the rows with np.unique.
m_label = np.sort(most_freq.index.values)
l_label = np.sort(less_freq.index.values)

print(f"{m_}Most Frequent label group : ",m_label)
# This line used to be labelled "Most Frequent" as well.
print(f"{y_}Least Frequent label group : ",l_label)


In [None]:
def path(group,m):
    """Return the train-image paths of all postings in `Tr_df` matching `group`.

    Parameters
    ----------
    group : int or str
        Value to look up: compared with the `label_group` column when
        `m == 'l'`, or with the `title` column when `m == 't'`.
    m : {'l', 't'}
        Selects which column `group` is compared against.

    Returns
    -------
    list of str
        The train image directory joined with the `image` value of every
        matching row; empty when nothing matches.

    Raises
    ------
    ValueError
        If `m` is neither 'l' nor 't'. This used to surface later as an
        UnboundLocalError.
    """
    PATH = "../input/shopee-product-matching/train_images"

    columns = {'l': 'label_group', 't': 'title'}
    if m not in columns:
        raise ValueError(f"m must be 'l' or 't', got {m!r}")

    z = Tr_df['image'][Tr_df[columns[m]] == group].values
    return [os.path.join(PATH, filename) for filename in z]

In [None]:
display_multi_img(path(159351600,'l'),3,3)

In [None]:
display_multi_img(path(994676122,'l'),3,3)

In [None]:
display_multi_img(path(562358068,'l'),3,3)

In [None]:
display_multi_img(path(1141798720,'l'),3,3)

In [None]:
display_multi_img(path( 3113678103,'l'),3,3)

In [None]:
display_multi_img(path( 3627744656,'l'),3,3)

In [None]:
display_multi_img(path( 4293276364,'l'),1,2)

In [None]:
display_multi_img(path( 4292939171,'l'),1,2)

In [None]:
display_multi_img(path( 4292154092,'l'),1,2)

In [None]:
Tr_df.shape

In [None]:
Te_df.shape

In [None]:
Tr_df['title'].nunique()

In [None]:
t = Tr_df['title'].value_counts().sort_values(ascending=False).reset_index()
t.columns = ['title','count']
t

In [None]:
display_multi_img(path("Koko syubbanul muslimin koko azzahir koko baju",'t'),3,3)

In [None]:
display_multi_img(path("Viva Air Mawar",'t'),3,2)

In [None]:
display_multi_img(path("Emina Glossy Stain",'t'),3,2)

In [None]:
display_multi_img(path("Baju Koko Pria Gus Azmi Syubbanul Muslimin Kombinasi Hadroh Azzahir Hilw HO187 KEMEJA KOKO PRIA BAJU",'t'), 4, 2)

In [None]:
display_multi_img(path("Monde Boromon Cookies 1 tahun+ 120gr",'t'), 2, 3)

In [None]:
def color_w(word=None,font_size=None,position=None,orientation =None, font_path=None,random_state=None):
    """Colour callback for WordCloud: fixed hue and saturation, random lightness.

    Only `random_state` is used; its `randint(30, 70)` picks the lightness
    percentage. The remaining parameters are accepted and ignored.

    Returns a string of the form 'hsl(40,100%,<lightness>%)'.
    """
    hue, saturation = 40, 100
    lightness = random_state.randint(30, 70)
    return 'hsl({},{}%,{}%)'.format(hue, saturation, lightness)
# Word cloud of all training titles, coloured by `color_w`.
plt.subplots(figsize=(10,10))
cloud_options = dict(
    stopwords=STOPWORDS,
    background_color='white',
    contour_width=2,
    contour_color='blue',
    color_func=color_w,
    max_words=100,
    max_font_size=256,
    random_state=42,
)
ww = WordCloud(**cloud_options)
all_titles = ' '.join(Tr_df['title'])
ww.generate(all_titles)
plt.imshow(ww,interpolation = 'bilinear')
plt.axis('off')
plt.show()

In [None]:
# Second word cloud of the training titles, using WordCloud's default colours.
# The stop-word set gets its own name: binding it to `stopwords` would shadow
# the `stopwords` module imported from nltk.corpus.
cloud_stopwords = set(STOPWORDS)
wordcloud = WordCloud(width = 800,
                      height = 800,
                      background_color ='white',
                      min_font_size = 10,
                      stopwords = cloud_stopwords,).generate(' '.join(Tr_df['title']))

# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

<center><h3>Plugging in RAPIDS </h3></center>
<img src ="https://i.imgur.com/qWulN0F.jpg" height = 40>

[Documentation](https://docs.rapids.ai/api/cuml/nightly/api.html#cuml.feature_extraction.text) 📖

In [None]:
# English stop words from the NLTK corpus.
# NOTE: this rebinds the `STOPWORDS` name imported from wordcloud earlier in
# the file; from here on it is a list of NLTK words.
STOPWORDS = nltk.corpus.stopwords.words('english')

# Punctuation and whitespace characters; presumably meant to be passed as the
# `filters` argument of preprocess_text (it is not referenced by default).
filters = [ '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/',  '\\', ':', ';', '<', '=', '>',
           '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '\t','\n',"'",",",'~' , '—']

def preprocess_text(input_strs , filters=None , stopwords=STOPWORDS):
    """Clean a string column for text matching.

    Steps, in order: replace every character in `filters` with a space,
    lower-case, replace stop-word tokens with a space, collapse runs of
    spaces and strip leading/trailing spaces.

    Parameters
    ----------
    input_strs : string Series
        Its `.str` accessor must provide `translate`, `lower`,
        `replace_tokens`, `normalize_spaces` and `strip` -- presumably a
        cuDF Series, since the last two are not pandas methods (confirm).
    filters : iterable of single-character str, optional
        Characters to replace with a space. With the default None no
        character is replaced (this used to raise TypeError).
    stopwords : list of str
        Tokens to remove. Defaults to the module-level STOPWORDS. The
        argument was previously ignored in favour of that global.

    Returns
    -------
    The cleaned string Series.
    """
    # filter punctuation and case conversion
    translation_table = {ord(char): ord(' ') for char in (filters or ())}
    input_strs = input_strs.str.translate(translation_table)
    input_strs = input_strs.str.lower()

    # remove stopwords (honour the caller-supplied list)
    input_strs = input_strs.str.replace_tokens(stopwords, ' ')

    # replace multiple spaces with single one and strip leading/trailing spaces
    input_strs = input_strs.str.normalize_spaces()
    input_strs = input_strs.str.strip(' ')

    return input_strs

def preprocess_text_df(df, text_cols=('title',), **kwargs):
    """Apply `preprocess_text` to each column of `df` listed in `text_cols`.

    `df` is modified in place and also returned. Extra keyword arguments are
    forwarded to `preprocess_text`. The default for `text_cols` is a tuple
    rather than a list so that it cannot be mutated between calls.
    """
    for col in text_cols:
        df[col] = preprocess_text(df[col], **kwargs)
    return df

In [None]:
# Draws five figures, one per k in 0..4, then prints the train titles.
# NOTE(review): the plotted data is np.arange(50), i.e. the same straight
# line every time, not a computed text distance. The titles and axis labels
# describe values this cell never calculates -- presumably a distance array
# was meant to be plotted here; confirm.
for k in range(5):
    plt.figure(figsize=(20,3))
    plt.plot(np.arange(50),'o-',color='#f48c06')
    plt.title('Text Distance From Train Row %i to Other Train Rows'%k,fontsize=15, fontweight='bold',horizontalalignment='center',fontfamily='serif')
    plt.ylabel('Distance to Train Row %i'%k,fontsize=13, fontweight='bold',fontfamily='serif')
    plt.xlabel('Index Sorted by Distance to Train Row %i'%k,fontsize=13, fontweight='bold',fontfamily='serif')
    plt.show()
    
    # Prints the whole title column on every iteration, not a per-k subset.
    print(Tr_df['title'])

In [None]:

# Draws five figures, one per k in 0..4, then prints the train label groups.
# NOTE(review): the plotted data is np.arange(50), not a computed distance,
# so the "Text Distance" titles and labels do not describe what is drawn --
# presumably a distance array was meant to be plotted here; confirm.
for k in range(5):
    plt.figure(figsize=(20,3))
    plt.plot(np.arange(50),'o-',color='#f48c06')
    plt.title('Text Distance From Train Row %i to Other Train Rows'%k,fontsize=15, fontweight='bold',horizontalalignment='center',fontfamily='serif')
    plt.ylabel('Distance to Train Row %i'%k,fontsize=13, fontweight='bold',fontfamily='serif')
    plt.xlabel('Index Sorted by Distance to Train Row %i'%k,fontsize=13, fontweight='bold',fontfamily='serif')
    plt.show()
    
    # Prints the whole label_group column on every iteration.
    print(Tr_df['label_group'])

In [None]:
# Draws five figures, one per k in 0..4, then prints the test titles.
# NOTE(review): the plotted data is np.arange(50), not a computed distance,
# and the titles and labels say "Train Row" although the column printed below
# comes from the test table -- presumably leftover placeholder code; confirm.
for k in range(5):
    plt.figure(figsize=(20,3))
    plt.plot(np.arange(50),'o-',color='#f48c06')
    plt.title('Text Distance From Train Row %i to Other Train Rows'%k,fontsize=15, fontweight='bold',horizontalalignment='center',fontfamily='serif')
    plt.ylabel('Distance to Train Row %i'%k,fontsize=13, fontweight='bold',fontfamily='serif')
    plt.xlabel('Index Sorted by Distance to Train Row %i'%k,fontsize=13, fontweight='bold',fontfamily='serif')
    plt.show()
    
    # Prints the whole test title column on every iteration.
    print(Te_df['title'])

In [None]:

# Draws five figures, one per k in 0..4, then prints the test posting ids.
# NOTE(review): the plotted data is np.arange(50), not a computed distance,
# and the titles and labels say "Train Row" although the column printed below
# comes from the test table -- presumably leftover placeholder code; confirm.
for k in range(5):
    plt.figure(figsize=(20,3))
    plt.plot(np.arange(50),'o-',color='#f48c06')
    plt.title('Text Distance From Train Row %i to Other Train Rows'%k,fontsize=15, fontweight='bold',horizontalalignment='center',fontfamily='serif')
    plt.ylabel('Distance to Train Row %i'%k,fontsize=13, fontweight='bold',fontfamily='serif')
    plt.xlabel('Index Sorted by Distance to Train Row %i'%k,fontsize=13, fontweight='bold',fontfamily='serif')
    plt.show()
    
    # Prints the whole posting_id column on every iteration.
    print(Te_df['posting_id'])

In [None]:
import seaborn as sns
# Bar chart of the 15 largest label groups by number of postings.
# (The `top10_*` names are kept as they are even though 15 groups are shown.)
label_counts = Tr_df['label_group'].value_counts()
top10_names = label_counts.index.tolist()[:15]
top10_values = label_counts.tolist()[:15]

plt.figure(figsize=(20, 10))
sns.barplot(x=top10_names, y=top10_values)
plt.xticks(rotation=45)
plt.xlabel("Label Group")
plt.ylabel("Image Count")
plt.title("Top-15 Label Groups by Image Count")
plt.show()

In [None]:
sample = pd.read_csv('../input/shopee-product-matching/sample_submission.csv')

In [None]:
sample

In [None]:
# Group the test postings by exact title: every posting in a group is
# predicted to match all postings of that group, itself included.
check = []  # distinct titles, in sorted order
a = []      # posting ids, ordered by title group
b = []      # space-delimited match string for the corresponding entry of `a`
# One pass over the groups replaces a full-table scan per title.
for item, ids in Te_df.groupby('title')['posting_id']:
    check.append(item)
    res = ids.tolist()
    # The ids must be separated by a space; they used to be concatenated
    # with an empty string, which fused them into a single token.
    ans = ' '.join(str(id_item) for id_item in res)
    a.extend(res)
    b.extend([ans] * len(res))

In [None]:
# Pair each posting id in `a` with its match string in `b`.
submission1 = pd.DataFrame({'posting_id': a, 'matches': b})
submission1

In [None]:
# Group the test postings by exact title: every posting in a group is
# predicted to match all postings of that group, itself included.
# NOTE(review): this repeats the title-based grouping already used for
# `submission1` -- presumably a second matching signal was intended for
# `submission2`; confirm.
check = []  # distinct titles, in sorted order
a = []      # posting ids, ordered by title group
b = []      # space-delimited match string for the corresponding entry of `a`
# One pass over the groups replaces a full-table scan per title.
for item, ids in Te_df.groupby('title')['posting_id']:
    check.append(item)
    res = ids.tolist()
    # The ids must be separated by a space; they used to be concatenated
    # with an empty string, which fused them into a single token.
    ans = ' '.join(str(id_item) for id_item in res)
    a.extend(res)
    b.extend([ans] * len(res))

In [None]:
# Pair each posting id in `a` with its match string in `b`.
submission2 = pd.DataFrame({'posting_id': a, 'matches': b})
submission2

In [None]:
# Combine both prediction tables per posting id.
sub = pd.merge(submission1,submission2,on='posting_id',how='inner')
# Join the two match strings with a space so the last id of the first list
# and the first id of the second list remain separate tokens.
sub['list' ] = sub['matches_x'] + ' ' + sub['matches_y']

In [None]:
sub

In [None]:
# De-duplicate the ids collected in `list` for every posting.
# - split() without an argument drops empty tokens caused by extra spaces.
# - dict.fromkeys removes duplicates while keeping first-seen order, so the
#   output is reproducible (set ordering of strings varies between runs).
final = [
    ' '.join(dict.fromkeys(match_list.split()))
    for match_list in sub['list']
]

submission = pd.DataFrame()
submission['posting_id'] = sub['posting_id']
submission['matches'] = final

In [None]:
# Write the predictions to submission.csv without the index column.
submission.to_csv('submission.csv',index=False)