Many thanks to Chris Deotte @cdeotte for his great notebooks!!

In this notebook, I investigate the effect that text preprocessing has on the F1 score (using only the title). I have tried:
1. Using the title without any preprocessing
2. Preprocessing by removing punctuation, numbers and special characters
3. Removing stopwords of the language used in the title
4. Removing stopwords and numbers/special characters

__Shopee is the leading e-commerce platform in Southeast Asia and Taiwan.__

# Competition Goal

__In this competition, you’ll apply your machine learning skills to build a model that predicts which items are the same products.__

# Evaluation Metric

__Submissions will be evaluated based on their mean F1 score.__

# Code Requirements

Submissions to this competition must be made through Notebooks. In order for the "Submit" button to be active after a commit, the following conditions must be met:

- CPU Notebook <= 9 hours run-time
- GPU Notebook <= 2 hours run-time
- Internet access disabled
- Freely & publicly available external data is allowed, including pre-trained models
- Submission file must be named "submission.csv"

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

import cudf, cuml, cupy
from textwrap import wrap

import gc

import itertools
import collections
from collections import Counter

import re
from wordcloud import WordCloud

import os
import datetime
import warnings
from time import time, strftime, gmtime

# Sanity check: list what the competition input folder contains.
print(os.listdir('/kaggle/input/shopee-product-matching/'))

# Wall-clock start; the last cell of the notebook uses it to print the total run time.
start = time()
print(str(datetime.datetime.now()))

# Hide all warnings for the rest of the notebook.
warnings.simplefilter('ignore')

In [None]:
# Root directory of the competition data; the CSV and image paths in the next cells are built from it.
base_dir = '/kaggle/input/shopee-product-matching/'

In [None]:
# Training metadata. The columns this notebook reads from it are `posting_id`, `image`,
# `image_phash`, `title` and `label_group`.
train = pd.read_csv(base_dir + 'train.csv')
print(train.shape)
train.head()

In [None]:
# Test metadata. Its row count is checked later: exactly 3 rows means we are running on the
# small sample test set rather than the one used for submission scoring.
test = pd.read_csv(base_dir + 'test.csv')
print(test.shape)
test.head()

In [None]:
# Sample submission, loaded to inspect the expected output format
# (`sub` is not used again in the cells below).
sub = pd.read_csv(base_dir + 'sample_submission.csv')
print(sub.shape)
sub.head()

In [None]:
# Count the files present in the train and test image folders.
print(f'Number of train images: {len(os.listdir(base_dir + "train_images/"))}')
print(f'Number of test images: {len(os.listdir(base_dir + "test_images/"))}')

In [None]:
# Full path of every image, built from the file name stored in the `image` column.
train['image_path'] = base_dir + 'train_images/' + train['image']
test['image_path'] = base_dir + 'test_images/' + test['image']

__Baseline F1 score using Image Phash provided__

In [None]:
# Ground truth: map each label_group to the array of unique posting_ids in that group,
# then attach that array to every row as `target`.
temp = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
train['target'] = train['label_group'].map(temp)
train.head(2)

In [None]:
def get_f1score(col):
    """Build a row-wise F1 scorer for the predictions stored in column `col`.

    The returned function takes one row (it is used with ``DataFrame.apply(..., axis=1)``),
    reads the ground truth from ``row.target`` and the predictions from ``row[col]``, and
    returns ``2 * |target ∩ predictions| / (|target| + |predictions|)``.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        n_common = len(np.intersect1d(truth, predicted))
        total = len(truth) + len(predicted)
        return 2 * n_common / total
    return f1score

In [None]:
# Baseline prediction for the local F1 score: every row is matched with all
# postings that share exactly the same `image_phash`.
temp = train.groupby('image_phash')['posting_id'].agg('unique').to_dict()
train['oof'] = train['image_phash'].map(temp)

In [None]:
# Row-wise F1 of the phash baseline against `target`, then the mean over all rows.
train['f1_base'] = train.apply(get_f1score('oof'), axis = 1)
print(f"Train F1 Score: {round(train['f1_base'].mean(), 3)}")

In [None]:
# For submission the test set is replaced with a bigger (70k) dataset.
# With only the 3 sample rows we work on train instead, so F1 can be measured locally.
if len(test) == 3:
    df = train
    img_dir = '../input/shopee-product-matching/train_images/'
else:
    df = test
    img_dir = '../input/shopee-product-matching/test_images/'

# Explicit copy: later cells add a `title_clean` column to df_text, and writing to a bare
# column selection of `df` is the chained-assignment pattern pandas warns about.
df_text = df[['title']].copy()
print(df.shape)

# Find similar images using Text (title) embeddings

# 1. F1 Metric using Clean Text
- Remove numbers, special characters and punctuation from the text

In [None]:
import string

def preprocess(x):
    """Clean a product title: lower-case, strip, and keep only the letters a-z.

    Args:
        x: The raw title. Non-string values (e.g. NaN for a missing title) are
            returned unchanged.

    Returns:
        The cleaned title, in which every character other than a-z has been replaced
        by a space (runs of spaces are not collapsed), or ``x`` itself when it is not
        a string.
    """
    # Explicit type check instead of a bare ``except`` that hid every error.
    if not isinstance(x, str):
        return x
    x = x.lower().strip()
    # Punctuation is replaced by spaces (not deleted) so "a-b" becomes two words.
    x = x.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # Digits and any remaining non a-z characters also become spaces.
    return re.sub(r'[^a-z]', ' ', x)

In [None]:
# Clean every title with the `preprocess` function defined just above.
df_text['title_clean'] = df_text['title'].apply(preprocess)
# Convert to a cuDF Series (to speed up the vectorizer step).
title_text = cudf.DataFrame.from_pandas(df_text)['title_clean']
title_text.shape

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned titles: English stop words removed, at most 25000 features.
# NOTE(review): binary=True presumably mirrors scikit-learn (term counts clipped to 0/1) — confirm in the cuML docs.
tfid = TfidfVectorizer(stop_words = 'english', binary = True, max_features = 25000)

# Dense matrix with one embedding row per title, used by the similarity search below.
text_embeddings = tfid.fit_transform(title_text).toarray()
print(f"Title Text Embeddings shape: {text_embeddings.shape}")

__Finding similar titles with Cosine Similarity__

In [None]:
%%time
preds = []
chunk_size = 4096   # number of query titles scored per matrix product
threshold = 0.75    # titles whose similarity exceeds this are predicted as matches
# Hoisted out of the loops: one array lookup instead of a DataFrame.iloc call per row.
posting_ids = df['posting_id'].values

# Integer chunk bounds come straight from range(); no float arange/ceil and int() casts.
for a in range(0, len(df), chunk_size):
    b = min(a + chunk_size, len(df))
    print('Processing chunk', a, 'to', b)
    # Rows of `sim` are the titles a..b-1 of this chunk, columns are all titles.
    sim = cupy.matmul(text_embeddings, text_embeddings[a: b].T).T
    for k in range(b - a):
        idx = cupy.where(sim[k] > threshold)[0]
        preds.append(posting_ids[cupy.asnumpy(idx)])

In [None]:
# Store, for every row, the array of posting_ids predicted as matches by experiment 1.
df['preds_txt1'] = preds
df.head(2)

In [None]:
# The ground-truth `target` column is only added to train. When `df` is the test set
# there are no labels, and scoring would fail inside f1score on `row.target`.
if 'target' in df.columns:
    print(f"F1 Score for Text  : {round(df.apply(get_f1score('preds_txt1'), axis = 1).mean(), 3)}")

# 2. F1 Metric using Clean Text with numbers/special characters kept
- Remove punctuation, convert to lower case and strip white space

In [None]:
import string

def preprocess(x):
    """Clean a product title: lower-case, strip, and replace punctuation by spaces.

    Numbers and non-ASCII characters are kept in this variant.

    Args:
        x: The raw title. Non-string values (e.g. NaN for a missing title) are
            returned unchanged.

    Returns:
        The cleaned title, or ``x`` itself when it is not a string.
    """
    # Explicit type check instead of a bare ``except`` that hid every error.
    if not isinstance(x, str):
        return x
    x = x.lower().strip()
    # Punctuation is replaced by spaces (not deleted) so "a-b" becomes two words.
    return x.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

In [None]:
# Clean every title with the `preprocess` function defined just above.
df_text['title_clean'] = df_text['title'].apply(preprocess)
# Convert to a cuDF Series (to speed up the vectorizer step).
title_text = cudf.DataFrame.from_pandas(df_text)['title_clean']
title_text.shape

In [None]:
# TF-IDF over the cleaned titles: English stop words removed, at most 25000 features.
tfid = TfidfVectorizer(stop_words = 'english', binary = True, max_features = 25000)

# Dense matrix with one embedding row per title, used by the similarity search below.
text_embeddings = tfid.fit_transform(title_text).toarray()
print(f"Title Text Embeddings shape: {text_embeddings.shape}")

In [None]:
%%time
preds = []
chunk_size = 4096   # number of query titles scored per matrix product
threshold = 0.75    # titles whose similarity exceeds this are predicted as matches
# Hoisted out of the loops: one array lookup instead of a DataFrame.iloc call per row.
posting_ids = df['posting_id'].values

# Integer chunk bounds come straight from range(); no float arange/ceil and int() casts.
for a in range(0, len(df), chunk_size):
    b = min(a + chunk_size, len(df))
    print('Processing chunk', a, 'to', b)
    # Rows of `sim` are the titles a..b-1 of this chunk, columns are all titles.
    sim = cupy.matmul(text_embeddings, text_embeddings[a: b].T).T
    for k in range(b - a):
        idx = cupy.where(sim[k] > threshold)[0]
        preds.append(posting_ids[cupy.asnumpy(idx)])

# Release the dense embedding matrix before the next experiment builds its own.
del text_embeddings
gc.collect()

In [None]:
# Store, for every row, the array of posting_ids predicted as matches by experiment 2.
df['preds_txt2'] = preds
# The ground-truth `target` column is only added to train. When `df` is the test set
# there are no labels, and scoring would fail inside f1score on `row.target`.
if 'target' in df.columns:
    print(f"F1 Score for Text  : {round(df.apply(get_f1score('preds_txt2'), axis = 1).mean(), 3)}")

# 3. F1 Metric using custom Stopwords with numbers/special characters kept
- Remove punctuation and Indonesian stopwords

In [None]:
# Indonesian stop-word list, taken from https://github.com/stopwords-iso/stopwords-id
# NOTE: the `preprocess` functions that use this list replace punctuation (including "-")
# with spaces before splitting the title into words, so hyphenated entries such as
# "berkali-kali" can never match a word there.
indo_stopwords = [
                  "ada","adalah","adanya","adapun","agak","agaknya","agar","akan","akankah","akhir","akhiri",
                  "akhirnya","aku","akulah","amat","amatlah","anda","andalah","antar","antara","antaranya","apa",
                  "apaan","apabila","apakah","apalagi","apatah","artinya","asal","asalkan","atas","atau","ataukah",
                  "ataupun","awal","awalnya","bagai","bagaikan","bagaimana","bagaimanakah","bagaimanapun","bagi",
                  "bagian","bahkan","bahwa","bahwasanya","baik","bakal","bakalan","balik","banyak","bapak","baru",
                  "bawah","beberapa","begini","beginian","beginikah","beginilah","begitu","begitukah","begitulah",
                  "begitupun","bekerja","belakang","belakangan","belum","belumlah","benar","benarkah","benarlah",
                  "berada","berakhir","berakhirlah","berakhirnya","berapa","berapakah","berapalah","berapapun",
                  "berarti","berawal","berbagai","berdatangan","beri","berikan","berikut","berikutnya","berjumlah",
                  "berkali-kali","berkata","berkehendak","berkeinginan","berkenaan","berlainan","berlalu",
                  "berlangsung","berlebihan","bermacam","bermacam-macam","bermaksud","bermula","bersama",
                  "bersama-sama","bersiap","bersiap-siap","bertanya","bertanya-tanya","berturut","berturut-turut",
                  "bertutur","berujar","berupa","besar","betul","betulkah","biasa","biasanya","bila","bilakah",
                  "bisa","bisakah","boleh","bolehkah","bolehlah","buat","bukan","bukankah","bukanlah","bukannya",
                  "bulan","bung","cara","caranya","cukup","cukupkah","cukuplah","cuma","dahulu","dalam","dan","dapat",
                  "dari","daripada","datang","dekat","demi","demikian","demikianlah","dengan","depan","di","dia",
                  "diakhiri","diakhirinya","dialah","diantara","diantaranya","diberi","diberikan","diberikannya",
                  "dibuat","dibuatnya","didapat","didatangkan","digunakan","diibaratkan","diibaratkannya","diingat",
                  "diingatkan","diinginkan","dijawab","dijelaskan","dijelaskannya","dikarenakan","dikatakan",
                  "dikatakannya","dikerjakan","diketahui","diketahuinya","dikira","dilakukan","dilalui","dilihat",
                  "dimaksud","dimaksudkan","dimaksudkannya","dimaksudnya","diminta","dimintai","dimisalkan","dimulai",
                  "dimulailah","dimulainya","dimungkinkan","dini","dipastikan","diperbuat","diperbuatnya",
                  "dipergunakan","diperkirakan","diperlihatkan","diperlukan","diperlukannya","dipersoalkan",
                  "dipertanyakan","dipunyai","diri","dirinya","disampaikan","disebut","disebutkan","disebutkannya",
                  "disini","disinilah","ditambahkan","ditandaskan","ditanya","ditanyai","ditanyakan","ditegaskan",
                  "ditujukan","ditunjuk","ditunjuki","ditunjukkan","ditunjukkannya","ditunjuknya","dituturkan",
                  "dituturkannya","diucapkan","diucapkannya","diungkapkan","dong","dua","dulu","empat","enggak",
                  "enggaknya","entah","entahlah","guna","gunakan","hal","hampir","hanya","hanyalah","hari","harus",
                  "haruslah","harusnya","hendak","hendaklah","hendaknya","hingga","ia","ialah","ibarat","ibaratkan",
                  "ibaratnya","ibu","ikut","ingat","ingat-ingat","ingin","inginkah","inginkan","ini","inikah","inilah",
                  "itu","itukah","itulah","jadi","jadilah","jadinya","jangan","jangankan","janganlah","jauh","jawab",
                  "jawaban","jawabnya","jelas","jelaskan","jelaslah","jelasnya","jika","jikalau","juga","jumlah",
                  "jumlahnya","justru","kala","kalau","kalaulah","kalaupun","kalian","kami","kamilah","kamu",
                  "kamulah","kan","kapan","kapankah","kapanpun","karena","karenanya","kasus","kata","katakan",
                  "katakanlah","katanya","ke","keadaan","kebetulan","kecil","kedua","keduanya","keinginan",
                  "kelamaan","kelihatan","kelihatannya","kelima","keluar","kembali","kemudian","kemungkinan",
                  "kemungkinannya","kenapa","kepada","kepadanya","kesampaian","keseluruhan","keseluruhannya",
                  "keterlaluan","ketika","khususnya","kini","kinilah","kira","kira-kira","kiranya","kita","kitalah",
                  "kok","kurang","lagi","lagian","lah","lain","lainnya","lalu","lama","lamanya","lanjut","lanjutnya",
                  "lebih","lewat","lima","luar","macam","maka","makanya","makin","malah","malahan","mampu","mampukah",
                  "mana","manakala","manalagi","masa","masalah","masalahnya","masih","masihkah","masing",
                  "masing-masing","mau","maupun","melainkan","melakukan","melalui","melihat","melihatnya","memang",
                  "memastikan","memberi","memberikan","membuat","memerlukan","memihak","meminta","memintakan",
                  "memisalkan","memperbuat","mempergunakan","memperkirakan","memperlihatkan","mempersiapkan",
                  "mempersoalkan","mempertanyakan","mempunyai","memulai","memungkinkan","menaiki","menambahkan",
                  "menandaskan","menanti","menanti-nanti","menantikan","menanya","menanyai","menanyakan","mendapat",
                  "mendapatkan","mendatang","mendatangi","mendatangkan","menegaskan","mengakhiri","mengapa",
                  "mengatakan","mengatakannya","mengenai","mengerjakan","mengetahui","menggunakan","menghendaki",
                  "mengibaratkan","mengibaratkannya","mengingat","mengingatkan","menginginkan","mengira","mengucapkan",
                  "mengucapkannya","mengungkapkan","menjadi","menjawab","menjelaskan","menuju","menunjuk","menunjuki",
                  "menunjukkan","menunjuknya","menurut","menuturkan","menyampaikan","menyangkut","menyatakan",
                  "menyebutkan","menyeluruh","menyiapkan","merasa","mereka","merekalah","merupakan","meski","meskipun",
                  "meyakini","meyakinkan","minta","mirip","misal","misalkan","misalnya","mula","mulai","mulailah",
                  "mulanya","mungkin","mungkinkah","nah","naik","namun","nanti","nantinya","nyaris","nyatanya","oleh",
                  "olehnya","pada","padahal","padanya","pak","paling","panjang","pantas","para","pasti","pastilah",
                  "penting","pentingnya","per","percuma","perlu","perlukah","perlunya","pernah","persoalan","pertama",
                  "pertama-tama","pertanyaan","pertanyakan","pihak","pihaknya","pukul","pula","pun","punya","rasa",
                  "rasanya","rata","rupanya","saat","saatnya","saja","sajalah","saling","sama","sama-sama",
                  "sambil","sampai","sampai-sampai","sampaikan","sana","sangat","sangatlah","satu","saya",
                  "sayalah","se","sebab","sebabnya","sebagai","sebagaimana","sebagainya","sebagian","sebaik",
                  "sebaik-baiknya","sebaiknya","sebaliknya","sebanyak","sebegini","sebegitu","sebelum",
                  "sebelumnya","sebenarnya","seberapa","sebesar","sebetulnya","sebisanya","sebuah","sebut",
                  "sebutlah","sebutnya","secara","secukupnya","sedang","sedangkan","sedemikian","sedikit",
                  "sedikitnya","seenaknya","segala","segalanya","segera","seharusnya","sehingga","seingat",
                  "sejak","sejauh","sejenak","sejumlah","sekadar","sekadarnya","sekali","sekali-kali","sekalian",
                  "sekaligus","sekalipun","sekarang","sekecil","seketika","sekiranya","sekitar","sekitarnya",
                  "sekurang-kurangnya","sekurangnya","sela","selagi","selain","selaku","selalu","selama",
                  "selama-lamanya","selamanya","selanjutnya","seluruh","seluruhnya","semacam","semakin",
                  "semampu","semampunya","semasa","semasih","semata","semata-mata","semaunya","sementara",
                  "semisal","semisalnya","sempat","semua","semuanya","semula","sendiri","sendirian","sendirinya",
                  "seolah","seolah-olah","seorang","sepanjang","sepantasnya","sepantasnyalah","seperlunya",
                  "seperti","sepertinya","sepihak","sering","seringnya","serta","serupa","sesaat","sesama",
                  "sesampai","sesegera","sesekali","seseorang","sesuatu","sesuatunya","sesudah","sesudahnya",
                  "setelah","setempat","setengah","seterusnya","setiap","setiba","setibanya","setidak-tidaknya",
                  "setidaknya","setinggi","seusai","sewaktu","siap","siapa","siapakah","siapapun","sini","sinilah",
                  "soal","soalnya","suatu","sudah","sudahkah","sudahlah","supaya","tadi","tadinya","tahu","tahun",
                  "tak","tambah","tambahnya","tampak","tampaknya","tandas","tandasnya","tanpa","tanya","tanyakan",
                  "tanyanya","tapi","tegas","tegasnya","telah","tempat","tengah","tentang","tentu","tentulah",
                  "tentunya","tepat","terakhir","terasa","terbanyak","terdahulu","terdapat","terdiri","terhadap",
                  "terhadapnya","teringat","teringat-ingat","terjadi","terjadilah","terjadinya","terkira",
                  "terlalu","terlebih","terlihat","termasuk","ternyata","tersampaikan","tersebut","tersebutlah",
                  "tertentu","tertuju","terus","terutama","tetap","tetapi","tiap","tiba","tiba-tiba","tidak",
                  "tidakkah","tidaklah","tiga","tinggi","toh","tunjuk","turut","tutur","tuturnya","ucap","ucapnya",
                  "ujar","ujarnya","umum","umumnya","ungkap","ungkapnya","untuk","usah","usai","waduh","wah","wahai",
                  "waktu","waktunya","walau","walaupun","wong","yaitu","yakin","yakni","yang"
                 ]

In [None]:
def preprocess(x):
    """Clean a product title: lower-case, strip, drop punctuation and Indonesian stop words.

    Numbers and non-ASCII characters are kept in this variant.

    Args:
        x: The raw title. Non-string values (e.g. NaN for a missing title) are
            returned unchanged.

    Returns:
        The remaining words joined by single spaces, or ``x`` itself when it is
        not a string.
    """
    # Explicit type check instead of a bare ``except`` that hid every error
    # (including a NameError if `indo_stopwords` had not been defined yet).
    if not isinstance(x, str):
        return x
    x = x.lower().strip()
    # Punctuation is replaced by spaces (not deleted) so "a-b" becomes two words.
    x = x.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # split() with no argument also collapses runs of whitespace.
    return ' '.join(word for word in x.split() if word not in indo_stopwords)

In [None]:
# Clean every title with the `preprocess` function defined just above.
df_text['title_clean'] = df_text['title'].apply(preprocess)
# Convert to a cuDF Series (to speed up the vectorizer step).
title_text = cudf.DataFrame.from_pandas(df_text)['title_clean']
title_text.shape

In [None]:
# TF-IDF over the cleaned titles: English stop words removed, at most 25000 features.
tfid = TfidfVectorizer(stop_words = 'english', binary = True, max_features = 25000)

# Dense matrix with one embedding row per title, used by the similarity search below.
text_embeddings = tfid.fit_transform(title_text).toarray()
print(f"Title Text Embeddings shape: {text_embeddings.shape}")

In [None]:
%%time
preds = []
chunk_size = 4096   # number of query titles scored per matrix product
threshold = 0.75    # titles whose similarity exceeds this are predicted as matches
# Hoisted out of the loops: one array lookup instead of a DataFrame.iloc call per row.
posting_ids = df['posting_id'].values

# Integer chunk bounds come straight from range(); no float arange/ceil and int() casts.
for a in range(0, len(df), chunk_size):
    b = min(a + chunk_size, len(df))
    print('Processing chunk', a, 'to', b)
    # Rows of `sim` are the titles a..b-1 of this chunk, columns are all titles.
    sim = cupy.matmul(text_embeddings, text_embeddings[a: b].T).T
    for k in range(b - a):
        idx = cupy.where(sim[k] > threshold)[0]
        preds.append(posting_ids[cupy.asnumpy(idx)])

# Release the dense embedding matrix before the next experiment builds its own.
del text_embeddings
gc.collect()

In [None]:
# Store, for every row, the array of posting_ids predicted as matches by experiment 3.
df['preds_txt3'] = preds
# The ground-truth `target` column is only added to train. When `df` is the test set
# there are no labels, and scoring would fail inside f1score on `row.target`.
if 'target' in df.columns:
    print(f"F1 Score for Text  : {round(df.apply(get_f1score('preds_txt3'), axis = 1).mean(), 3)}")

# 4. F1 Metric using custom Stopwords
- Remove punctuation, numbers, special characters and Indonesian stopwords

In [None]:
def preprocess(x):
    """Clean a product title: drop punctuation and Indonesian stop words, keep only a-z.

    Args:
        x: The raw title. Non-string values (e.g. NaN for a missing title) are
            returned unchanged.

    Returns:
        The cleaned title, in which every character other than a-z (including the
        spaces between words) has been replaced by a space, or ``x`` itself when
        it is not a string.
    """
    # Explicit type check instead of a bare ``except`` that hid every error
    # (including a NameError if `indo_stopwords` had not been defined yet).
    if not isinstance(x, str):
        return x
    x = x.lower().strip()
    # Punctuation is replaced by spaces (not deleted) so "a-b" becomes two words.
    x = x.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # Stop words are removed before the digits, so tokens like "2pcs" are compared as-is.
    x = ' '.join(word for word in x.split() if word not in indo_stopwords)
    # Digits and any remaining non a-z characters become spaces.
    return re.sub(r'[^a-z]', ' ', x)

In [None]:
# Clean every title with the `preprocess` function defined just above.
df_text['title_clean'] = df_text['title'].apply(preprocess)
# Convert to a cuDF Series (to speed up the vectorizer step).
title_text = cudf.DataFrame.from_pandas(df_text)['title_clean']
title_text.shape

In [None]:
# TF-IDF over the cleaned titles: English stop words removed, at most 25000 features.
tfid = TfidfVectorizer(stop_words = 'english', binary = True, max_features = 25000)

# Dense matrix with one embedding row per title.
text_embeddings = tfid.fit_transform(title_text).toarray()
print(f"Title Text Embeddings shape: {text_embeddings.shape}")

preds = []
chunk_size = 4096   # number of query titles scored per matrix product
threshold = 0.75    # titles whose similarity exceeds this are predicted as matches
# Hoisted out of the loops: one array lookup instead of a DataFrame.iloc call per row.
posting_ids = df['posting_id'].values

# Integer chunk bounds come straight from range(); no float arange/ceil and int() casts.
for a in range(0, len(df), chunk_size):
    b = min(a + chunk_size, len(df))
    print('Processing chunk', a, 'to', b)
    # Rows of `sim` are the titles a..b-1 of this chunk, columns are all titles.
    sim = cupy.matmul(text_embeddings, text_embeddings[a: b].T).T
    for k in range(b - a):
        idx = cupy.where(sim[k] > threshold)[0]
        preds.append(posting_ids[cupy.asnumpy(idx)])

# Release the dense embedding matrix before the next experiment builds its own.
del text_embeddings
gc.collect()

In [None]:
# Store, for every row, the array of posting_ids predicted as matches by experiment 4.
df['preds_txt4'] = preds
# The ground-truth `target` column is only added to train. When `df` is the test set
# there are no labels, and scoring would fail inside f1score on `row.target`.
if 'target' in df.columns:
    print(f"F1 Score for Text  : {round(df.apply(get_f1score('preds_txt4'), axis = 1).mean(), 3)}")

# 5. F1 Metric title
- Use the raw title without any preprocessing

In [None]:
# Convert to cuDF to speed up; this experiment uses the raw `title` column, not `title_clean`.
title_text = cudf.DataFrame.from_pandas(df_text)['title']
title_text.shape

In [None]:
# TF-IDF over the raw titles: English stop words removed, at most 25000 features.
tfid = TfidfVectorizer(stop_words = 'english', binary = True, max_features = 25000)

# Dense matrix with one embedding row per title.
text_embeddings = tfid.fit_transform(title_text).toarray()
print(f"Title Text Embeddings shape: {text_embeddings.shape}")

preds = []
chunk_size = 4096   # number of query titles scored per matrix product
threshold = 0.75    # titles whose similarity exceeds this are predicted as matches
# Hoisted out of the loops: one array lookup instead of a DataFrame.iloc call per row.
posting_ids = df['posting_id'].values

# Integer chunk bounds come straight from range(); no float arange/ceil and int() casts.
for a in range(0, len(df), chunk_size):
    b = min(a + chunk_size, len(df))
    print('Processing chunk', a, 'to', b)
    # Rows of `sim` are the titles a..b-1 of this chunk, columns are all titles.
    sim = cupy.matmul(text_embeddings, text_embeddings[a: b].T).T
    for k in range(b - a):
        idx = cupy.where(sim[k] > threshold)[0]
        preds.append(posting_ids[cupy.asnumpy(idx)])

# Release the dense embedding matrix now that the last experiment is done.
del text_embeddings
gc.collect()

In [None]:
# Store, for every row, the array of posting_ids predicted as matches by experiment 5.
df['preds_txt5'] = preds
# The ground-truth `target` column is only added to train. When `df` is the test set
# there are no labels, and scoring would fail inside f1score on `row.target`.
if 'target' in df.columns:
    print(f"F1 Score for Text  : {round(df.apply(get_f1score('preds_txt5'), axis = 1).mean(), 3)}")

# Conclusion
- Removing numbers and special characters results in a higher F1 score (title only)
- Using custom stop words has a small effect

In [None]:
# Total run time since `start` was recorded in the first code cell, printed as HH:MM:SS.
# Note: the %H field wraps around for runs of 24 hours or longer.
finish = time()
print(strftime("%H:%M:%S", gmtime(finish - start)))