# About this Notebook
This is a first run through the competition to try to understand the dataset and get a feel for the problem at hand.

In [None]:
# Basic
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import os
import random
from tqdm.autonotebook import tqdm
import string
from collections import Counter
import re

# Visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

# NLP
import spacy
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

# Data Description

train/test.csv - the training set metadata. Each row contains the data for a single posting. Multiple postings might have the exact same image ID, but with different titles or vice versa.
* `posting_id` - the ID code for the posting.
* `image` - the image id/md5sum.
* `image_phash` - a perceptual hash of the image.
* `title` - the product description for the posting.
* `label_group` - ID code for all postings that map to the same product. Not provided for the test set.

train/test images - the images associated with the postings.

sample_submission.csv - a sample submission file in the correct format.
* `posting_id` - the ID code for the posting.
* `matches` - Space delimited list of all posting IDs that match this posting. Posts always self-match. Group sizes were capped at 50, so there's no need to predict more than 50 matches.

In [None]:
# Root folder of the competition data; every other path is derived from it.
data_dir = '../input/shopee-product-matching'

# CSV metadata files.
train_file_path = os.path.join(data_dir, 'train.csv')
test_file_path = os.path.join(data_dir, 'test.csv')
sample_sub_file_path = os.path.join(data_dir, 'sample_submission.csv')
# Folders holding the image files referenced by the 'image' column.
train_images_path = os.path.join(data_dir, 'train_images')
test_images_path = os.path.join(data_dir, 'test_images')

# Echo the resolved paths so a wrong data_dir is spotted immediately.
print(f'Train file: {train_file_path}')
print(f'Test file: {test_file_path}')
print(f'Sample Sub file: {sample_sub_file_path}')
print(f'Train Images Path: {train_images_path}')
print(f'Test Images Path: {test_images_path}')

In [None]:
# Default seed handed to seed_everything() so random draws are repeatable.
RANDOM_SEED = 42

In [None]:
def seed_everything(seed=RANDOM_SEED):
    '''
    Seeds Python's and NumPy's global random number generators.
    seed - Integer seed; also exported as the PYTHONHASHSEED environment variable
    '''
    random.seed(seed)
    np.random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting
    # it here presumably only affects processes launched afterwards - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Seed the random number generators with the default RANDOM_SEED before any sampling.
seed_everything()

In [None]:
# Load the three CSV files: train metadata, test metadata and the sample submission.
train_df, test_df, sub_df = map(
    pd.read_csv,
    (train_file_path, test_file_path, sample_sub_file_path),
)

# Training Data

In [None]:
# Peek at 10 randomly chosen training rows.
train_df.sample(10)

Let's check the size of the dataset...

In [None]:
# (number of rows, number of columns)
train_df.shape

And number of unique values in each column...

In [None]:
# Count of distinct values in each column.
train_df.nunique()

So we have 11014 unique groups...  

What do similar images in same group look like?

In [None]:
def show_image(class_num, examples=2, train_df=train_df, train_images_path=train_images_path):
    '''
    Plots up to `examples` randomly chosen images of one label group side by side.
    class_num - label_group value whose images are shown
    examples - Maximum number of images (subplot columns) to draw
    train_df - DataFrame with 'label_group' and 'image' columns
    train_images_path - Directory that the 'image' file names are relative to
    '''
    in_group = train_df['label_group'] == class_num
    # Shuffle the group's image names, then keep the first `examples` of them.
    shuffled_names = train_df.loc[in_group, 'image'].sample(frac=1)
    chosen_names = shuffled_names.iloc[:examples].to_list()

    plt.figure(figsize=(20, 10))
    for position, file_name in enumerate(chosen_names):
        picture = Image.open(os.path.join(train_images_path, file_name))
        # One row of `examples` panels; panels are numbered from 1.
        plt.subplot(1, examples, position % examples + 1)
        plt.axis('off')
        plt.imshow(picture)
        plt.title(f'Class: {class_num}')

In [None]:
# Pick three distinct label groups at random and show example images of each.
all_label_groups = list(train_df['label_group'].unique())
nums = random.sample(all_label_groups, 3)
for num in nums:
    show_image(num)

In [None]:
# Draw another three random label groups for a second look at within-group variation.
all_label_groups = list(train_df['label_group'].unique())
nums = random.sample(all_label_groups, 3)
for num in nums:
    show_image(num)

Got it... So there can be subtle differences between the images, such as the background, text and stickers, but the main product remains exactly the same. The image can also be upside down or taken from a different angle. This looks like an interesting problem...  

Let's see what the titles tell us...

In [None]:
# Number of whitespace-separated words in every (still uncleaned) title.
word_count = [len(title.split()) for title in train_df['title']]
barplot_dim = (12, 6)
# plt.subplots returns (Figure, Axes); unpack it so `ax` really is the Axes,
# and hand that Axes to seaborn instead of relying on the implicit current one.
fig, ax = plt.subplots(figsize=barplot_dim)
sns.distplot(word_count, kde=False, ax=ax)
ax.set_ylabel('No. of Observations', size=15)
ax.set_xlabel('No. of Words', size=15)
ax.set_title('Title Word Count Distribution', size=20);

In [None]:
# Another random sample of 10 rows, this time to eyeball the title text.
train_df.sample(10)

Some titles look messy and contain irrelevant text such as special characters... So let's clean that up...

In [None]:
def text_cleaning(text):
    '''
    Lower-cases the text and keeps only the letters a-z, separated by single spaces.
    Punctuation is deleted outright (so "t-shirt" becomes "tshirt"); digits, emojis
    and every other non-letter character are turned into word breaks.
    text - Sentence that needs to be cleaned; None/NaN is treated as empty
    Returns the cleaned sentence without leading or trailing spaces.
    '''
    # Missing titles (None or float NaN) clean to an empty string instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Coerce first so non-string values are cleaned rather than failing on iteration.
    text = str(text)
    # Delete punctuation without inserting a space, keeping hyphenated words as one token.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    # Any run of non-letters (digits, emojis, whitespace, ...) becomes a single space;
    # this already removes emojis, so no dedicated emoji pattern is required.
    text = re.sub('[^a-z]+', ' ', text)
    # Drop the stray space left when the title started or ended with non-letters,
    # so titles that differ only by such characters compare equal.
    return text.strip()

In [None]:
# Enable pandas' progress_apply so the cleaning pass reports progress.
tqdm.pandas()
# Overwrite the raw training titles with their cleaned form.
train_df['title'] = train_df['title'].progress_apply(text_cleaning)

In [None]:
# Enable pandas' progress_apply so the cleaning pass reports progress.
tqdm.pandas()
# Apply the same cleaning to the test titles so both sets are normalised alike.
test_df['title'] = test_df['title'].progress_apply(text_cleaning)

In [None]:
# Word counts again, now computed on the cleaned titles.
word_count = [len(title.split()) for title in train_df['title']]
barplot_dim = (12, 6)
# plt.subplots returns (Figure, Axes); unpack it so `ax` really is the Axes,
# and hand that Axes to seaborn instead of relying on the implicit current one.
fig, ax = plt.subplots(figsize=barplot_dim)
sns.distplot(word_count, kde=False, ax=ax)
ax.set_ylabel('No. of Observations', size=15)
ax.set_xlabel('No. of Words', size=15)
ax.set_title('Title Word Count Distribution', size=20);

Let's see some of the most commonly used words...

In [None]:
# Split every title into its whitespace-separated words.
temp_df = train_df['title'].map(lambda title: str(title).split()).to_frame(name='temp_list')
# Tally the words across all titles, one title at a time.
top = Counter()
for words in temp_df['temp_list']:
    top.update(words)
# The 25 most frequent words, shaded by frequency.
temp = pd.DataFrame(top.most_common(25), columns=['Common Words', 'Count'])
temp.style.background_gradient(cmap='Reds')

In [None]:
# Concatenate all titles into one string and build the word cloud from it.
text = ' '.join(train_df['title'])
wordcloud = WordCloud(background_color='white', stopwords=STOPWORDS, width=2560, height=1440).generate(text)

barplot_dim = (15, 15)
# plt.subplots returns (Figure, Axes); unpack it and draw on that Axes explicitly
# rather than binding the tuple to `ax` and relying on pyplot's current axes.
fig, ax = plt.subplots(figsize=barplot_dim, facecolor='w')
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

# Naive Model
Let's assume that products with exactly the same title are the same product, and group the postings on that basis...

In [None]:
def prepare_text(text, nlp=nlp):
    '''
    Drops stop words from the text and replaces the remaining tokens by their lemmas.
    text - Sentence to be processed
    nlp - Spacy NLP model used to tokenise and lemmatise the sentence
    Returns the surviving lemmas joined by single spaces.
    '''
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

In [None]:
# Enable pandas' progress_apply so the spaCy pass reports progress.
tqdm.pandas()
# Only the test titles are lemmatised here; they are what the naive model groups on.
test_df['title'] = test_df['title'].progress_apply(prepare_text)

In [None]:
# from https://www.kaggle.com/isaienkov/shopee-data-understanding-and-analysis

# Group the posting ids by title in a single pass instead of re-filtering the whole
# frame once per unique title. Titles come out sorted and ids keep their row order.
ids_by_title = test_df.groupby('title')['posting_id'].apply(list)
mask = ids_by_title.index.tolist()

# a: posting ids, b: the space-separated ids of every posting sharing that title.
a = []
b = []
for res in ids_by_title:
    ans = ' '.join(str(id_item) for id_item in res)
    a.extend(res)
    # Every member of the group receives the same match string (itself included).
    b.extend([ans] * len(res))

In [None]:
# One row per posting: its id and the ids of all postings that share its title.
submission = pd.DataFrame({'posting_id': a, 'matches': b})
submission.head()

Another feature we can consider is the phash... If two images have the same phash, we can naively assume they show the same product... So let's take care of that too...

In [None]:
# For every perceptual hash, collect the posting ids that share it.
mapping_dict_phash = {
    phash: ids.tolist()
    for phash, ids in test_df.groupby('image_phash')['posting_id']
}
# Each posting gets the space-separated ids of its whole phash group (itself included).
test_df['matches_temp'] = [
    ' '.join(mapping_dict_phash[phash]) for phash in test_df['image_phash']
]

# posting_id -> phash-based match string, looked up when enriching the submission.
submission_map = dict(zip(test_df['posting_id'], test_df['matches_temp']))

In [None]:
# Phash-based matches for every submission row, followed by its title-based matches.
phash_matches = submission['posting_id'].map(submission_map)
combined_matches = phash_matches + ' ' + submission['matches']
# De-duplicate with dict.fromkeys rather than set(): it keeps first-seen order, so the
# written match strings are identical from run to run instead of arbitrarily ordered.
submission['matches'] = combined_matches.apply(
    lambda ids: ' '.join(dict.fromkeys(ids.split()))
)
submission.head()

In [None]:
# Write the submission to the working directory, without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

**If you found this notebook useful and use parts of it in your work, please don't forget to show your appreciation by upvoting this kernel. That keeps me motivated and inspires me to write and share these public kernels.** 😊