In [1]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
import pandas as pd
import numpy as np 
import tensorflow as tf 
import tensorflow_hub as hub 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tokenization 
import warnings
warnings.filterwarnings("ignore")
import math
import re
import collections
import langid
import fasttext
import string 
from transformers import BertTokenizer,TFBertModel

In [2]:
tokenizer = BertTokenizer.from_pretrained("cahya/bert-base-indonesian-522M")
model = TFBertModel.from_pretrained("cahya/bert-base-indonesian-522M")

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/468 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/545M [00:00<?, ?B/s]

Some layers from the model checkpoint at cahya/bert-base-indonesian-522M were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at cahya/bert-base-indonesian-522M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [3]:
test = pd.read_csv("../input/shopee-product-matching/train.csv")

In [4]:
# Take a look at the datas
test.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [5]:
# take a look at the title feature
test.title

0                                Paper Bag Victoria Secret
1        Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...
2              Maling TTS Canned Pork Luncheon Meat 397 gr
3        Daster Batik Lengan pendek - Motif Acak / Camp...
4                        Nescafe \xc3\x89clair Latte 220ml
                               ...                        
34245    Masker Bahan Kain Spunbond Non Woven 75 gsm 3 ...
34246      MamyPoko Pants Royal Soft - S 70 - Popok Celana
34247    KHANZAACC Robot RE101S 1.2mm Subwoofer Bass Me...
34248    Kaldu NON MSG HALAL Mama Kamu Ayam Kampung , S...
34249    FLEX TAPE PELAPIS BOCOR / ISOLASI AJAIB / ANTI...
Name: title, Length: 34250, dtype: object

===> two obvious thing can be underlined throw this first looking at title feature :
  - There is series of caratcters which don't bring any meaning ; we should explorate their occurences on the data which represent title.
  - Title are represented randomly with upper and lower case form .So we should lower cased all words and caracters which are presents in our title datas.

In [6]:
# Examinate the frequency of the unusual caracters , observed above in data title.
df=test.title.str.findall(r"(\\\w)")
df = pd.DataFrame({"regular_expression":df.values})
df["regular_expression"] = df["regular_expression"].astype("str")
df = df.loc[df["regular_expression"] !="[]",:]
print(f"Theses random caracters appear {len(df)} timses in our titles")

Theses random caracters appear 705 timses in our titles


===> We need to handle theses random caracters in our processing steps of datas.

===> We should also mention that there is some caracters and symbols that we should get rid of them because they don't bring any value for our downstream task. theses caracters like : "/" ,"-","/"...

In [7]:
# Extract the top 100 most common words in our titles.
corpus = []
for tx in test["title"].values:
    corpus.extend(tx.lower().split())

counter = collections.Counter(corpus)
list(zip(*(counter.most_common(100))))

[('/',
  '-',
  'anak',
  'wanita',
  'original',
  '1',
  'murah',
  'tas',
  'dan',
  'pria',
  'masker',
  'untuk',
  '|',
  'bayi',
  '&',
  'ml',
  'anti',
  'set',
  '2',
  'baby',
  '3',
  'tangan',
  'air',
  'isi',
  'kaos',
  'motif',
  'sepatu',
  'warna',
  'baju',
  'cream',
  'mini',
  'premium',
  'import',
  'celana',
  'alat',
  'bahan',
  'gr',
  'polos',
  'rambut',
  'led',
  '+',
  'fashion',
  'serum',
  'mask',
  'jam',
  'pcs',
  'x',
  'in',
  'lampu',
  'mainan',
  'termurah',
  'by',
  'case',
  '(',
  'bag',
  '6',
  'korea',
  'hijab',
  'rak',
  'panjang',
  '4',
  '5',
  'kain',
  'usb',
  'sarung',
  'karakter',
  ')',
  'dress',
  'sandal',
  'sabun',
  'pro',
  'hp',
  'hitam',
  'plastik',
  'cod',
  '100',
  'paket',
  'plus',
  'free',
  'bisa',
  'white',
  'jumbo',
  'dengan',
  'botol',
  '10',
  'new',
  'putih',
  'tempat',
  'box',
  'gamis',
  'holder',
  'size',
  'mobil',
  'kaki',
  'gel',
  'super',
  'wajah',
  'bpom',
  'dompet',
  'jil

In [8]:
# Extract the least common words in our titles.
list(zip(*(counter.most_common())))[-101:-1]

[('/',
  '-',
  'anak',
  'wanita',
  'original',
  '1',
  'murah',
  'tas',
  'dan',
  'pria',
  'masker',
  'untuk',
  '|',
  'bayi',
  '&',
  'ml',
  'anti',
  'set',
  '2',
  'baby',
  '3',
  'tangan',
  'air',
  'isi',
  'kaos',
  'motif',
  'sepatu',
  'warna',
  'baju',
  'cream',
  'mini',
  'premium',
  'import',
  'celana',
  'alat',
  'bahan',
  'gr',
  'polos',
  'rambut',
  'led',
  '+',
  'fashion',
  'serum',
  'mask',
  'jam',
  'pcs',
  'x',
  'in',
  'lampu',
  'mainan',
  'termurah',
  'by',
  'case',
  '(',
  'bag',
  '6',
  'korea',
  'hijab',
  'rak',
  'panjang',
  '4',
  '5',
  'kain',
  'usb',
  'sarung',
  'karakter',
  ')',
  'dress',
  'sandal',
  'sabun',
  'pro',
  'hp',
  'hitam',
  'plastik',
  'cod',
  '100',
  'paket',
  'plus',
  'free',
  'bisa',
  'white',
  'jumbo',
  'dengan',
  'botol',
  '10',
  'new',
  'putih',
  'tempat',
  'box',
  'gamis',
  'holder',
  'size',
  'mobil',
  'kaki',
  'gel',
  'super',
  'wajah',
  'bpom',
  'dompet',
  'jil

In [9]:
# let's the detect the language of theses titles.
languages = []
for tit in test.title.values :
    languages.append(langid.classify(tit)[0])

In [10]:
languages = collections.Counter(languages)

In [11]:
# Display the table of languages used in the title description
languages = pd.DataFrame(languages,index=["frequency"])
languages

Unnamed: 0,en,mt,id,fr,xh,tl,fi,es,it,af,...,sq,nb,qu,sk,gl,nn,fo,an,ga,wa
frequency,13286,766,6762,912,77,590,533,1032,1182,74,...,9,34,10,22,17,35,5,14,10,2


===> As we can see on  the table above, there is many languages used in our title descriptions.

In [12]:
def clean_title(tx):
    tx = tx.lower()
    tx = re.sub(r"\\(\w)+(\d)+"," ",tx)
    #tx = re.sub(r"[\\/-+_*[]()]"," ",tx)
    tx = re.sub(r"(\d+),(\d+)",r"\1.\2",tx)
    for p in string.punctuation :
        if p !="." :
          tx=tx.replace(p," ")
    tx = re.sub(r"\s+"," ",tx)
    
    return tx

In [13]:
test["cleaned_title"] = test["title"].map(clean_title)

In [14]:
lb= LabelEncoder()
test["encoded_label_group"] = lb.fit_transform(test["label_group"].values)

In [15]:
#sent= tokenizer.encode_plus(test.loc[0,"cleaned_title"],add_special_tokens=True,padding="max_length",max_length=\
                                  #70)

In [16]:
def bert_encode(seq_data,tokenizer,max_len=512):
    all_tokens = []
    all_masks = []
    
    for seq in seq_data :
        sent= tokenizer.encode_plus(seq,add_special_tokens=True,padding="max_length",max_length=\
                                  max_len,truncation=True)
        #tx = tx[:max_len-2]
        #tx = ['[CLS]'] + tx + ['[SEP]']
        #tokens = tokenizer.convert_tokens_to_ids(tx)
        #seq_len = len(tokens)
        #pad_len = max_len - seq_len 
        #tokens = tokens + [0] * pad_len
        #masks = [1] * seq_len + [0] * pad_len
        #segments = [0] * max_len
        
        all_tokens.append(sent.get("input_ids"))
        all_masks.append(sent.get("attention_mask"))
        #all_segments.append(segments)
    all_tokens = np.vstack(all_tokens)
    all_masks = np.vstack(all_masks)
    return all_tokens,all_masks
    #,np.array(all_segments)     
        

In [17]:
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [18]:
def build_model(bert_layer,N_CLASSES,max_len=512):
    margin = ArcMarginProduct(
            n_classes = N_CLASSES, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32')
    tokens = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32)
    #masks = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32)
    #segments = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32)
    y = tf.keras.layers.Input(shape=(),dtype=tf.int32)
    bert = bert_layer.bert([tokens])
    cls = bert.pooler_output 
    x = tf.keras.layers.BatchNormalization()(cls)
    #x = tf.keras.layers.Dropout(0.4)(x)
    #x = tf.keras.layers.Dense(emb_size)(x)
    #emb = tf.keras.layers.BatchNormalization()(x)
    mar = margin([x,y])
    output = tf.keras.layers.Softmax()(mar)
    model = tf.keras.models.Model(inputs=[tokens,y],outputs=[output])
    return model 
   

In [19]:
def prepare_data():
    N_classes = test["label_group"].nunique()
    #bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",\
                               #trainable=True)
    #vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    #do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    #tokenizer = tokenization.FullTokenizer(vocab_file,do_lower_case)
    xtr,xts,ytr,yts = train_test_split(test["cleaned_title"].values,test["encoded_label_group"].values,\
                                      test_size =0.33,stratify=test["encoded_label_group"].values,\
                                      random_state=42)
    mx_len = 0
    for tx in test["title"].values :
        seq = tx.split()
        if len(seq) > mx_len :
            mx_len = len(seq)
    Xtr = bert_encode(xtr,tokenizer,max_len=mx_len)
    Xts = bert_encode(xts,tokenizer,max_len=mx_len)
    return Xtr,Xts,ytr,yts,N_classes,mx_len

In [20]:
Xtr,Xts,ytr,yts,N_classes,max_len = prepare_data()

In [21]:
print(f"The max sequence length to use equal : {max_len}")

The max sequence length to use equal : 61


In [22]:
print(f"The number of distinct class ={N_classes}")

The number of distinct class =11014


In [23]:
train_data = (Xtr[0],ytr)
val_data = (Xts[0],yts)

In [24]:
model = build_model(model,N_classes,max_len=max_len)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),loss=tf.keras.losses.sparse_categorical_crossentropy,\
             metrics="accuracy")

In [25]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 61)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 110617344   input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 768)          3072        bert[0][1]                       
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None,)]            0                                            
______________________________________________________________________________________________

In [26]:
model.fit(train_data,ytr,validation_data=(val_data,yts),epochs=30,batch_size = 16)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f6884545510>

In [27]:
model.save("model")

In [28]:
def get_layer_index(model, layer_name, not_found=None):
    """get model's layer index by layer's name"""
    for i, layer in enumerate(model.layers):
        if layer.name == layer_name:
            return i
    return not_found

In [29]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 61)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 110617344   input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 768)          3072        bert[0][1]                       
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None,)]            0                                            
______________________________________________________________________________________________

In [30]:
md = tf.keras.models.Model(inputs=model.layers[1].input,outputs=model.layers[1].output)

In [31]:
md.save("bert_embedding")