In [63]:
import pandas as pd
import numpy as np

# import BERT tokenization

In [64]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [65]:
import tensorflow as tf
import tensorflow_hub as hub
from keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from bert.tokenization import FullTokenizer

In [66]:
df = pd.read_csv("Data/preparation.csv")
df = df[['name_and_description'	,'product_category_tree_target']]

In [67]:
df.head(3)

Unnamed: 0,name_and_description,product_category_tree_target
0,Product name : Elegance Polyester Multicolor A...,Home Furnishing > Curtains & Accessories
1,Product name : Sathiyas Cotton Bath Towel\nDes...,Baby Care > Baby Bath & Skin
2,Product name : Eurospa Cotton Terry Face Towel...,Baby Care > Baby Bath & Skin


# Split data

In [68]:
# Define the ratio for the split
split_ratio = 0.8  # 80% training, 20% testing

# Calculate the number of rows for training and testing
total_rows = df.shape[0]
train_rows = int(split_ratio * total_rows)
test_rows = total_rows - train_rows

# Split the DataFrame into training and testing sets
train_data = df.iloc[:train_rows, :]
test_data = df.iloc[train_rows:, :]


# Label encoding of labels

In [69]:
label = preprocessing.LabelEncoder()
y = label.fit_transform(train_data['product_category_tree_target'])
y = to_categorical(y)
print(y[:5])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


# Build a BERT layer

In [70]:
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(m_url, trainable=True)

# Encoding the text

In [71]:


#https://stackoverflow.com/questions/67043468/unparsedflagaccesserror-trying-to-access-flag-preserve-unused-tokens-before-f
# Set the preserve_unused_tokens flag to True
import sys
from absl import flags
sys.argv=['preserve_unused_tokens=False']
flags.FLAGS(sys.argv)

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
        
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len-len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
        
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [72]:
labels = label.classes_
print(labels)

['Baby Care > Baby & Kids Gifts' 'Baby Care > Baby Bath & Skin'
 'Baby Care > Baby Bedding' 'Baby Care > Baby Grooming'
 'Baby Care > Diapering & Potty Training' 'Baby Care > Feeding & Nursing'
 'Baby Care > Furniture & Furnishings' 'Baby Care > Infant Wear'
 'Baby Care > Strollers & Activity Gear'
 'Beauty and Personal Care > Bath and Spa'
 'Beauty and Personal Care > Body and Skin Care'
 'Beauty and Personal Care > Combos and Kits'
 'Beauty and Personal Care > Eye Care'
 'Beauty and Personal Care > Fragrances'
 'Beauty and Personal Care > Hair Care'
 'Beauty and Personal Care > Health Care'
 'Beauty and Personal Care > Makeup'
 "Beauty and Personal Care > Women's Hygiene"
 'Computers > Computer Components' 'Computers > Computer Peripherals'
 'Computers > Laptop Accessories' 'Computers > Laptops'
 'Computers > Network Components' 'Computers > Software'
 'Computers > Storage' 'Computers > Tablet Accessories'
 'Home Decor & Festive Needs > Candles & Fragrances'
 'Home Decor & Festive Ne

In [73]:
len_cat = len(labels)

In [74]:
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    
    clf_output = sequence_output[:, 0, :]
    
    lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(32, activation='relu')(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(len_cat, activation='softmax')(lay)  # len_cart = N de cat.
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [75]:
max_len = 250
train_input = bert_encode(train_data.name_and_description.values, tokenizer, max_len=max_len)
test_input = bert_encode(test_data.name_and_description.values, tokenizer, max_len=max_len)
train_labels = y

In [76]:
model = build_model(bert_layer, max_len=max_len)
model.summary()



Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_word_ids (InputLayer  [(None, 250)]                0         []                            
 )                                                                                                
                                                                                                  
 input_mask (InputLayer)     [(None, 250)]                0         []                            
                                                                                                  
 segment_ids (InputLayer)    [(None, 250)]                0         []                            
                                                                                                  
 keras_layer_3 (KerasLayer)  [(None, 768),                1094822   ['input_word_ids[0][0]',

In [77]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

train_sh = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    
    callbacks=[checkpoint, earlystopping],
    batch_size=32,
    verbose=1
)

Epoch 1/3
 2/21 [=>............................] - ETA: 10:34 - loss: 4.1503 - accuracy: 0.0156    