In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import spacy
import re
import time
import warnings

from tensorflow import keras, convert_to_tensor
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Apply the matplotlib style first, then install the colour-blind-friendly palette as the
# default colour cycle. `sns.color_palette(...)` only *returns* a palette, so calling it and
# discarding the result had no effect; `sns.set_palette(...)` actually applies it. It runs
# after `plt.style.use` because the style sheet defines its own colour cycle.
plt.style.use('Solarize_Light2')
sns.set_palette('colorblind')

# Default figure DPI: taken from the DPI environment variable (which may come from the
# .env file loaded by load_dotenv), falling back to 100 when it is missing or unusable.

def read_dpi(default=100):
    """
    Reads the DPI environment variable and converts it to an int,
    returns `default` when the variable is unset (None -> TypeError)
    or does not hold an integer, e.g. "" or "abc" (-> ValueError)
    """
    raw_dpi = os.getenv('DPI')
    try:
        return int(raw_dpi)
    except (TypeError, ValueError):
        return default


pc_dpi = read_dpi()


In [2]:
# NLP object creation:
# the pipeline components listed below are switched off at load time, which should
# speed things up a bit.
spacy_disabled_components = ["tagger", "parser", "ner"]
nlp = spacy.load("en_core_web_lg", disable=spacy_disabled_components)


In [3]:
# Load the cleaned dataset. Pickle is used because it preserves Python object types
# (spaCy docs, numpy arrays, ...) across the save/load round trip.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
cleaned_data_path = "../data/ecommerce_cleaned.pkl"
df = pd.read_pickle(cleaned_data_path)


In [4]:
# Peek at the first row to check what each column holds.
df.head(1)


Unnamed: 0,product_name,doc_desc,lem_desc,first_category,product_specifications,image,description,category_tree
0,Elegance Polyester Multicolor Abstract Eyelet ...,"(key, elegance, polyester, multicolor, abstrac...","[key, elegance, polyester, multicolor, abstrac...",home furnishing,"{""product_specification""=>[{""key""=>""Brand"", ""v...",55b85ea15a1536d46b7190ad6fff8ce7.jpg,Key Features of Elegance Polyester Multicolor ...,"[Home Furnishing, Curtains & Accessories, Curt..."


In [5]:
# List the columns available in the loaded dataset.
df.columns


Index(['product_name', 'doc_desc', 'lem_desc', 'first_category',
       'product_specifications', 'image', 'description', 'category_tree'],
      dtype='object')

In [6]:
# Keep only the two columns the classifier needs: the token lists (lem_desc) and the
# first_category label. .copy() yields an independent frame, so columns added to
# df_model later do not touch df.
model_columns = ["lem_desc", "first_category"]
df_model = df.loc[:, model_columns].copy()


In [7]:
def list_to_str(row):
    """
    Joins the tokens stored under row["lem_desc"] with single spaces.

    row: mapping-like object (e.g. a DataFrame row) exposing a "lem_desc"
    entry that is an iterable of strings.
    Returns the space-separated string (empty string for an empty list).
    """
    return " ".join(row["lem_desc"])



In [8]:
# Shorter label column name for the modelling steps.
df_model = df_model.rename(columns={"first_category": "category"})

# Row-wise: turn each token list into one plain-text description string.
df_model["desc"] = df_model.apply(list_to_str, axis=1)


In [9]:
# Encode category names as integer ids. The fitted encoder is kept in `le` so that
# ids can be mapped back to names later on.
le = LabelEncoder().fit(df_model["category"])
df_model["enc_category"] = le.transform(df_model["category"])


In [10]:
# Tokenizer loaded from the same pre-trained checkpoint name as the classifier below.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)


In [11]:
# Checking the length distribution of the token lists in lem_desc to choose a suitable tensor size:

def get_desc_len(row):
    """
    Counts the tokens stored under row["lem_desc"],
    returns that count as an int
    """
    token_count = len(row["lem_desc"])
    return token_count


# Number of tokens per description, computed directly on the lem_desc column.
df_model["desc_size"] = df_model["lem_desc"].map(len)

# Summary statistics (count, mean, quartiles, max) of the description sizes.
df_model["desc_size"].describe()


count    1024.00000
mean       38.65918
std        42.32380
min         3.00000
25%         9.00000
50%        19.00000
75%        58.00000
max       272.00000
Name: desc_size, dtype: float64

In [12]:
# Split first, tokenize afterwards: 70 % of the rows for training, 30 % held out for
# testing. random_state is fixed so the split is reproducible.
descriptions = df_model["desc"]
encoded_labels = df_model["enc_category"]

X_train, X_test, y_train, y_test = train_test_split(
    descriptions, encoded_labels, test_size=0.30, random_state=123
)


In [13]:
# Tokenizing the test set:
# sequences are truncated to at most 256 tokens (reducing the default tensor size),
# padded within the batch, and returned as TensorFlow tensors.
test_texts = X_test.tolist()
X_test_tokenized = tokenizer.batch_encode_plus(
    test_texts,
    truncation=True,
    max_length=256,
    padding=True,
    return_tensors="tf",
)


Given the min, Q1, Q3 and max of the description lengths, capping the tensor size at 256 should make training faster without truncating too many descriptions and without adding too much unnecessary padding.

In [14]:
# Tokenizing the training set:
# sequences are truncated to at most 256 tokens (reducing the default tensor size),
# padded within the batch, and returned as TensorFlow tensors.
train_texts = X_train.tolist()
X_train_tokenized = tokenizer.batch_encode_plus(
    train_texts,
    truncation=True,
    max_length=256,
    padding=True,
    return_tensors="tf",
)


In [15]:
## Checking input tensors : (uncomment to check CLS and SEP (101 and 102))
# X_train_tokenized["input_ids"][0]


In [16]:
# Model inputs (token ids of the training descriptions) and targets (encoded categories).
# NOTE(review): only "input_ids" is taken from the tokenizer output; the attention mask
# it also produces is not forwarded to the model - confirm this is intended.
input_tensors = X_train_tokenized["input_ids"]
output_tensors = convert_to_tensor(y_train)

# Number of distinct encoded categories, used below as the number of labels of the model.
unique_cat_count = len(df_model["enc_category"].unique())


In [17]:
# One-hot encode the integer training labels, as required by CategoricalCrossentropy.
# The matrix is built from y_train rather than from output_tensors itself, so re-running
# this cell cannot one-hot encode an already one-hot matrix.
output_tensors = keras.utils.to_categorical(
    y_train,
    num_classes=unique_cat_count
)

# Pre-trained BERT encoder plus a newly initialised classification head with one output
# per category.
bert_model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=unique_cat_count
)

# The model is trained on its raw logits, hence from_logits=True.
loss = keras.losses.CategoricalCrossentropy(from_logits=True)

# Fine-tuning a pre-trained transformer needs a small learning rate: Adam's default
# (1e-3) is large enough to wreck the pre-trained weights within a few steps.
optimizer = keras.optimizers.Adam(learning_rate=3e-5)

# keras.metrics.Accuracy tests y_true == y_pred element by element, which is meaningless
# for one-hot targets against logits. CategoricalAccuracy compares the argmax of each.
metric = keras.metrics.CategoricalAccuracy(name="accuracy")

bert_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric]
)

# TODO: try BERT Large?


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Fine-tune for 20 epochs and measure the wall-clock duration with perf_counter.
time_fit_zero = time.perf_counter()
bert_model.fit(input_tensors, output_tensors, epochs=20, verbose=1)
time_fit_end = time.perf_counter()

fit_duration = time_fit_end - time_fit_zero
print(f"Fitting took {fit_duration} s")


2022-12-27 16:56:42.784995: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [None]:
# Undo the one-hot encoding: the position of the maximum in each row is the class id.
original_labels = np.asarray(output_tensors).argmax(axis=1)

# Print the original labels
print(original_labels)


In [None]:
# Token ids of the held-out descriptions, used as model input for prediction.
# NOTE(review): as for training, the tokenizer's attention mask is not forwarded - confirm.
test_tensors = X_test_tokenized["input_ids"]

In [None]:
# Run the fine-tuned model on the test inputs.
predictions = bert_model.predict(test_tensors)


In [None]:
# First element of the prediction output (presumably the per-class logits - confirm
# against the installed transformers version); the argmax of each row is the predicted
# class id. Despite the variable name, these are predictions, not ground-truth labels.
test_scores = predictions[0]
original_test_labels = np.argmax(test_scores, axis=1)

In [None]:
# Display the predicted class ids for the test set.
original_test_labels

In [None]:
# Share of test descriptions whose predicted class id matches the true encoded category.
test_accuracy = accuracy_score(y_true=y_test, y_pred=original_test_labels)
print(test_accuracy)
