In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import spacy
import re
import time
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from collections import Counter
from umap import UMAP

# Add the parent directory to the import path.
sys.path.append("../")

# Load environment variables from a .env file, if one is present.
load_dotenv()

# Apply the matplotlib style sheet first: a style sheet sets its own colour
# cycle, so the seaborn palette must be installed afterwards to take effect.
plt.style.use('Solarize_Light2')
# sns.color_palette() only *returns* a palette (the result was discarded);
# sns.set_palette() is the call that installs it as the default colour cycle.
sns.set_palette('colorblind')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

# Figure DPI: taken from the DPI environment variable when it holds a valid
# integer, otherwise 100. os.getenv's default covers the "unset" case, and the
# except clause covers values that cannot be parsed (e.g. DPI="" or DPI="high"),
# which previously raised an uncaught ValueError.
DEFAULT_DPI = 100

try:
    pc_dpi = int(os.getenv('DPI', DEFAULT_DPI))
except (TypeError, ValueError):
    pc_dpi = DEFAULT_DPI


In [None]:
# Load the `en_core_web_lg` spaCy pipeline used throughout the notebook.
nlp = spacy.load(name="en_core_web_lg")

# Optional sanity check (tok2vec is in the config by default):
# nlp._config


In [None]:
# Load the cleaned dataset. Pickle is used so that column types (spaCy docs,
# numpy arrays, ...) are preserved.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle("../data/ecommerce_cleaned.pkl")


In [None]:
# Peek at the first row to check the loaded columns.
df.head(1)


# 1 : Text classification using sparse representation (Bag of Words & TF-IDF)

- We will first try to classify the products using the sparse representation of their descriptions. We will use Bag of Words (abbreviated as BoW or CV) and TF-IDF to represent the product descriptions. We will try to predict the categories using a Multinomial Naive Bayes model.
- As this approach will generate sparse vectors (vectors containing mostly zeros), we will try to apply a dimension reduction technique, UMAP, to reduce the size of the vectors to 2 components and attempt a classification on these components.
- We will evaluate the models using the average accuracy of the predictions, sklearn's classification report, and, if necessary, a confusion matrix.

<i>Note : Since we applied tokenization, stemming and lemmatization in notebook `nb_01` and we preserved the types by using a pickle format, it is not necessary to clean the text in this notebook.</i>

## 1.1 Feature extraction :
&emsp;We will apply CountVectorizer & TfidfVectorizer on the whole corpus first.

In [None]:
# No stop-word list is passed: stopwords were already removed during cleaning.
count_vectorizer, tf_idf_vectorizer = CountVectorizer(), TfidfVectorizer()


In [None]:
def get_vector(row):
    """
    Return the `.vector` attribute of the spacy doc stored in column
    `doc_desc` of the given row.
    """
    return row["doc_desc"].vector


def list_to_str(row):
    """
    Join the tokens stored in `lem_desc` with single spaces so the result can
    be fed to CV and TF-IDF.

    Returns the joined string.
    """
    return " ".join(row["lem_desc"])


In [None]:
# Row-wise helpers: dense document vector and space-joined lemma string.
df["text_vec"] = df.apply(get_vector, axis="columns")
df["lem_desc_txt"] = df.apply(list_to_str, axis="columns")


In [None]:
# Fit both vectorizers on the whole corpus and keep the resulting matrices
# for the UMAP reduction later on. Warnings are silenced for this step only.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    corpus_txt = df["lem_desc_txt"]
    count_vectorizer_transform = count_vectorizer.fit_transform(corpus_txt)
    tf_idf_transform = tf_idf_vectorizer.fit_transform(corpus_txt)


In [None]:
# Expected shape for both matrices: (len(df), n)
for vectorized in (count_vectorizer_transform, tf_idf_transform):
    print(vectorized.shape)


In [None]:
# Number of rows, to compare against the matrix shapes.
len(df)


In [None]:
# Multinomial Naive Bayes is trained on integer-encoded categories rather than text labels.
le = LabelEncoder().fit(df["first_category"])
df["enc_category"] = le.transform(df["first_category"])

# Show the text label next to its integer code.
df.loc[:, ["first_category", "enc_category"]].head()


In [None]:
# One split shared by the bow & tf-idf experiments (70/30, fixed seed).
text_features = df["lem_desc_txt"]
encoded_labels = df["enc_category"]

X_train, X_test, y_train, y_test = train_test_split(
    text_features, encoded_labels, test_size=0.3, random_state=123
)


# 1.2 : Predictions using Bag of words and tf_idf :

## 1.2.1 : Bag of words :

In [None]:
# Fit a dedicated vectorizer on the training split only, so the vocabulary is
# learned without seeing the test descriptions (avoids train/test leakage).
# The corpus-wide `count_vectorizer` is intentionally left untouched.
bow_clf_vectorizer = CountVectorizer()
X_train_bow = bow_clf_vectorizer.fit_transform(X_train)
# Test descriptions are only transformed; unseen words are ignored.
X_test_bow = bow_clf_vectorizer.transform(X_test)


In [None]:
# Multinomial Naive Bayes trained on the bag-of-words features.
clf = MultinomialNB()
clf.fit(X_train_bow, y_train)


In [None]:
# Predict on the held-out split and report the overall accuracy.
y_pred_bow = clf.predict(X_test_bow)
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print(accuracy_bow)


In [None]:
# Map the integer codes back to the original category names.
inv_true = le.inverse_transform(y_test)
inv_bow_pred = le.inverse_transform(y_pred_bow)


In [None]:
# Per-category precision / recall / f1 for the bag-of-words model.
print(classification_report(inv_true, inv_bow_pred))


#### Observations :

- The model achieves a very high average precision of 90/91%.
- It is important to note that, due to the limited amount of data (see support), the model might benefit from training on a larger set of data.

In [None]:
# Build the confusion matrix with an explicit label order taken from the
# encoder. Without `labels=`, the matrix only contains the classes that occur
# in y_true/y_pred, and naming its axes from positional indices would mislabel
# rows/columns whenever a category is missing from the evaluated labels.
conf_matrix_bow = pd.DataFrame(
    confusion_matrix(y_true=inv_true, y_pred=inv_bow_pred, labels=le.classes_),
    index=le.classes_,    # rows: true category
    columns=le.classes_,  # columns: predicted category
)


In [None]:
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(4, 3), dpi=pc_dpi)

sns.heatmap(
    conf_matrix_bow,
    annot=True,
    cmap="YlGnBu",
    xticklabels=True,
    yticklabels=True,
    ax=ax1,
)

# Titles / labels: tilt and shrink the tick labels so category names fit.
plt.setp(ax1.get_xticklabels(), rotation=45)
plt.setp(ax1.get_yticklabels(), rotation=45)
ax1.tick_params(labelsize=3)
fig.suptitle("Confusion matrix : classification on bag of words approach")

plt.show()


#### Confusion matrix :

- This matrix shows in details what could be observed in the classification report : the model's overall performances are good but might benefit from more data.
- It's also interesting to note that there were 5 items in the "multimedia" class that were misclassified as "watches". This is logical, as these two categories share a lot of technical terms, which might be the source of the errors.

## 1.2.2 : TF-IDF 


In [None]:
# Fit a dedicated TF-IDF vectorizer on the training split only, so that both
# the vocabulary and the IDF weights are learned without seeing the test
# descriptions (avoids train/test leakage). The corpus-wide
# `tf_idf_vectorizer` is intentionally left untouched.
tfidf_clf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_clf_vectorizer.fit_transform(X_train)
# Test descriptions are only transformed with the training-set statistics.
X_test_tfidf = tfidf_clf_vectorizer.transform(X_test)


In [None]:
# Fresh Multinomial Naive Bayes model trained on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)


In [None]:
# Predict on the held-out split and report the overall accuracy.
y_pred_tfidf = clf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(accuracy_tfidf)


In [None]:
# Map the integer codes back to the original category names.
inv_true = le.inverse_transform(y_test)
inv_tfidf_pred = le.inverse_transform(y_pred_tfidf)


In [None]:
# Per-category precision / recall / f1 for the TF-IDF model.
print(classification_report(inv_true, inv_tfidf_pred))


In [None]:
# Build the confusion matrix with an explicit label order taken from the
# encoder. Without `labels=`, the matrix only contains the classes that occur
# in y_true/y_pred, and naming its axes from positional indices would mislabel
# rows/columns whenever a category is missing from the evaluated labels.
conf_matrix_tfidf = pd.DataFrame(
    confusion_matrix(y_true=inv_true, y_pred=inv_tfidf_pred, labels=le.classes_),
    index=le.classes_,    # rows: true category
    columns=le.classes_,  # columns: predicted category
)


In [None]:
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(4, 3), dpi=pc_dpi)

sns.heatmap(
    conf_matrix_tfidf,
    annot=True,
    cmap="YlGnBu",
    xticklabels=True,
    yticklabels=True,
    ax=ax1,
)

# Titles / labels: tilt and shrink the tick labels so category names fit.
plt.setp(ax1.get_xticklabels(), rotation=45)
plt.setp(ax1.get_yticklabels(), rotation=45)
ax1.tick_params(labelsize=3)
fig.suptitle("Confusion matrix : classification with tf-idf approach")

plt.show()


#### Observations :
- This model performs quite well overall, with better performance in some categories than BoW but not as well in others (e.g. baby care and home furnishing).
- We could potentially improve the model by providing it with a larger training set and using hyperparameter tuning techniques like GridSearchCV to adjust the alpha of the Multinomial Naive Bayes algorithm.

## 1.3 : UMAP on vectors from count_vectorizer and tf-idf :

- This allows us to reduce the dimensions from 2838 to 2 (it is worth noting that these vectors are expected to grow with a larger dataset)
- UMAP is generally faster than t-SNE, often with better results


### 1.3.1 : UMAP reduction
- Using UMAP to reduce the data into two components for each approach
- Using MinMax scaler on the data to be able to use Naive Bayes (which does not work on negative values)

In [None]:
# Work on an independent copy so the UMAP columns do not end up in `df`.
df_reduced = df.copy(deep=True)


In [None]:
# Dimension reduction using UMAP, default settings, 2 components.
# UMAP is stochastic: without a seed every run yields a different embedding,
# so the downstream accuracies could not be reproduced. The seed matches the
# one used for the train/test splits. A seeded UMAP runs single-threaded,
# hence n_jobs=1 (keeping -1 would only trigger an override warning).
umap = UMAP(n_components=2, random_state=123, n_jobs=1)

# fit_transform refits the reducer each time, so one instance serves both matrices.
umap_cv = umap.fit_transform(count_vectorizer_transform)
umap_tfidf = umap.fit_transform(tf_idf_transform)

# Both should be of shape (n_rows, 2)
print(umap_cv.shape)
print(umap_tfidf.shape)


In [None]:
# Store component 0 and 1 for CountVectorizer (umap_cv) and tf-idf (umap_tfidf),
# two columns per method.
# The arrays are assigned whole, i.e. by position. The previous row-by-row
# `.loc[index, ...]` loop assumed the index was exactly 0..n-1; with any other
# index it would silently append new rows instead of filling existing ones.
umap_components = {
    "umap_cv_comp_0": umap_cv[:, 0],
    "umap_cv_comp_1": umap_cv[:, 1],
    "umap_tfidf_comp_0": umap_tfidf[:, 0],
    "umap_tfidf_comp_1": umap_tfidf[:, 1],
}

for component_col, component_values in umap_components.items():
    # Cast to float64 to keep the dtype the columns had when pre-filled with NaN.
    df_reduced[component_col] = component_values.astype(np.float64)


In [None]:
mmx = MinMaxScaler()  # Rescaling so that no value is < 0

# The same scaler instance is refit on each component column in turn.
for scaled_col in (
    "umap_cv_comp_0",
    "umap_cv_comp_1",
    "umap_tfidf_comp_0",
    "umap_tfidf_comp_1",
):
    column_2d = df_reduced[scaled_col].to_numpy().reshape(-1, 1)
    df_reduced[scaled_col] = mmx.fit_transform(column_2d)


In [None]:
# Show the source text alongside the reduced components for cv and tf-idf.
display_cols = [
    "lem_desc_txt",
    "umap_cv_comp_0",
    "umap_cv_comp_1",
    "umap_tfidf_comp_0",
    "umap_tfidf_comp_1",
]

display(df_reduced.loc[:, display_cols])


#### Visualisations :

##### BoW :

In [None]:
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(4, 4),
    dpi=pc_dpi,
)

# One point per product, coloured by its first-level category.
g = sns.scatterplot(data=df_reduced, x="umap_cv_comp_0", y="umap_cv_comp_1", hue="first_category", ax=ax1)

###
# Titles/Labels
# Configure the legend in a single call. A trailing `ax1.legend(...)` used to
# rebuild the legend from scratch, which discarded the column count, frame and
# font size set just before it.
sns.move_legend(
    ax1, "upper right",
    bbox_to_anchor=(1, 1),
    ncol=2,
    title=None,
    frameon=True,
    fontsize=4,
)
fig.suptitle("UMAP reduced count vectorizer with first categories of products")
#
###

plt.show()


##### TF-IDF :

In [None]:
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(5, 5),
    dpi=pc_dpi,
)

# One point per product, coloured by its first-level category.
g = sns.scatterplot(data=df_reduced, x="umap_tfidf_comp_0", y="umap_tfidf_comp_1", hue="first_category", ax=ax1)

###
# Titles/Labels
# Configure the legend in a single call. A trailing `ax1.legend(...)` used to
# rebuild the legend from scratch, which discarded the column count, frame and
# font size set just before it.
sns.move_legend(
    ax1, "upper right",
    bbox_to_anchor=(1, 1),
    ncol=2,
    title=None,
    frameon=True,
    fontsize=5,
)
fig.suptitle("UMAP reduced TF-IDF with first categories of products")
#
###

plt.show()


##### Observation :

- On both methods, the categories do not seem to exhibit major differences
- The loss of information might be too important for this approach to be relevant
- Reduction on tf-idf might be a little bit more pertinent as we see some clusters like baby care and watches isolated on the scatterplot

### 1.3.2 : Classification on Bag of Words, UMAP reduced data

In [None]:
# Bag-of-words subset: the two UMAP components plus the encoded target.
cols_bow = [
    "umap_cv_comp_0",
    "umap_cv_comp_1",
    "enc_category",
]

df_bow_umap = df_reduced.loc[:, cols_bow]


In [None]:
# Same 70/30 split and seed as for the unreduced features.
bow_umap_features = df_bow_umap[["umap_cv_comp_0", "umap_cv_comp_1"]]
bow_umap_target = df_bow_umap["enc_category"]

X_train_bow_umap, X_test_bow_umap, y_train_bow_umap, y_test_bow_umap = train_test_split(
    bow_umap_features, bow_umap_target, test_size=0.3, random_state=123
)


In [None]:
# Train on the two scaled UMAP components, then predict the held-out split.
clf = MultinomialNB().fit(X_train_bow_umap, y_train_bow_umap)
predictions = clf.predict(X_test_bow_umap)


In [None]:
# Overall accuracy of the model trained on UMAP-reduced bag of words.
accuracy_bow_umap = accuracy_score(y_test_bow_umap, predictions)
print(accuracy_bow_umap)


In [None]:
# The accuracy looks poor, so inspect the errors with a confusion matrix.
conf_matrix = confusion_matrix(y_true=y_test_bow_umap, y_pred=predictions)
confusion_matrix_df_bow_umap = pd.DataFrame(conf_matrix)


In [None]:
# Render the confusion matrix; rows/columns are positional because the frame
# was built without explicit labels.
display(confusion_matrix_df_bow_umap)


##### Observation :

- Overall, the loss of information during UMAP reduction seems to be too large for the model to be precise, with an accuracy of only 24.4%. It is likely that much of the variance was lost during the dimension reduction process, making it difficult for the classification model to be relevant on BoW.

### 1.3.3 : Classification on TF-IDF, UMAP reduced data

In [None]:
# TF-IDF subset: the two UMAP components plus the encoded target.
cols_tf_idf = [
    "umap_tfidf_comp_0",
    "umap_tfidf_comp_1",
    "enc_category",
]

df_tf_idf_umap = df_reduced.loc[:, cols_tf_idf]


In [None]:
# Same 70/30 split and seed as for the unreduced features.
tfidf_umap_features = df_tf_idf_umap[["umap_tfidf_comp_0", "umap_tfidf_comp_1"]]
tfidf_umap_target = df_tf_idf_umap["enc_category"]

X_train_tfidf_umap, X_test_tfidf_umap, y_train_tfidf_umap, y_test_tfidf_umap = train_test_split(
    tfidf_umap_features, tfidf_umap_target, test_size=0.3, random_state=123
)


In [None]:
# Train on the two scaled UMAP components, then predict the held-out split.
clf = MultinomialNB().fit(X_train_tfidf_umap, y_train_tfidf_umap)
predictions = clf.predict(X_test_tfidf_umap)


In [None]:
# Overall accuracy of the model trained on UMAP-reduced TF-IDF.
accuracy_tfidf_umap = accuracy_score(y_test_tfidf_umap, predictions)
print(accuracy_tfidf_umap)


In [None]:
# Confusion matrix for the UMAP-reduced TF-IDF model, shown as a DataFrame.
conf_matrix = confusion_matrix(y_true=y_test_tfidf_umap, y_pred=predictions)
confusion_matrix_df_umap = pd.DataFrame(conf_matrix)

display(confusion_matrix_df_umap)


##### Observation :

- Based on the confusion matrix and the reported accuracy of 14.4%, it looks like the model is not performing well. Most of the predictions made by the model are incorrect, with a high number of false positives and false negatives.
- The dimensional reduction might be to blame for the poor performance of the model. We should consider discarding the dimensional reduction approach in order to improve the model's performance.

In [None]:
# Drop the UMAP experiment objects to free a bit of memory.
del df_bow_umap, df_tf_idf_umap, accuracy_bow_umap, accuracy_tfidf_umap


# 1.4 : Conclusion on BoW and TF-IDF

- Dimensional reduction is not efficient in this case, the loss of information seems to be too large.
- Both unreduced models, however, seem to perform quite well. As explained in both cases, we could benefit from a larger sample of data but, from the information we have, both models could be used. Bag of words seems to be more accurate overall, but this might depend on the training.
- It is important to remember that the number of descriptions directly affects the length of the vectors generated by these approaches. As the number of descriptions increases, it is highly likely that the size of the matrices (which are linked to the size of the corpus' vocabulary) will grow, potentially negatively impacting the model's performance in terms of speed and memory usage.
<br><br><hr><br>
- Using word and sentence embedding might lead to better performances and better results.

# 2 : Word embedding :

&emsp;In the context of text classification, word embedding can be a pretty potent method over BoW and TF-IDF methods :
- Word embedding captures the semantic meaning of a word (if the model is well-trained), which, in the context of product classification, might present some advantages.
- It also presents the advantage of being a "dense" method, compared to BoW and TF-IDF which are "sparse" methods. Our corpus is quite small, but we can see that the BoW and TF-IDF vectors are very long (2838), though still manageable. If we wanted to upscale these methods, the vectors would be as long as there are unique terms in the dataset, which would present a big computational and size issue. As word embedding represents a word as a vector of size (usually) 300, upscaling wouldn't be such a problem.

<i>In a context where the dataset would be a much bigger sample, we could create our own word embedding model. However, since we have only 1048 product descriptions, creating a model based on this data might not be relevant enough, so we will use a pretrained model instead. It might be more accurate with more data or with a training dataset using only e-commerce data.</i>

## 2.1 : Selection of the model

&emsp;We have a lot of options (Google's word2vec on Google News, Stanford's GloVe on Wikipedia and Meta's Fasttext). These models are quite heavy (around 1Gb) but contain a lot of information. Google's model is the largest but Fasttext can also be interesting to use as it is trained on Wikipedia's corpus.