In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import spacy
import re
import time
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from collections import Counter
from umap import UMAP
from pandarallel import pandarallel

sys.path.append("../")

load_dotenv()
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

try:
    pc_dpi = int(os.getenv('DPI'))
except TypeError:
    pc_dpi = 100
if pc_dpi is None:
    pc_dpi = 100

pandarallel.initialize(progress_bar=True)


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# NLP object creation :

nlp = spacy.load("en_core_web_lg")

# nlp._config  # Checking : tok2vec is in config by default.


Vectorization Feature Engineering (TF-IDF)


In [3]:
df = pd.read_pickle(filepath_or_buffer="../data/ecommerce_cleaned.pkl")


In [4]:
df.head(n=1)


Unnamed: 0,product_name,doc_desc,lem_desc,category_tree,product_specifications,image,description
0,Elegance Polyester Multicolor Abstract Eyelet ...,"(key, elegance, polyester, multicolor, abstrac...","[key, elegance, polyester, multicolor, abstrac...","[Home Furnishing, Curtains & Accessories, Curt...","{""product_specification""=>[{""key""=>""Brand"", ""v...",55b85ea15a1536d46b7190ad6fff8ce7.jpg,Key Features of Elegance Polyester Multicolor ...


In [5]:
# Stopwords have already been cleaned so we dont need to add them to the vectorizer

bag_of_words_vectorizer = CountVectorizer(tokenizer=nlp, max_df=0.95, min_df=1)
tf_idf_vectorizer = TfidfVectorizer(tokenizer=nlp, max_df=0.95, min_df=1)


In [6]:
def get_vector(row):
    tweet_doc = row["doc_desc"]
    tweet_vec = tweet_doc.vector
    return tweet_vec


def list_to_str(row):
    desc_list = row["lem_desc"]
    desc_txt = " ".join(desc_list)
    return desc_txt


In [7]:
df["text_vec"] = df.apply(func=get_vector, axis=1)
df["lem_desc_txt"] = df.apply(func=list_to_str, axis=1)


In [8]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    bag_of_words_vectorizer.fit(df["lem_desc_txt"])
    tf_idf_vectorizer.fit(df["lem_desc_txt"])


In [9]:
count_vectorizer_transform = bag_of_words_vectorizer.transform(df["lem_desc_txt"])
tf_idf_transform = tf_idf_vectorizer.transform(df["lem_desc_txt"])


In [10]:
print(count_vectorizer_transform.shape)
print(tf_idf_transform.shape)


(1048, 39630)
(1048, 39630)


In [11]:
df.__len__()


1048

In [12]:
# Dimension reduction using UMAP, settings by default, 2 components

umap = UMAP(n_components=2)

umap_cv = umap.fit_transform(count_vectorizer_transform)
umap_tfidf = umap.fit_transform(tf_idf_transform)

print(umap_cv.shape)
print(umap_tfidf.shape)


(1048, 2)
(1048, 2)


In [13]:
df["umap_cv_comp_0"] = np.nan
df["umap_cv_comp_1"] = np.nan
df["umap_tfidf_comp_0"] = np.nan
df["umap_tfidf_comp_1"] = np.nan

for index in range(0, len(df)):
    df.loc[index, "umap_cv_comp_0"] = umap_cv[index][0]
    df.loc[index, "umap_cv_comp_1"] = umap_cv[index][1]
    df.loc[index, "umap_tfidf_comp_0"] = umap_tfidf[index][0]
    df.loc[index, "umap_tfidf_comp_1"] = umap_tfidf[index][1]


In [14]:
display_cols = [
    "lem_desc_txt", "umap_cv_comp_0", "umap_cv_comp_1",
    "umap_tfidf_comp_0", "umap_tfidf_comp_1"
    ]

display(df[display_cols])


Unnamed: 0,lem_desc_txt,umap_cv_comp_0,umap_cv_comp_1,umap_tfidf_comp_0,umap_tfidf_comp_1
0,key elegance polyester multicolor abstract eye...,-0.482895,4.855195,6.682447,5.726218
1,sathiyas cotton bath towel bath towel red yell...,-1.744056,1.868726,7.616164,5.646738
2,key santosh royal fashion cotton print king si...,-1.068458,4.428407,7.672042,5.513602
3,key jaipur print cotton floral king size doubl...,-0.992517,0.969484,7.109655,5.864190
4,maserati time analog watch boy maserati time a...,-0.198330,5.105917,4.593555,5.320143
...,...,...,...,...,...
1043,empower extra large self adhesive sticker pack...,0.233520,4.172408,7.123092,4.136333
1044,wallmantra large vinyl sticker sticker pack br...,0.528329,3.956614,6.909503,4.222368
1045,uberlyfe extra large pigmented polyvinyl film ...,0.480174,3.816401,7.292850,3.933069
1046,wallmantra medium vinyl sticker sticker wallma...,0.500884,4.264706,6.713336,3.836522
