In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import spacy
import re
import time
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from collections import Counter
from umap import UMAP
from pandarallel import pandarallel

sys.path.append("../")

load_dotenv()
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

try:
    pc_dpi = int(os.getenv('DPI'))
except TypeError:
    pc_dpi = 100
if pc_dpi is None:
    pc_dpi = 100

pandarallel.initialize(progress_bar=True)


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# NLP object creation :

nlp = spacy.load("en_core_web_lg")

# nlp._config  # Checking : tok2vec is in config by default.


# Feature engineering :

- Bag of words w/ sklearn.CountVectorizer
- TF-IDF
- Dimensional reduction of CV and Tf-Idf with UMAP reducer


In [3]:
# Reading cleaned dataset, using pickle allows type preservation (spacy doc, np array etc.)

df = pd.read_pickle(filepath_or_buffer="../data/ecommerce_cleaned.pkl")


In [4]:
df.head(n=1)


Unnamed: 0,product_name,doc_desc,lem_desc,category_tree,product_specifications,image,description
0,Elegance Polyester Multicolor Abstract Eyelet ...,"(key, elegance, polyester, multicolor, abstrac...","[key, elegance, polyester, multicolor, abstrac...","[Home Furnishing, Curtains & Accessories, Curt...","{""product_specification""=>[{""key""=>""Brand"", ""v...",55b85ea15a1536d46b7190ad6fff8ce7.jpg,Key Features of Elegance Polyester Multicolor ...


# Feature exctraction :
&emsp;CountVectorizer & TfidfVectorizer on the whole corpus first

In [5]:
# Stopwords have already been cleaned so we dont need to add them to the vectorizer

count_vectorizer = CountVectorizer(tokenizer=nlp, max_df=0.95, min_df=1)
tf_idf_vectorizer = TfidfVectorizer(tokenizer=nlp, max_df=0.95, min_df=1)


In [6]:
def get_vector(row):
    """
    returns the vector of the spacy.doc object in col doc_desc
    """
    desc_doc = row["doc_desc"]
    desc_vec = desc_doc.vector
    return desc_vec


def list_to_str(row):
    """
    Turns lem_desc, a list of tokens, into a string for CV and TF-IDF usage,
    returns string
    """
    desc_list = row["lem_desc"]
    desc_txt = " ".join(desc_list)
    return desc_txt


In [7]:
df["text_vec"] = df.apply(func=get_vector, axis=1)
df["lem_desc_txt"] = df.apply(func=list_to_str, axis=1)


In [8]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    count_vectorizer_transform = count_vectorizer.fit_transform(df["lem_desc_txt"])
    tf_idf_transform = tf_idf_vectorizer.fit_transform(df["lem_desc_txt"])


In [9]:
print(count_vectorizer_transform.shape)
print(tf_idf_transform.shape)


(1048, 39630)
(1048, 39630)


In [10]:
df.__len__()


1048

# UMAP on vectors from count_vectorizer and tf-idf :

- This allows to reduce the dimensions from 39630 to 2
- UMAP is globally faster than T-SNE for often better results

In [11]:
# Dimension reduction using UMAP, settings by default, 2 components

umap = UMAP(n_components=2)

umap_cv = umap.fit_transform(count_vectorizer_transform)
umap_tfidf = umap.fit_transform(tf_idf_transform)

print(umap_cv.shape)
print(umap_tfidf.shape)


(1048, 2)
(1048, 2)


### Assigning reduced components to df :

- saving component 0 and 1 for CountVectorizer (umap_cv) and tf-idf (umap_tfidf)
- assigning 2 cols for each method

In [12]:
df["umap_cv_comp_0"] = np.nan
df["umap_cv_comp_1"] = np.nan
df["umap_tfidf_comp_0"] = np.nan
df["umap_tfidf_comp_1"] = np.nan

for index in range(0, len(df)):
    df.loc[index, "umap_cv_comp_0"] = umap_cv[index][0]
    df.loc[index, "umap_cv_comp_1"] = umap_cv[index][1]
    df.loc[index, "umap_tfidf_comp_0"] = umap_tfidf[index][0]
    df.loc[index, "umap_tfidf_comp_1"] = umap_tfidf[index][1]


In [13]:
# Display original text next to reduced components for cv and tf-idf :

display_cols = [
    "lem_desc_txt", "umap_cv_comp_0", "umap_cv_comp_1",
    "umap_tfidf_comp_0", "umap_tfidf_comp_1"
    ]

display(df[display_cols])


Unnamed: 0,lem_desc_txt,umap_cv_comp_0,umap_cv_comp_1,umap_tfidf_comp_0,umap_tfidf_comp_1
0,key elegance polyester multicolor abstract eye...,13.339216,5.509308,-2.302987,-2.012600
1,sathiyas cotton bath towel bath towel red yell...,12.946025,5.479076,-0.920405,1.489719
2,key santosh royal fashion cotton print king si...,10.251674,2.380722,-1.941139,-1.737872
3,key jaipur print cotton floral king size doubl...,12.310413,5.087021,-5.135172,5.910101
4,maserati time analog watch boy maserati time a...,10.461782,2.053916,-4.985846,0.771593
...,...,...,...,...,...
1043,empower extra large self adhesive sticker pack...,12.538563,4.357142,1.536660,1.411471
1044,wallmantra large vinyl sticker sticker pack br...,12.424451,4.302386,-3.849431,-0.466578
1045,uberlyfe extra large pigmented polyvinyl film ...,11.927685,4.421601,-4.167434,-0.598175
1046,wallmantra medium vinyl sticker sticker wallma...,12.306740,4.287863,-4.104576,-0.592719
