## Loading Necessary Libraries

In [12]:
# general
import re
import gc
import os
import csv
import time
import math
import pickle
from tqdm import tqdm

# data handling
import numpy as np
import pandas as pd

# HTML parsing
from bs4 import BeautifulSoup

# plotting
import matplotlib.pyplot as plt

# chunker
import torch
from torch import cuda

# NLP
import nltk.corpus
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from bertopic import BERTopic

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rishideychowdhury/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rishideychowdhury/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Data

In [2]:
# 2023 training split: keep only the product-type label and the free-text description.
# (Earlier variant, kept for reference: usecols=['BULLET_POINTS'].)
data = pd.read_csv(
    '../data/2023/train-v3.csv',
    usecols=['PRODUCT_TYPE_ID', 'DESCRIPTION'],
)

In [3]:
# Missing cells become empty strings, so later string operations never receive NaN.
data = data.fillna(value='')

In [5]:
# data = data.to_numpy().tolist()

In [6]:
# data = pd.DataFrame([' . '.join([str(bp) for bp in bps if str(bp) != '']) for bps in data])

In [7]:
# data

In [7]:
# Drop rows with an empty description.
# The column is selected by name rather than position: read_csv returns the
# requested columns in file order, so `iloc[:, 0]` would silently depend on
# how the CSV happens to be laid out.
data = data[data['DESCRIPTION'] != '']

In [8]:
# data_bps_idxs = data.index.to_list()

In [9]:
# Show the frame after empty descriptions were removed (pandas truncates the rendering).
data

Unnamed: 0,DESCRIPTION,PRODUCT_TYPE_ID
2,"Specifications : Color : Red , Material : Alum...",7537
3,AISHAH Women ' s Lycra Cotton Ankel Leggings ....,2996
5,HINS Brings you the most Elegant Looking Pot w...,5725
7,Aluminum Foil Stickers-good kitchen helper for...,6030
9,"Transform your home , workplace or hotel room ...",8201
...,...,...
2249688,Welcome to the wonderfully Wicked World of Aut...,123
2249689,This extra long Tall t-Shirt will be your favo...,2879
2249694,[ Brand ] : XVIEONR [ Product name ] : Fashion...,3413
2249695,Wall Clocks Are Very Attractive In Looks And E...,1574


## Cleaning

In [13]:
# define function to clean text

stop = stopwords.words('english')

# Set copy of the stopword list: membership tests are O(1) instead of a
# linear scan of the list for every token.
_stop_set = set(stop)

lemmatizer = WordNetLemmatizer()

# Compiled once so the pattern is not re-parsed for every document.
# Alternatives are tried left to right at each position:
#   1. @mention
#   2. any single character that is not an ASCII letter, digit, space or tab
#   3. scheme://url
#   4. a standalone leading "rt" marker (word-bounded so "rtx ..." is kept)
#   5. a leftover token starting with "http"
_NOISE_RE = re.compile(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt\b|http\S*"
)

def clean_text(text):
  """Normalise one document for topic modelling.

  Steps, in order:
    1. lower-case the text;
    2. delete @mentions, URLs, a leading standalone "rt", and every character
       that is not an ASCII letter, digit, space or tab;
    3. drop English stopwords;
    4. lemmatize each remaining token with WordNet.

  Characters are deleted, not replaced by a space, so tokens joined only by
  punctuation or a newline are fused (e.g. "lie-in" becomes "liein").

  Args:
    text: the raw document as a ``str``.

  Returns:
    The cleaned tokens joined by single spaces (possibly an empty string).
  """
  text = _NOISE_RE.sub("", text.lower())
  # Filter stopwords and lemmatize in a single pass over the tokens.
  return " ".join(
    lemmatizer.lemmatize(word) for word in text.split() if word not in _stop_set
  )

In [15]:
# Product type whose descriptions are topic-modelled below; change this one
# value to analyse a different product type.
TARGET_PRODUCT_TYPE_ID = 1650

data_prod = data[data['PRODUCT_TYPE_ID'] == TARGET_PRODUCT_TYPE_ID]
data_prod

Unnamed: 0,DESCRIPTION,PRODUCT_TYPE_ID
491,STORE99 Acrylic Crystal Butterfly Beads Curtai...,1650
618,WHY COZY FURNISH CURTAINS : Our premium cotton...,1650
1152,3NH Hotel Decoration Window Thickened Yarn Cor...,1650
1375,This curtain enhances the look of the interior...,1650
2437,"Curtains : Set of 3 Long Door Curtains , Mater...",1650
...,...,...
2246662,Don ’ t let the sunlight wake you up when you ...,1650
2246765,GlobalNiche Modern Velvet Solid Blackout Curta...,1650
2247261,Gdr Curtains Add A Touch Of Sophistication To ...,1650
2247730,Bring home happiness with finest quality of Cu...,1650


In [16]:
# Work on an independent single-column copy so `data_prod` keeps the raw text.
data_sub = data_prod[['DESCRIPTION']].copy(deep=True)
# Series.map replaces DataFrame.applymap, which is deprecated in recent pandas;
# on this one-column frame the result is the same.
data_sub['DESCRIPTION'] = data_sub['DESCRIPTION'].map(clean_text)
data_sub
     

Unnamed: 0,DESCRIPTION
491,store99 acrylic crystal butterfly bead curtain...
618,cozy furnish curtain premium cotton linen look...
1152,3nh hotel decoration window thickened yarn cor...
1375,curtain enhances look interior curtain made 10...
2437,curtain set 3 long door curtain material crush...
...,...
2246662,let sunlight wake want liein instead look blac...
2246765,globalniche modern velvet solid blackout curta...
2247261,gdr curtain add touch sophistication living be...
2247730,bring home happiness finest quality curtain so...


## Topic Modelling

In [None]:
# Documents to model: the cleaned DESCRIPTION column (still a pandas Series here).
docs_product_desc = data_sub['DESCRIPTION']

# spaCy pipeline handed to BERTopic as its embedding model; the listed
# components are excluded at load time.
nlp = spacy.load(
    'en_core_web_sm',
    exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'],
)

In [20]:
# Topic model using the spaCy pipeline for embeddings, reduced to 10 topics.
topic_model_product_desc = BERTopic(embedding_model=nlp, nr_topics=10)
# fit_transform is documented to take a list of strings, so convert the Series
# (whose index is non-contiguous after filtering) to a plain list first.
topics, probs = topic_model_product_desc.fit_transform(docs_product_desc.to_list())

In [21]:
# Build the topic visualisation for the spaCy-embedded model and render it.
fig = topic_model_product_desc.visualize_topics()
fig.show()

In [29]:
# Second model: BERTopic's own English embedding, with per-topic probabilities
# and progress logging enabled.
topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
)
# Note: this rebinds `topics` and `probs` to the results of this model.
topics, probs = topic_model.fit_transform(docs_product_desc.to_list())

Batches:   0%|          | 0/111 [00:00<?, ?it/s]

2023-05-29 04:45:12,382 - BERTopic - Transformed documents to Embeddings
2023-05-29 04:45:20,066 - BERTopic - Reduced dimensionality
2023-05-29 04:45:21,401 - BERTopic - Clustered reduced embeddings


In [30]:
# Per-topic summary table; show its first five rows.
freq = topic_model.get_topic_info()
freq.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,589,-1_curtain_window_size_room
1,0,119,0_market_enhances_engaged_organization
2,1,89,1_globalniche_tulle_cortina_blind
3,2,88,2_outdoor_blackout_energy_triple
4,3,69,3_heat_liein_999_blackout


In [31]:
topic_model.get_topic(0)  # Topic 0: the largest topic after -1, which has the highest count in the table above

[('market', 0.08032821562935667),
 ('enhances', 0.0603748181441646),
 ('engaged', 0.059099876078665306),
 ('organization', 0.059099876078665306),
 ('loving', 0.0588552926460896),
 ('demand', 0.05873422936075664),
 ('rate', 0.05794783041124989),
 ('competitive', 0.05723016885592878),
 ('reputed', 0.05723016885592878),
 ('romantic', 0.055236848594538354)]

In [32]:
# First ten entries of the fitted model's topic assignments.
topic_model.topics_[:10]

[1, 27, 1, 0, 5, 13, 21, 8, -1, 98]

In [33]:
# Topic visualisation for the second model (`topic_model`); the returned figure is the cell output.
topic_model.visualize_topics()

In [34]:
# Per-document table: text, assigned topic, topic name, top words, probability
# and a representative-document flag (see the columns in the output below).
topic_model.get_document_info(docs_product_desc)

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,store99 acrylic crystal butterfly bead curtain...,1,1_globalniche_tulle_cortina_blind,globalniche - tulle - cortina - blind - drape ...,0.032718,False
1,cozy furnish curtain premium cotton linen look...,27,27_52_135_inch_cm,52 - 135 - inch - cm - width - length - 63inch...,0.026296,False
2,3nh hotel decoration window thickened yarn cor...,1,1_globalniche_tulle_cortina_blind,globalniche - tulle - cortina - blind - drape ...,0.255184,False
3,curtain enhances look interior curtain made 10...,0,0_market_enhances_engaged_organization,market - enhances - engaged - organization - l...,0.385787,True
4,curtain set 3 long door curtain material crush...,5,5_foot_shrink_fastness_caters,foot - shrink - fastness - caters - wash - soa...,0.021353,False
...,...,...,...,...,...,...
3534,let sunlight wake want liein instead look blac...,3,3_heat_liein_999_blackout,heat - liein - 999 - blackout - instead - wake...,1.000000,True
3535,globalniche modern velvet solid blackout curta...,1,1_globalniche_tulle_cortina_blind,globalniche - tulle - cortina - blind - drape ...,0.119192,False
3536,gdr curtain add touch sophistication living be...,111,111_gdr_foot_stitching_door,gdr - foot - stitching - door - making - clean...,1.000000,False
3537,bring home happiness finest quality curtain so...,19,19_happiness_soulful_intersection_affordability,happiness - soulful - intersection - affordabi...,1.000000,True


In [35]:
# Text of the first document. The column is selected by name so the lookup
# does not depend on the column order of the returned frame.
topic_model.get_document_info(docs_product_desc)['Document'].iloc[0]

'store99 acrylic crystal butterfly bead curtain diy window door curtain party wedding crystal bead curtain passage decoration zh01572'

In [37]:
# Top words of the topic assigned to the first document. The column is selected
# by name so the lookup does not depend on the column order of the returned frame.
topic_model.get_document_info(docs_product_desc)['Top_n_words'].iloc[0]

'globalniche - tulle - cortina - blind - drape - sheer - voile - purple - bedroom - green'