In [2]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [4]:
# https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification
data = pd.read_csv('ecommerceDataset.csv')
data = data.rename(columns={data.columns[0]: 'raw_character_text', data.columns[1]: 'spoken_words'})
data

Unnamed: 0,raw_character_text,spoken_words
0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
2,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
3,Household,Incredible Gifts India Wooden Happy Birthday U...
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
...,...,...
50419,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50420,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50421,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50422,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


In [48]:
# удаляем пустые значения
data = data.dropna().reset_index(drop=True)
data.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [49]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [50]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in data['spoken_words'])

In [51]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=4)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 7.29 mins


In [52]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(27160, 1)

In [53]:
from gensim.models.phrases import Phrases, Phraser

In [54]:
sent = [row.split() for row in df_clean['clean']]

In [55]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 16:46:46: collecting all words and their counts
INFO - 16:46:46: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 16:46:47: PROGRESS: at sentence #10000, processed 683487 words and 312227 word types
INFO - 16:46:47: PROGRESS: at sentence #20000, processed 1368338 words and 674231 word types
INFO - 16:46:48: collected 873513 token types (unigram + bigrams) from a corpus of 1935222 words and 27160 sentences
INFO - 16:46:48: merged Phrases<873513 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 16:46:48: Phrases lifecycle event {'msg': 'built Phrases<873513 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 1.42s', 'datetime': '2023-05-14T16:46:48.201147', 'gensim': '4.3.1', 'python': '3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [56]:
bigram = Phraser(phrases)

INFO - 16:46:50: exporting phrases from Phrases<873513 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 16:46:51: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<1866 phrases, min_count=30, threshold=10.0> from Phrases<873513 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.81s', 'datetime': '2023-05-14T16:46:51.556156', 'gensim': '4.3.1', 'python': '3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [57]:
sentences = bigram[sent]

In [58]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

57820

In [59]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['design', 'book', 'product', 'size', 'set', 'use', 's', 'color', 'x', 'black']

In [60]:
import multiprocessing

from gensim.models import Word2Vec

In [61]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [62]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores-1)

INFO - 16:47:05: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-05-14T16:47:05.743213', 'gensim': '4.3.1', 'python': '3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [63]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:47:10: collecting all words and their counts
INFO - 16:47:10: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:47:11: PROGRESS: at sentence #10000, processed 627309 words, keeping 24107 word types
INFO - 16:47:11: PROGRESS: at sentence #20000, processed 1272809 words, keeping 49385 word types
INFO - 16:47:11: collected 57820 word types from a corpus of 1791351 raw words and 27160 sentences
INFO - 16:47:11: Creating a fresh vocabulary
INFO - 16:47:11: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 9604 unique words (16.61% of original 57820, drops 48216)', 'datetime': '2023-05-14T16:47:11.790860', 'gensim': '4.3.1', 'python': '3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
INFO - 16:47:11: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 1637242 word corpus (91.40% of original 1791351, drops 154109)', 'datetime': '20

Time to build vocab: 0.02 mins


In [64]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:47:14: Word2Vec lifecycle event {'msg': 'training model with 15 workers on 9604 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-05-14T16:47:14.051076', 'gensim': '4.3.1', 'python': '3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
INFO - 16:47:15: EPOCH 0 - PROGRESS: at 43.33% examples, 422814 words/s, in_qsize 28, out_qsize 20
INFO - 16:47:15: EPOCH 0: training on 1791351 raw words (1050558 effective words) took 1.7s, 629307 effective words/s
INFO - 16:47:16: EPOCH 1 - PROGRESS: at 49.04% examples, 499947 words/s, in_qsize 30, out_qsize 1
INFO - 16:47:17: EPOCH 1: training on 1791351 raw words (1050513 effective words) took 2.0s, 535580 effective words/s
INFO - 16:47:18: EPOCH 2 - PROGRESS: at 29.33% examples, 283815 words/s, in_qsize 27, out_qsize 3
INFO - 16:47:19: EPOCH 2 - PROGRESS: at 61.27% examples, 330317 

Time to train the model: 1.4 mins


In [81]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [82]:
# топ-N наиболее похожих ключей.
w2v_model.wv.most_similar(positive=["color"])

[('colour', 0.7377306222915649),
 ('white', 0.46367552876472473),
 ('size', 0.44954991340637207),
 ('dark_brown', 0.4471963942050934),
 ('coloured', 0.4401470422744751),
 ('sky_blue', 0.4386478066444397),
 ('x_inch', 0.4343136250972748),
 ('yellow', 0.425954133272171),
 ('cmx', 0.42519354820251465),
 ('orchid', 0.41912955045700073)]

In [83]:
# топ-N наиболее похожих ключей.
w2v_model.wv.most_similar(positive=["product"])

[('item', 0.4581340551376343),
 ('reasonable_price', 0.43712934851646423),
 ('customer', 0.40329709649086),
 ('pricing', 0.39780914783477783),
 ('quality', 0.3960415720939636),
 ('brand', 0.3846954107284546),
 ('customer_service', 0.37859463691711426),
 ('raw_material', 0.36219289898872375),
 ('place_order', 0.35821089148521423),
 ('warranty_manufacturing', 0.349313884973526)]

In [84]:
# Вычислите косинусное сходство между двумя ключами.
w2v_model.wv.similarity('customer_service', 'reasonable_price')

0.36100146

In [85]:
# Какой ключ из приведенного списка не подходит к остальным?
w2v_model.wv.doesnt_match(['dark_brown', 'yellow', 'place_order'])

'place_order'

In [86]:
# Какое слово относится к brand также, как x_inch к place_order
w2v_model.wv.most_similar(positive=["brand", "dark_brown"], negative=["orchid"], topn=3)

[('forzza', 0.31910401582717896),
 ('cum_bed', 0.28673428297042847),
 ('ttk_prestige', 0.28248220682144165)]