In [None]:
import spacy
import re
from time import time
import pandas as pd

# Load the small English pipeline with the NER and parser components
# disabled; the rest of this cell only reads each token's lemma.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['ner', 'parser'],
)

def clean(texts):
    """Join the lemmas of one tokenised document into a single string.

    Args:
        texts: An iterable of token-like objects, each exposing a
            ``lemma_`` attribute (here, a document yielded by ``nlp.pipe``).

    Returns:
        The lemmas joined by single spaces when the document has more than
        two tokens; ``None`` otherwise, so that short documents can be
        dropped later with ``dropna()``.
    """
    lemmas = []
    for token in texts:
        lemmas.append(token.lemma_)

    # Documents of two tokens or fewer are treated as too short to keep.
    if len(lemmas) <= 2:
        return None
    return " ".join(lemmas)

# Lazily normalise every question: each run of non-letter characters becomes
# a single space and the result is lower-cased. This is a generator, so no
# work happens until nlp.pipe starts pulling rows from it.
remove_symbols = (
    re.sub(r"[^a-zA-Z]+", " ", str(row)).lower()
    for row in df["question"]
)

t = time()
clean_txt = list(
    map(clean, nlp.pipe(remove_symbols, batch_size=5000, n_process=-1))
)
print(f"Time to preprocess the text : {round((time()-t)/60, 2)}")

# clean() returns None for documents of two tokens or fewer; dropna() removes
# those rows before exact duplicates are discarded.
df_clean = (
    pd.DataFrame({"preprocessed_text": clean_txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

In [None]:
# Preview the first five rows of the cleaned, de-duplicated text.
df_clean.head(n=5)

In [None]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from collections import defaultdict
import multiprocessing

# Split every preprocessed question into a list of whitespace-separated tokens.
sents = [sent.split() for sent in df_clean.preprocessed_text]

# Fit a phrase model on the token lists, wrap it in Phraser, and apply it to
# the corpus; `sentences` is what the Word2Vec model below is built from.
phrases = Phrases(sents, min_count=30, progress_per=10000)
phrasegrams = Phraser(phrases)
sentences = phrasegrams[sents]

# Token frequency table. NOTE(review): this counts over `sents` (the corpus
# before the phrase model is applied), not over `sentences` - confirm that
# is the intended sanity check.
word_freq = defaultdict(int)
for sent in sents:
    for token in sent:
        word_freq[token] += 1
print("Total words :", len(word_freq))

# Ten most frequent tokens. This used to be a bare expression in the middle
# of the cell, whose value is discarded (only a cell's last expression is
# displayed), so it is printed explicitly.
print(sorted(word_freq, key=word_freq.get, reverse=True)[:10])

cores = multiprocessing.cpu_count()

word2vec = Word2Vec(
    min_count=20,
    window=10,
    vector_size=300,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    # Leave one core free, but never request fewer than one worker: on a
    # single-CPU machine `cores - 1` is 0, i.e. no training threads at all.
    workers=max(1, cores - 1),
)

t = time()
word2vec.build_vocab(sentences, progress_per=10000)
print(f"Time to built vocabulary : {round((time()-t)/60, 2)} min")

t = time()
word2vec.train(sentences, total_examples=word2vec.corpus_count, epochs=30, report_delay=1)
print(f"Time to train the model : {round((time()-t)/60, 2)} min.")