In [1]:
import pandas as pd
pd.options.display.max_columns=200
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.pipeline import make_pipeline
import bs4 as bs
import nltk
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from wordcloud import WordCloud
from PIL import Image
from pandarallel import pandarallel

In [2]:
data_bow = pd.read_csv('data_bow.csv')

In [3]:
data_bow.head()

Unnamed: 0,Id,Body,Title,Tags,text
0,40101130,consider pd.series import panda import numpy n...,calculate rolling idxmax,<python><pandas><numpy><dataframe><series>,calculate rolling idxmax consider pd.series...
1,662383,'ve using time substring str_col patindex str_...,better technique trimming leading zero sql server,<sql><sql-server><sql-server-2005><tsql><string>,better technique trimming leading zero sql ser...
2,662421,trying install java application linux machine ...,x11 display variable mean,<java><linux><variables><x11><headless>,x11 display variable mean trying install ja...
3,3520133,library allows easily conveniently create obje...,object oriented callback,<c++><oop><callback><pointer-to-member><eiffel>,object oriented callback library allows eas...
4,1396164,edit whole question unclear want use openssl.n...,why n't .net find openssl.net dll,<c#><.net><dll><dllimport><dllnotfoundexception>,why n't .net find openssl.net dll edit whol...


In [4]:
#Split the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_bow[['Title','Body','text']], data_bow['Tags'], test_size=0.2, random_state=42)

In [5]:
# création du bag of words (CountVectorizer et Tf-idf)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cvect = CountVectorizer(stop_words='english', max_df=0.9, min_df=2)
ctf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=1)

feat = 'Title'
cv_fit = cvect.fit(X_train[feat])
ctf_fit = ctf.fit(X_train[feat])

cv_transform_title = cvect.transform(X_train['Title'])  
ctf_transform_title = ctf.transform(X_train['Title'])

cv_transform_text = cvect.transform(X_train['text'])
ctf_transform_text = ctf.transform(X_train['text'])

In [6]:
cv_transform_title.shape, ctf_transform_title.shape

((40000, 9073), (40000, 19476))

In [7]:
cv_transform_text.shape, ctf_transform_text.shape

((40000, 9073), (40000, 19476))

In [8]:
#type of cv_transform_title
type(cv_transform_title)

scipy.sparse._csr.csr_matrix

In [9]:
cv_fit.vocabulary_.get('python')

6209

In [10]:
#Classement des mots les plus fréquents

cv_sum = cv_transform_title.sum(axis=0)
cv_words_freq = [(word, cv_sum[0, idx]) for word, idx in cv_fit.vocabulary_.items()]
cv_words_freq =sorted(cv_words_freq, key = lambda x: x[1], reverse=True)
cv_words_freq[:10]

[('using', 3088),
 ('file', 2062),
 ('use', 1531),
 ('error', 1483),
 ('net', 1477),
 ('android', 1367),
 ('java', 1296),
 ('python', 1279),
 ('data', 1051),
 ('window', 987)]

In [11]:
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import glob


#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

2023-04-15 10:48:02.524924: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-15 10:48:02.640134: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-15 10:48:03.511400: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-15 10:48:03.584407: I tensorflow/comp

In [12]:
X_train['Title'][0][0:90]

'calculate rolling idxmax'

In [13]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words_titles = gen_words(X_train['Title'])

print (data_words_titles[0:5])

[['how', 'exactly', 'remove', 'punctuation', 'using', 'package'], ['how', 'debug', 'bat', 'script'], ['how', 'get', 'utc', 'time', 'midnight', 'given', 'timezone'], ['meteor', 'native', 'osx', 'window', 'app'], ['any', 'quick', 'win', 'make', 'net', 'remoting', 'faster', 'single', 'machine']]


In [14]:
id2word_title = corpora.Dictionary(data_words_titles)

corpus = []
for text in data_words_titles:
    new = id2word_title.doc2bow(text)
    corpus.append(new)

print (corpus[0][0:20])

word = id2word_title[[0][:1][0]]
print (word)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]
exactly


In [15]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word_title,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [17]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word_title, mds="mmds", R=30)
vis

In [18]:
data_words = gen_words(X_train['text'])
print (data_words[0][0:20])

['how', 'exactly', 'remove', 'punctuation', 'using', 'package', 'update', 'think', 'may', 'workaround', 'solve', 'problem', 'add', 'one', 'code', 'dtms', 'dtm', 'remove', 'sparse', 'character']


In [19]:
id2word_text = corpora.Dictionary(data_words)

corpus = []
for text in data_words:
    new = id2word_text.doc2bow(text)
    corpus.append(new)

print (corpus[0][0:20])

word = id2word_text[[0][:1][0]]
print(word)

[(0, 3), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 2), (7, 2), (8, 2), (9, 1), (10, 3), (11, 17), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 4), (19, 2)]
abap


In [20]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word_text,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [23]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word_text, mds="mmds", R=30)
vis

In [24]:
#Coherence score for LDA model
def compute_coherence_values(model, id2word):
      coherence_model_lda = CoherenceModel(model=model, texts=data_words, dictionary=id2word, coherence='c_v')
      return coherence_model_lda.get_coherence()

coherence_lda_text = compute_coherence_values(lda_model, id2word_text)

In [25]:
coherence_values = []
lda_models = []
for i in range(5, 20):
      lda_models[i] = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                             id2word=id2word_text,
                                             num_topics=i,
                                             random_state=100,
                                             update_every=1,
                                             chunksize=100,
                                             passes=10,
                                             alpha="auto")
      coherence_values.append(compute_coherence_values(lda_model, id2word_text))

      #save model
      lda_models[i].save('lda_model_{}.model'.format(i))

In [31]:
# print Coherence values
for m, cv in zip(range(5, 20), coherence_values):
      print("Num Topics =", m, "has Coherence Value of", round(cv, 4))

Num Topics = 10 has Coherence Value of 0.5846
Num Topics = 11 has Coherence Value of 0.5219
Num Topics = 12 has Coherence Value of 0.5298
Num Topics = 13 has Coherence Value of 0.5114
Num Topics = 14 has Coherence Value of 0.5429
Num Topics = 15 has Coherence Value of 0.5517
Num Topics = 16 has Coherence Value of 0.5166
Num Topics = 17 has Coherence Value of 0.5124
Num Topics = 18 has Coherence Value of 0.513
Num Topics = 19 has Coherence Value of 0.4861
Num Topics = 20 has Coherence Value of 0.4915
Num Topics = 21 has Coherence Value of 0.4727
Num Topics = 22 has Coherence Value of 0.4978
Num Topics = 23 has Coherence Value of 0.481
Num Topics = 24 has Coherence Value of 0.4501
Num Topics = 25 has Coherence Value of 0.4843
Num Topics = 26 has Coherence Value of 0.4619
Num Topics = 27 has Coherence Value of 0.4649
Num Topics = 28 has Coherence Value of 0.4463
Num Topics = 29 has Coherence Value of 0.4438


In [37]:
lda_model[corpus[-1]]

[(1, 0.07482338),
 (5, 0.03430545),
 (7, 0.034209862),
 (12, 0.014480438),
 (13, 0.42337108),
 (15, 0.08560521),
 (20, 0.018532423),
 (23, 0.08812632),
 (25, 0.12633176),
 (26, 0.031207575)]