In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import collections

In [2]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
import spacy

In [6]:
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [8]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [9]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [10]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [11]:
# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [12]:
#tokenize

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [13]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [14]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [15]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [16]:
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [17]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [18]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [19]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [20]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

In [22]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [23]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

In [24]:
# Create Corpus
texts = data_lemmatized

In [25]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [32]:
def bestcoherence(topics: list):
    coherences = []
    for topic in topics:
        print(f'У топика номер ={topic}')
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=topic, 
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)
        coherence_model_lda = CoherenceModel(model=lda_model, 
                                             texts=data_lemmatized, 
                                             dictionary=id2word, 
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print(f' такой результат {coherence_lda}')
        print()
        coherences.append((topic, coherence_lda))
        
    coherences = sorted(coherences)
    print(f'Лучший результат: {coherences[0][1]} with {coherences[0][0]} topics')
    return coherences

In [33]:
topics = [10, 15, 17, 19, 20, 23, 25, 27, 30]

best_number_of_topics = bestcoherence(topics)

У топика номер =10
 такой результат 0.4973497938960755

У топика номер =15
 такой результат 0.44582805103485196

У топика номер =17
 такой результат 0.4368353826625469

У топика номер =19
 такой результат 0.4722627871932223

У топика номер =20
 такой результат 0.4392813747423439

У топика номер =23
 такой результат 0.45151831379157337

У топика номер =25
 такой результат 0.431570330596339

У топика номер =27
 такой результат 0.4455289540437199

У топика номер =30
 такой результат 0.45269036287404113

Лучший результат: 0.4973497938960755 with 10 topics


In [34]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=10, 
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)

In [37]:
topic_counter = collections.Counter()
words = []
topics = lda_model.print_topics()
topic_words = {}
dic_for_df = []
for text in texts:
  for i in text:
    for topic in topics:
      topic_words = {}
      for elements in topic: 
        if elements is int:
          id_topic = elements
        if type(elements) is not int:
          weights_words = re.findall(r'([0-9]\.[0-9]*)\*\"(\w*)\"', str(elements))
          for j in weights_words:
            topic_words.update({j[1]:float(j[0])})
      if i in topic_words.keys():
        weight = topic_words.setdefault(i)
        topic_counter[topic[0]] += weight
        most = topic_counter.most_common(1)[0][0]
  dic_for_df.append({'text': text, 'topic': most})

In [40]:
topics_texts_df = pd.DataFrame(dic_for_df).fillna('')
topics_texts_df

Unnamed: 0,text,topic
0,"[where, thing, car, nntp_poste, host, park, li...",3
1,"[poll, final, call, summary, final, call, cloc...",3
2,"[engineering, computer, network, distribution_...",6
3,"[division, line, host, write, write, article, ...",7
4,"[question, distribution, article, write, clear...",7
...,...,...
11309,"[scan, city, reply, line, consultation, cheap,...",7
11310,"[screen, medford, old, problem, screen, blank,...",7
11311,"[este, mount, mail, group, line, instal, try, ...",7
11312,"[line, article, write, boy, embarasse, trivial...",7


In [56]:
unique_topics = []
for i in topics_texts_df['topic'].unique():
  unique_topics.append(i)

In [61]:
print(unique_topics)

[3, 6, 7, 4]


In [136]:
dic_for_df2 = []

In [138]:
def findtdidf(topic):
  texts_count = len(topics_texts_df[topics_texts_df['topic']==topic]['text'])
  for text in topics_texts_df[topics_texts_df['topic']==topic]['text'].values:
    tfidfList = []
    tfidfDict = {}
    text_len = len(text)
    for word in text:
      word_count = len(re.findall(r'{}'.format(word), str(text)))
      tf = word_count / text_len
      texts_with_word_count = collections.Counter()
      for text in topics_texts_df[topics_texts_df['topic']==topic]['text'].values:
        if word in text:
          texts_with_word_count[word] += 1
      idf = texts_count / texts_with_word_count[word]
      tf_idf = tf * idf
      tfidfDict.update({word: tf_idf})
    newdict = sorted(tfidfDict, key=tfidfDict.get, reverse=True)
    bestwords = list(newdict)[:5]
  dic_for_df2.append({'topics': topic, 'texts': text, 'tf-idf_best-words': bestwords})
  return dic_for_df2

In [139]:
onedf = findtdidf(3)

In [140]:
newdf = pd.DataFrame(onedf).fillna('')

In [None]:
newdf.append(twodf)

In [142]:
newdf

Unnamed: 0,topics,texts,tf-idf_best-words
0,3,"[poll, final, call, summary, final, call, cloc...","[poll, clock, final, upgrade, experience]"


Работа coherence score:
- проходит по частотным словам в топике
- считается частота появления двух слов из текстов рядом друг с другом
- эта частота используются для подсчета NPMI для пар высокочастотных слов, отсюда получаются векторы слов
- потом по самым вероятным словам подсчитывается косинусная близость между их векторами и сумма всех этих векторов
- coherence - среднее значение всех косинусных близостей.