In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Training split of the 20 Newsgroups corpus, with headers, footers and quoted
# replies removed from each post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

In [2]:
# Category (newsgroup) names; the integer labels in `target` index into this list.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

Атрибут target хранит номера категорий для текстов из обучающей выборки:

In [3]:
# Integer category labels of the first ten training documents.
newsgroups_train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

Сами тексты доступны через атрибут data. Выведем текст и категорию одного произвольно выбранного примера из обучающей выборки:

In [4]:
# Index of the training document to inspect.
n = 854
# Translate the document's integer label into its newsgroup name.
sample_topic = newsgroups_train.target_names[newsgroups_train.target[n]]
print('Topic = {0}\n'.format(sample_topic))
print(newsgroups_train.data[n])

Topic = rec.motorcycles

hey... I'm pretty new to the wonderful world of motorcycles... I just
bought
a used 81 Kaw KZ650 CSR from a friend.... I was just wondering what kind of

saddle bags I could get for it (since I know nothing about them)  are there
bags for the gas tank?  how much would some cost, and how much do they
hold?
thanks for your advice!!!  I may be new to riding, but I love it
already!!!!
:)




In [5]:
# ENGLISH_STOP_WORDS is imported from its public location: the former
# `sklearn.feature_extraction.stop_words` module no longer exists in current
# scikit-learn releases. The name stays imported in case other cells use it.
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Binary bag-of-words: every document is reduced to the set of lower-cased,
# non-stop-word tokens it contains (binary=True records presence, not counts).
# stop_words='english' selects the built-in ENGLISH_STOP_WORDS list; the string
# form is accepted by both old and new scikit-learn versions, whereas recent
# versions reject a frozenset for this parameter.
vectorizer = CountVectorizer(lowercase=True, stop_words='english',
                             analyzer='word', binary=True)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset({'or', 'even', 'onto', 'several', 'some', 'throughout', 'same', 'thereby', 'yourself', 'them', 'whole', 'how', 'would', 'mill', 'cannot', 'not', 'fifteen', 'hereby', 'nowhere', 'before', 'amount', 'what', 'an', 'might', 'they', 'that', 'co', 'yet', 'anywhere', 'for', 'ever', 'sy...'has', 'other', 'rather', 'seeming', 'seem', 'done', 'twenty', 'seemed', 'most', 'take', 'herself'}),
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
# Number of newsgroup categories; reused below as the number of topics.
target_names_len = len(newsgroups_train.target_names)

In [7]:
# Show the category count.
target_names_len

20

In [8]:
def weighted_rand(weights):
    """Draw one index at random, with probability proportional to its weight.

    A single uniform sample from [0, 1) is located among the cumulative sums
    of the normalised weights, taken in ascending order of weight.

    Parameters
    ----------
    weights : array_like
        One-dimensional sequence of non-negative weights. They need not sum
        to one, but their sum must be positive and finite.

    Returns
    -------
    numpy integer
        Position in ``weights`` of the selected entry.

    Raises
    ------
    ValueError
        If ``weights`` is empty or its sum is not a positive finite number
        (previously this surfaced as an obscure float-index IndexError).
    """
    weights = np.asarray(weights)
    total = np.sum(weights)
    # The chained comparison is False for NaN as well, so NaN sums are rejected too.
    if weights.size == 0 or not 0 < total < np.inf:
        raise ValueError("weights must be non-empty and have a positive, finite sum")

    # Sort once; the same permutation maps the chosen slot back to an index.
    order = np.argsort(weights)
    weights_bounded = np.cumsum(weights[order] / total)

    rand = np.random.rand()  # random sample from a uniform distribution over [0, 1)

    # Floating-point rounding can leave the last cumulative bound marginally
    # below 1.0, so the sample may exceed every bound. Default to the last
    # (largest-weight) slot instead of falling through without a choice.
    position = len(order) - 1
    for i, upper in enumerate(weights_bounded):
        if rand < upper:
            position = i
            break
    return order[position]

In [9]:
# Demo weights: six entries of weight 1 followed by one entry of weight 10.
weights = np.array([1] * 6 + [10])

In [10]:
# Smoke test: index 6 holds 10 of the 16 weight units, so it is the most likely draw.
print(weighted_rand(weights))

6


In [11]:
# One topic id per vocabulary entry, initialised to 0. Integer dtype because the
# ids are later used to index the count arrays.
word_to_topic = np.zeros(shape=len(vectorizer.vocabulary_), dtype=int)

In [12]:
# Give every vocabulary word a uniformly random starting topic.
# A single vectorised draw replaces one weighted_rand() call per word, each of
# which rebuilt and sorted the same constant uniform weight vector. The
# distribution is unchanged: every topic id in [0, target_names_len) is equally
# likely. The array is filled in place, so its dtype and identity are kept.
word_to_topic[:] = np.random.randint(target_names_len, size=word_to_topic.shape)


In [13]:
# Dimensions shared by the count tables below.
n_docs = len(newsgroups_train.data)
n_topics = len(newsgroups_train.target_names)
n_words = len(vectorizer.vocabulary_)

num_topic = np.zeros(n_topics)                  # Counter n_k
num_topic_word = np.zeros((n_topics, n_words))  # Counter n_{k,w}
num_text_topic = np.zeros((n_docs, n_topics))   # Counter n_{d,k}
alpha = np.zeros(n_topics)                      # Distribution of topics over texts
beta = np.zeros((n_topics, n_words))            # Distribution of topics over words


In [14]:
# Build the initial count tables from the current word -> topic assignment.
# Every table is zeroed first so that re-running this cell does not add the
# same counts a second time.
for counter in (alpha, beta, num_topic, num_topic_word, num_text_topic):
    counter.fill(0)

# Vectorise the whole corpus once (sparse, one row per document) instead of
# transforming each document three times and densifying it to vocabulary length.
doc_term = vectorizer.transform(newsgroups_train.data).tocsr()
n_topics = len(num_topic)

for i in range(doc_term.shape[0]):
    label = newsgroups_train.target[i]
    # Vocabulary ids of the words present in document i. The vectorizer is
    # binary, so each id appears once per document.
    word_ids = doc_term[i].nonzero()[1]
    topics = word_to_topic[word_ids]

    # Label-based tables: alpha[k] counts documents labelled k, beta[k, w]
    # counts documents labelled k that contain word w.
    alpha[label] += 1
    beta[label, word_ids] += 1

    # Topic-based tables: how many of this document's words sit in each topic.
    per_topic = np.bincount(topics, minlength=n_topics)
    num_text_topic[i] = per_topic
    num_topic += per_topic
    # (topic, word) pairs are unique within a document, so a fancy-indexed
    # increment adds exactly one per word.
    num_topic_word[topics, word_ids] += 1

In [15]:
# Ideally this sweep should be repeated several times for a good result, but the machine could not cope with more than one pass.
# Number of full passes over the corpus; 1 reproduces the original single pass.
n_sweeps = 1

# Vectorise the corpus once; row i lists the vocabulary ids present in document i.
doc_term = vectorizer.transform(newsgroups_train.data).tocsr()

# beta is not modified by the sweep, so its per-topic totals are computed once
# instead of being re-summed over the whole vocabulary for every word and topic.
beta_totals = beta.sum(axis=1)

# NOTE(review): topics are stored per vocabulary word (word_to_topic), not per
# word occurrence. Re-assigning a word while processing one document therefore
# leaves the per-document counts of other documents containing it out of sync,
# so counts can go negative. np.abs() below only masks that; a proper fix needs
# per-(document, word) assignments set up in the initialisation cells.
for sweep in range(n_sweeps):
    for i in range(doc_term.shape[0]):
        for word in doc_term[i].nonzero()[1]:
            # Remove the word's current assignment from the counts.
            topic = word_to_topic[word]
            num_text_topic[i, topic] -= 1
            num_topic[topic] -= 1
            num_topic_word[topic, word] -= 1

            # Unnormalised weight of every topic for this (document, word) pair,
            # computed for all topics at once.
            p = ((num_text_topic[i] + alpha)
                 * (num_topic_word[:, word] + beta[:, word])
                 / (num_topic + beta_totals))

            # Sample the new topic and add the assignment back to the counts.
            topic = weighted_rand(np.abs(p))
            word_to_topic[word] = topic
            num_text_topic[i, topic] += 1
            num_topic[topic] += 1
            num_topic_word[topic, word] += 1

Выпишем топ-10 слов для каждой темы:

In [16]:
# Reverse lookup: vocabulary id -> term.
inverse_dict = {index: term for term, index in vectorizer.vocabulary_.items()}

for topic_id in range(len(newsgroups_train.target_names)):
    # Topics are labelled by their 1-based number.
    print('    Top 10 words in the Topic = ' + str(topic_id + 1))
    print()
    # Word ids in ascending order of their count under this topic.
    ranked = np.argsort(num_topic_word[topic_id])
    # Keep only the words currently assigned to this topic.
    owned = ranked[word_to_topic[ranked] == topic_id]
    # Walk the last ten entries backwards, i.e. highest count first.
    for word_id in owned[:-11:-1]:
        print(inverse_dict.get(word_id), end=' ')
    print()
    print('\n\n')

    Top 10 words in the Topic = 1

like know number run 12 original simple wanted april lines 



    Top 10 words in the Topic = 2

maybe unless home hardware friend bible personal thinking related drivers 



    Top 10 words in the Topic = 3

think want public version white press useful 26 prove member 



    Top 10 words in the Topic = 4

people day drive nice 24 ok win near parts gave 



    Top 10 words in the Topic = 5

using power place list computer making 18 future happen willing 



    Top 10 words in the Topic = 6

god quite example times given article money later similar source 



    Top 10 words in the Topic = 7

way new right better case 10 having line cause gets 



    Top 10 words in the Topic = 8

did true second heard software left law came 11 message 



    Top 10 words in the Topic = 9

ll doesn probably possible called send change hand mind check 



    Top 10 words in the Topic = 10

going long kind makes says word speed sorry access went 



    Top 10 w