In [2]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [3]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
newsgroups_train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [5]:
n = 0
print('Topic = {0}\n'.format(newsgroups_train.target_names[newsgroups_train.target[n]]))
print(newsgroups_train.data[n])

Topic = rec.autos

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df=14, max_df=.04)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.04, max_features=None, min_df=14,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [7]:
len(vectorizer.vocabulary_)

7891

In [8]:
vectorizer.vocabulary_

{'wondering': 7794,
 'car': 1398,
 'saw': 6302,
 'door': 2473,
 'sports': 6751,
 'looked': 4353,
 'late': 4169,
 'early': 2555,
 'doors': 2474,
 'small': 6621,
 'addition': 563,
 'bumper': 1312,
 'separate': 6430,
 'rest': 6076,
 'body': 1186,
 'model': 4689,
 'engine': 2686,
 'specs': 6719,
 'production': 5617,
 'history': 3529,
 'info': 3788,
 'fair': 2914,
 'brave': 1249,
 'souls': 6679,
 'upgraded': 7478,
 'si': 6533,
 'clock': 1617,
 'oscillator': 5099,
 'shared': 6484,
 'experiences': 2842,
 'poll': 5440,
 'brief': 1262,
 'message': 4583,
 'procedure': 5600,
 'speed': 6725,
 'cpu': 2017,
 'rated': 5828,
 'add': 560,
 'cards': 1402,
 'adapters': 557,
 'heat': 3480,
 'hour': 3586,
 'usage': 7489,
 'floppy': 3066,
 'disk': 2410,
 'functionality': 3183,
 '800': 418,
 'floppies': 3065,
 'especially': 2738,
 'requested': 6033,
 'days': 2137,
 'network': 4873,
 'knowledge': 4121,
 'base': 1038,
 'upgrade': 7477,
 'haven': 3460,
 'answered': 749,
 'folks': 3080,
 'mac': 4405,
 'plus': 54

In [9]:
text = 'I was wondering if anyone out there could enlighten me on this car I saw'
x = vectorizer.transform([text])
type(x)

scipy.sparse.csr.csr_matrix

In [10]:
from tqdm import tqdm

def lda(X, numtop, a, b, n_iter=10):
    n_key = np.zeros((numtop, X.shape[1]))
    n_dk = np.zeros((X.shape[0], numtop))
    n_k = np.zeros(numtop)
    
    docs, words = X.nonzero()
    z = np.random.choice(numtop, len(docs))
    
    for doc, word, cur_z in zip(docs, words, z):
        n_dk[doc, cur_z] += 1
        n_key[cur_z, word] += 1
        n_k[cur_z] += 1
    
    for cur_iter in tqdm(range(n_iter)):
        for i in range(len(docs)):
            cur_word = words[i]
            cur_doc = docs[i]
            cur_topic = z[i]
            
            n_dk[cur_doc, cur_topic] -= 1
            n_key[cur_topic, cur_word] -= 1
            n_k[cur_topic] -= 1
            
            p = (n_dk[cur_doc, :] + a) * (n_key[:, cur_word] + b[cur_word]) / \
                (n_k + b.sum())
            z[i] = np.random.choice(np.arange(numtop), p=p / p.sum())
            
            n_dk[cur_doc, z[i]] += 1
            n_key[z[i], cur_word] += 1
            n_k[z[i]] += 1
    
    return z, n_key, n_dk, n_k

In [11]:
vectorizer.inverse_transform(x)

[array(['car', 'saw', 'wondering'], dtype='<U79')]

In [12]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 7891)

In [13]:
X_train.nnz / np.prod(X_train.shape)

0.0050781835332998635

In [14]:
numtop = 20
z, n_key, n_dk, n_k = lda(X_train,numtop, 1 * np.ones(numtop), 1 * np.ones(X_train.shape[1]), 60)

100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [31:38<00:00, 27.95s/it]


In [15]:
top_words = np.argsort(n_key, axis=1)[:, :-11:-1]

for topic in range(20):
    doc = np.zeros((1, X_train.shape[1]))
    for word in top_words[topic]:
        doc[0, word] = 1
    print('Topic {}:\t{}'.format(topic, '\t'.join(vectorizer.inverse_transform(doc)[0])))

Topic 0:	bible	christ	christian	christians	church	faith	jesus	man	religion	word
Topic 1:	banks	chastity	geb	gordon	intellect	pitt	shameful	skepticism	soon	surrender
Topic 2:	feel	hand	interesting	knows	recall	sorry	thinking	wasn	wonder	wouldn
Topic 3:	card	computer	disk	dos	mac	memory	monitor	pc	sale	video
Topic 4:	cause	common	disease	effect	evidence	result	results	study	treatment	usually
Topic 5:	article	current	david	haven	imagine	oh	posting	sort	sounds	student
Topic 6:	bike	buy	car	cars	engine	miles	ride	road	speed	turn
Topic 7:	chip	clipper	encryption	key	keys	law	phone	public	secure	security
Topic 8:	1993	april	center	date	earth	nasa	national	research	space	university
Topic 9:	buy	care	cost	deal	market	money	pay	price	sell	worth
Topic 10:	control	country	crime	gun	guns	law	laws	public	rights	states
Topic 11:	btw	couple	difference	guess	haven	news	nice	original	regards	unfortunately
Topic 12:	anybody	article	bob	info	ok	reply	short	sorry	stuff	thank
Topic 13:	advance	appreciated	c

Топик 0: религия
Топик 3: компьютеры
Топик 4: медицина
Топик 6: машины
Топик 7: шифрование
Топик 8: космос
Топик 9: экономика
Топик 10: политика
Топик 14: история
Топик 15: хоккей
Топик 16: виндоус
Топик 17: ос виндоус

Почти все из этих топиков есть в списке тренировочных тем


In [16]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']