In [None]:
import json

from sklearn.feature_extraction.text import TfidfVectorizer

import sqlitedatastore as datastore


# Print, for every stored document, its title followed by the ten lemmas
# with the highest TF-IDF weight.
datastore.connect()
try:
    # Each document becomes one space-separated string of token lemmas so the
    # vectorizer's word analyzer can split it back into terms.
    data = []
    doc_ids = []
    for doc_id in datastore.get_all_ids(limit=-1):
        data.append(' '.join(
            [token['lemma'] for token in datastore.get_annotation(doc_id, 'token')]))
        doc_ids.append(doc_id)

    # max_df=0.9 ignores terms that appear in more than 90% of the documents.
    # NOTE(review): with analyzer='word' the default token_pattern only keeps
    # tokens of two or more word characters, so single-character lemmas are
    # dropped -- confirm this is intended for Japanese text.
    vectorizer = TfidfVectorizer(analyzer='word', max_df=0.9)
    vecs = vectorizer.fit_transform(data)

    # Build the column-index -> term table once. The original rebuilt it for
    # every printed term, and get_feature_names() no longer exists in
    # scikit-learn >= 1.2, so prefer get_feature_names_out() when available.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    for doc_id, vec in zip(doc_ids, vecs.toarray()):
        meta_info = json.loads(datastore.get(doc_id, ['meta_info'])['meta_info'])
        title = meta_info['title']
        print(doc_id, title)

        # Rank all vocabulary columns by weight and keep the top ten.
        for w_id, tfidf in sorted(enumerate(vec), key=lambda x: x[1], reverse=True)[:10]:
            lemma = feature_names[w_id]
            print('\t{0:s}: {1:f}'.format(lemma, tfidf))
finally:
    # Release the datastore connection even if the cell fails part-way.
    datastore.close()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import sqlitedatastore as datastore

# Rank every stored document by the cosine similarity of its TF-IDF vector
# to the first document returned by the datastore.
datastore.connect()
try:
    # One space-separated lemma string per document, in datastore order.
    data = []
    doc_ids = []
    for doc_id in datastore.get_all_ids(limit=-1):
        data.append(' '.join(
            [token['lemma'] for token in datastore.get_annotation(doc_id, 'token')]))
        doc_ids.append(doc_id)

    # max_df=0.9 ignores terms that appear in more than 90% of the documents.
    vectorizer = TfidfVectorizer(analyzer='word', max_df=0.9)
    vecs = vectorizer.fit_transform(data)

    # Only the similarities to the first document are reported, so compare
    # that single row against the corpus instead of building the full
    # N x N similarity matrix. `sim` has shape (1, N).
    sim = cosine_similarity(vecs[0], vecs)
    docs = zip(doc_ids, sim[0])
    for doc_id, similarity in sorted(docs, key=lambda x: x[1], reverse=True):
        meta_info = json.loads(datastore.get(doc_id, ['meta_info'])['meta_info'])
        title = meta_info['title']
        print(doc_id, title, similarity)
finally:
    # Release the datastore connection even if the cell fails part-way.
    datastore.close()


Solrで実行したクエリ(MoreLikeThis を有効にし、id:1 の文書に類似する文書を検索):
```
http://localhost:8983/solr/doc/select?mlt.count=10&mlt=true&q=id:1&mlt.fl=content_txt_ja&mlt.maxdfpct=90&fl=id,title_txt_ja
```

In [None]:
import statistics

# List the words the language model has counted after the context
# ('古く', 'から'), most probable first.
datastore.connect()
all_ids = datastore.get_all_ids(limit=-1)
lm = statistics.create_language_model(all_ids, N=3)  # N=3: presumably a trigram model
context = ('古く', 'から')
print(context, '->')

# Candidate continuations are the entries the model holds for this context;
# each one is paired with its score and the pairs are ordered by score.
candidates = lm.context_counts(lm.vocab.lookup(context))
prob_list = sorted(
    ((word, lm.score(word, context)) for word in candidates),
    key=lambda pair: pair[1],
    reverse=True)
for word, prob in prob_list:
    print('\t{:s}: {:f}'.format(word, prob))
datastore.close()


In [None]:
import random

import cabochaparser   as parser

# Randomly reorder the tokens of a sentence (keeping the first token fixed)
# and print the 20 orderings the language model scores as most probable.
N_SHUFFLES = 1000   # number of random orderings to try
SHUFFLE_SEED = 0    # fixed seed so the listing is reproducible across runs

datastore.connect()
try:
    lm = statistics.create_language_model(datastore.get_all_ids(limit=-1), N=3)

    text = '古くから人が居住する。'
    sentences, chunks, tokens = parser.parse(text)

    # A private, seeded generator keeps the result reproducible without
    # touching the global `random` state.
    rng = random.Random(SHUFFLE_SEED)

    # A set is used so that an ordering drawn more than once is listed once.
    probabilities = set()
    for _ in range(N_SHUFFLES):
        tokens_ = tokens[1:]
        rng.shuffle(tokens_)
        tokens_shuffled = [tokens[0]] + tokens_
        # The lemma sequence is wrapped in the sentence boundary markers
        # before being handed to statistics.calc_prob.
        lemmas = ['__BOS__'] + [token['lemma']
                                for token in tokens_shuffled] + ['__EOS__']
        # Rebuild the surface string from each token's character span.
        shuffled_text = ''.join(
            [text[token['begin']:token['end']] for token in tokens_shuffled])
        probability = statistics.calc_prob(lm, lemmas, N=3)
        probabilities.add((probability, shuffled_text))

    for probability, shuffled_text in sorted(probabilities, reverse=True)[:20]:
        print('{0:e}: {1:s}'.format(probability, shuffled_text))
finally:
    # Release the datastore connection even if the cell fails part-way.
    datastore.close()


In [1]:
import itertools
import json
import logging
import math
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

from annoutil import find_xs_in_y
import sqlitedatastore as datastore

# Train an LDA topic model on fixed-size groups of sentences, print the
# learned topics, then print the inferred topic distribution of each article.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

LDA_SEED = 0  # fixed seed so repeated runs learn the same topics

datastore.connect()
try:
    # Fetch the ID list once and reuse it for both training and inference.
    doc_ids = list(datastore.get_all_ids(limit=-1))

    # One lemma list per sentence, keeping only tokens whose 'NE' annotation
    # is 'O' (presumably tokens outside any named entity -- confirm).
    sentences = []
    for doc_id in doc_ids:
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sentences.append([token['lemma'] for token in tokens
                              if token.get('NE') == 'O'])

    # Training "documents" are consecutive runs of n_sent sentences.
    # NOTE(review): the runs are cut from the flat sentence list, so a run can
    # straddle two articles -- confirm this is intended.
    n_sent = 20
    docs = [list(itertools.chain.from_iterable(sentences[i:i+n_sent]))
            for i in range(0, len(sentences), n_sent)]

    dictionary = Dictionary(docs)
    # Keep tokens found in at least 2 documents and in at most 30% of them.
    dictionary.filter_extremes(no_below=2, no_above=0.3)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    lda = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10,
                   random_state=LDA_SEED)

    # Inspect the learned topics.
    for topic in lda.show_topics(num_topics=-1, num_words=10):
        print('topic id:{0[0]:d}, words={0[1]:s}'.format(topic))

    # Estimate the topic distribution of each article.
    for doc_id in doc_ids:
        meta_info = json.loads(datastore.get(doc_id, ['meta_info'])['meta_info'])
        title = meta_info['title']
        print(title)

        doc = [token['lemma'] for token in datastore.get_annotation(doc_id, 'token')
               if token.get('NE') == 'O']
        # Topics are listed from most to least probable.
        for topic in sorted(lda.get_document_topics(dictionary.doc2bow(doc)),
                            key=lambda x: x[1], reverse=True):
            print('\ttopic id:{0[0]:d}, prob={0[1]:f}'.format(topic))
finally:
    # Release the datastore connection even if the cell fails part-way.
    datastore.close()


2020-04-30 00:22:50,230 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-04-30 00:22:50,407 : INFO : built Dictionary(7946 unique tokens: ['*', '、', '「', '」', 'ある']...) from 278 documents (total 140636 corpus positions)
2020-04-30 00:22:50,430 : INFO : discarding 3686 tokens: [('*', 278), ('、', 276), ('「', 157), ('」', 157), ('ある', 260), ('から', 248), ('が', 274), ('する', 276), ('た', 269), ('だ', 268)]...
2020-04-30 00:22:50,431 : INFO : keeping 4260 tokens which were in no less than 2 and no more than 83 (=30.0%) documents
2020-04-30 00:22:50,440 : INFO : resulting dictionary: Dictionary(4260 unique tokens: ['リーダー', '一院制', '並ぶ', '他', '任期']...)
2020-04-30 00:22:50,507 : INFO : using symmetric alpha at 0.1
2020-04-30 00:22:50,508 : INFO : using symmetric eta at 0.1
2020-04-30 00:22:50,509 : INFO : using serial LDA version on this node
2020-04-30 00:22:50,515 : INFO : running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 278 documents, u

topic id:0, words=0.020*"語" + 0.016*"教育" + 0.016*"英語" + 0.012*"版" + 0.010*"系" + 0.006*"宗教" + 0.006*"上昇" + 0.006*"か" + 0.005*"言語" + 0.005*"フランス語"
topic id:1, words=0.008*"島" + 0.006*"他" + 0.005*"人口" + 0.005*"および" + 0.005*"約" + 0.004*"』" + 0.004*"『" + 0.004*"位置" + 0.004*"表記" + 0.004*"面積"
topic id:2, words=0.014*"系" + 0.008*"機" + 0.007*"鉄道" + 0.006*"航空" + 0.006*"移民" + 0.006*"陸軍" + 0.005*"網" + 0.005*"海軍" + 0.005*"軍事" + 0.005*"人種"
topic id:3, words=0.010*"朝" + 0.007*"王国" + 0.007*"『" + 0.007*"』" + 0.006*"系" + 0.005*"文化" + 0.005*"時代" + 0.005*"天皇" + 0.004*"関係" + 0.004*"銃"
topic id:4, words=0.016*"主義" + 0.014*"軍事" + 0.010*"費" + 0.009*"映画" + 0.008*"戦争" + 0.008*"比率" + 0.008*"に対する" + 0.007*"産業" + 0.007*"次" + 0.006*"社会"
topic id:5, words=0.014*"輸出" + 0.011*"月" + 0.010*"気候" + 0.010*"位" + 0.009*"占める" + 0.009*"産業" + 0.008*"生産" + 0.007*"農業" + 0.006*"工業" + 0.006*"量"
topic id:6, words=0.011*"遺産" + 0.009*"文化" + 0.009*"出典" + 0.008*"要" + 0.008*"スポーツ" + 0.007*"高い" + 0.007*"人気" + 0.006*"技術" + 0.006*"ドル" + 0.0

In [2]:
# Print the installed gensim version so readers know which release
# produced the results in this notebook.
import gensim
print(gensim.__version__)

3.8.1
