In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF
from tqdm import tqdm

In [4]:
from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': '172.18.0.2', 'port': 9200}])

In [5]:
def print_top_words(model, feature_names, n_top_words):
    ret = {}
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        ret[topic_idx] = {feature_names[i]: topic[i]/sum(topic) for i in topic.argsort()[:-n_top_words - 1:-1]}
        message += " ".join([k + ' (%.5f)' % v for k, v in ret[topic_idx].items()])
        print(message)
    print()
    return ret

In [7]:
def decomposition_helper(data, method='lda', n_components=[10], vectorizer=CountVectorizer, print_num=20):
    vectorized = vectorizer(stop_words='english', max_df=0.95, 
                             min_df=2, max_features=1000)
    features = vectorized.fit_transform(data)
    ret_models = {}
    ret_topics = {}
    for nc in n_components:
        print('\nFor n_components=%d' % nc)
        if method == 'lda':
            m = LDA(n_components=nc, random_state=666, learning_method='online', n_jobs=-1)
        else:
            m = NMF(n_components=nc, random_state=666)
        m.fit(features)
        ret_models[nc] = m
        ret_topics[nc] = print_top_words(m, vectorized.get_feature_names(), print_num)
    return ret_models, ret_topics

In [8]:
def es_topic_indexer(topics, index):
    for topic_id, words_pd in topics.items():
        words, pds = [], []
        for word, pd in words_pd.items():
            words.append(word)
            pds.append(pd)
        doc = {
            'topic_id' : topic_id,
            'top_words' : words,
            'word_prob' : pds
        }
        es.index(index=index, doc_type='topic', body=doc)

In [9]:
def es_text_indexer(data, index, gold=False):
    for idx, doc in tqdm(data.items()):
        es_doc = {
            'doc_id' : idx,
            'doc_text' : doc['text'],
            'doc_topics' : ', '.join([str(x) for x in doc['topics'][:5]]),
            'doc_topics_pd' : ', '.join([str(x) for x in doc['pd'][:5]])
        }
        if gold:
            es_doc['gold_summary'] = doc['gold']
        es.index(index=index, doc_type='doc', body=es_doc)

In [10]:
news_train = fetch_20newsgroups(
    data_home='../data/20newsgroups/',
    subset='train', remove=('headers', 'footers', 'quotes'),
    shuffle=True, random_state=666)

In [21]:
%%time
lda_models, lda_topics = decomposition_helper(news_train.data, method='lda', n_components=[10, 20, 50], vectorizer=CountVectorizer)



For n_components=10
Topic #0: file (0.02282) information (0.01824) program (0.01817) available (0.01518) use (0.01491) key (0.01482) data (0.01443) code (0.01006) number (0.00960) files (0.00908) window (0.00855) server (0.00853) version (0.00762) user (0.00732) using (0.00712) faq (0.00711) software (0.00704) output (0.00703) source (0.00699) sun (0.00694)
Topic #1: year (0.03561) game (0.03404) team (0.03172) games (0.02305) play (0.02094) new (0.01958) season (0.01825) hockey (0.01623) good (0.01607) players (0.01581) league (0.01567) win (0.01477) best (0.01373) san (0.01128) player (0.01061) teams (0.01056) points (0.01042) second (0.01016) division (0.00990) nhl (0.00989)
Topic #2: ax (0.76754) max (0.05604) g9v (0.01421) b8f (0.01401) a86 (0.01170) pl (0.00993) 145 (0.00987) 1d9 (0.00806) 34u (0.00674) 1t (0.00646) 0t (0.00631) 75u (0.00587) bhj (0.00584) 2di (0.00551) giz (0.00548) 3t (0.00529) wm (0.00434) 2tm (0.00423) 7ey (0.00362) 0d (0.00348)
Topic #3: windows (0.01914) d

Topic #0: stephanopoulos (0.29394) results (0.21654) designed (0.18541) process (0.16157) reports (0.10845) conference (0.01409) position (0.00340) individual (0.00002) general (0.00002) able (0.00002) satellite (0.00002) future (0.00002) term (0.00002) paper (0.00002) station (0.00002) 90 (0.00002) power (0.00002) 500 (0.00002) radio (0.00002) long (0.00002)
Topic #1: don (0.04664) know (0.04062) just (0.03489) think (0.03241) like (0.02980) really (0.02649) didn (0.02406) say (0.02346) going (0.02269) way (0.01924) let (0.01730) want (0.01473) tell (0.01450) time (0.01406) ll (0.01355) did (0.01331) thing (0.01252) sure (0.01250) believe (0.01136) idea (0.01103)
Topic #2: points (0.10294) 50 (0.09413) service (0.08693) quality (0.07627) tv (0.06725) 100 (0.05416) play (0.04910) 40 (0.04890) total (0.04243) modem (0.04220) dave (0.04134) likely (0.03110) times (0.02962) ll (0.02225) 14 (0.02063) 10 (0.01966) line (0.01587) 75 (0.01461) order (0.01426) hours (0.01334)
Topic #3: people 

In [11]:
%%time
nmf_models, nmf_topics = decomposition_helper(news_train.data, n_components=[10, 20, 50], vectorizer=CountVectorizer)




For n_components=10
Topic #0: file (0.02282) information (0.01824) program (0.01817) available (0.01518) use (0.01491) key (0.01482) data (0.01443) code (0.01006) number (0.00960) files (0.00908) window (0.00855) server (0.00853) version (0.00762) user (0.00732) using (0.00712) faq (0.00711) software (0.00704) output (0.00703) source (0.00699) sun (0.00694)
Topic #1: year (0.03561) game (0.03404) team (0.03172) games (0.02305) play (0.02094) new (0.01958) season (0.01825) hockey (0.01623) good (0.01607) players (0.01581) league (0.01567) win (0.01477) best (0.01373) san (0.01128) player (0.01061) teams (0.01056) points (0.01042) second (0.01016) division (0.00990) nhl (0.00989)
Topic #2: ax (0.76754) max (0.05604) g9v (0.01421) b8f (0.01401) a86 (0.01170) pl (0.00993) 145 (0.00987) 1d9 (0.00806) 34u (0.00674) 1t (0.00646) 0t (0.00631) 75u (0.00587) bhj (0.00584) 2di (0.00551) giz (0.00548) 3t (0.00529) wm (0.00434) 2tm (0.00423) 7ey (0.00362) 0d (0.00348)
Topic #3: windows (0.01914) d

Topic #0: stephanopoulos (0.29394) results (0.21654) designed (0.18541) process (0.16157) reports (0.10845) conference (0.01409) position (0.00340) individual (0.00002) general (0.00002) able (0.00002) satellite (0.00002) future (0.00002) term (0.00002) paper (0.00002) station (0.00002) 90 (0.00002) power (0.00002) 500 (0.00002) radio (0.00002) long (0.00002)
Topic #1: don (0.04664) know (0.04062) just (0.03489) think (0.03241) like (0.02980) really (0.02649) didn (0.02406) say (0.02346) going (0.02269) way (0.01924) let (0.01730) want (0.01473) tell (0.01450) time (0.01406) ll (0.01355) did (0.01331) thing (0.01252) sure (0.01250) believe (0.01136) idea (0.01103)
Topic #2: points (0.10294) 50 (0.09413) service (0.08693) quality (0.07627) tv (0.06725) 100 (0.05416) play (0.04910) 40 (0.04890) total (0.04243) modem (0.04220) dave (0.04134) likely (0.03110) times (0.02962) ll (0.02225) 14 (0.02063) 10 (0.01966) line (0.01587) 75 (0.01461) order (0.01426) hours (0.01334)
Topic #3: people 

In [52]:
es_topic_indexer(lda_topics[10], 'topics20ng')
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, 
                             min_df=2, max_features=1000)
vectorizer.fit(news_train.data)
_20ng_data = {}
for idx, doc in tqdm(enumerate(news_train.data[:100])):
    _20ng_data[idx] = {}
    _20ng_data[idx]['text'] = doc
    pd = lda_models[10].transform(vectorizer.transform([doc]))[0]
    _20ng_data[idx]['topics'] = pd.argsort()[::-1]
    _20ng_data[idx]['pd'] = pd[pd.argsort()[::-1]]

es_text_indexer(_20ng_data, '20ng', False)



100it [00:20,  4.99it/s]
100%|██████████| 100/100 [00:00<00:00, 191.42it/s]


In [14]:
news_train.data[5], news_train.target_names[news_train.target[5]]

('\n\nso when is PRODIGY going to open the doors for inetgate to accept\ninternet mail eh?\n\nobviously if you can post news, mail should go through as well..\n\n',
 'comp.os.ms-windows.misc')

In [44]:
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=1000)
features = vectorizer.fit_transform(news_train.data)
for n_topics, model in lda_models.items():
    best_topic = model.transform(features[0])[0].argmax()
    print(n_topics, [k for k in lda_topics[n_topics][best_topic].keys()])
    print()

10 ['use', 'right', 'government', 'don', 'scsi', 'make', 'power', 'people', 'chip', 'law', 'like', 'need', 'used', 'understand', 'encryption', 'way', 'sure', 'bit', 'want', 'does']

20 ['government', 'gun', 'president', 'people', 'law', 'states', 'war', 'public', 'state', 'american', 'rights', 'military', 'united', 'control', 'years', 'laws', 'national', 'crime', 'guns', 'police']

50 ['people', 'make', 'use', 'used', 'means', 'does', 'water', 'mean', 'non', 'person', 'difference', 'usually', 'example', 'words', 'certain', 'job', 'simply', 'better', 'gm', 'sense']



## DUC Data

In [45]:
data_path = '../../data/DUC2001/'
docs = {}

def get_docs(path):
    import os
    from bs4 import BeautifulSoup
    gold_path = path + "Summaries"
    
    ret_dict = {}
    
    doc_files = os.listdir(path)
    gold_files = os.listdir(gold_path)

    for gold_file in gold_files:
        file_name =  gold_file.split('.')[0].upper()
        if file_name not in doc_files:
            print('no file for %s' % gold_file)
            continue

        sum_path = gold_path + '/' + gold_file

        with open(sum_path) as sum_file:                
            summary, doc = sum_file.read().strip().split('Introduction:')
            summary = summary.split('Abstract:')[1]
            
            ret_dict[file_name] = [doc.strip(), summary.strip()]
    return ret_dict

docs = get_docs(data_path)
duc_data = [v[0] for k, v in docs.items()]
lda_models_duc, lda_topics_duc = decomposition_helper(duc_data, method='lda', n_components=[10, 20, 50], vectorizer=CountVectorizer)


no file for ap900928-0054.txt
no file for ap890325-0143.txt
no file for ap900322-0200_system.txt

For n_components=10
Topic #0: said (0.03965) johnson (0.03182) eclipse (0.02951) sun (0.01528) taylor (0.01150) people (0.01070) world (0.00955) ben (0.00793) don (0.00750) lewis (0.00694) year (0.00694) new (0.00667) 000 (0.00660) solar (0.00646) moon (0.00625) drugs (0.00607) years (0.00605) time (0.00604) mexico (0.00601) record (0.00586)
Topic #1: said (0.04877) police (0.01804) hurricane (0.01305) year (0.00937) national (0.00937) department (0.00899) earthquake (0.00862) people (0.00829) officers (0.00731) forest (0.00692) officials (0.00688) 000 (0.00673) service (0.00666) area (0.00647) chief (0.00637) fires (0.00620) damage (0.00591) years (0.00573) city (0.00571) report (0.00550)
Topic #2: gun (0.02148) right (0.01837) shining (0.01798) path (0.01788) drug (0.01307) assassination (0.01292) amendment (0.01276) nra (0.01264) people (0.01202) police (0.01189) said (0.01158) second (

Topic #0: gray (0.03521) conference (0.02737) annual (0.02052) 1984 (0.01776) hurricanes (0.01718) average (0.01510) form (0.01119) management (0.01060) money (0.01056) change (0.01051) start (0.00915) el (0.00874) 40 (0.00749) said (0.00643) 1985 (0.00613) predicted (0.00569) issue (0.00470) seven (0.00460) light (0.00452) friday (0.00427)
Topic #1: earthquake (0.05705) area (0.02373) year (0.02120) said (0.02044) earthquakes (0.01749) japan (0.01517) quake (0.01364) scale (0.01357) damage (0.01331) california (0.01322) 000 (0.01256) people (0.01032) region (0.00934) areas (0.00932) survey (0.00911) scientists (0.00893) santa (0.00865) water (0.00843) recorded (0.00794) reported (0.00785)
Topic #2: arms (0.00373) militia (0.00355) right (0.00293) gun (0.00269) amendment (0.00244) link (0.00230) bear (0.00228) nafta (0.00223) congress (0.00222) people (0.00220) tunnel (0.00210) rail (0.00189) state (0.00187) service (0.00184) free (0.00172) second (0.00169) constitutional (0.00169) pre

In [46]:
es_topic_indexer(lda_topics_duc[10], 'topicsduc')
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, 
                             min_df=2, max_features=1000)

vectorizer.fit(duc_data)
_duc_data = {}

for i, (idx, doc) in tqdm(enumerate(docs.items())):
    _duc_data[idx] = {}
    _duc_data[idx]['text'] = doc[0]
    _duc_data[idx]['gold'] = doc[1]
    pd = lda_models_duc[10].transform(vectorizer.transform([doc[0]]))[0]
    _duc_data[idx]['topics'] = pd.argsort()[::-1]
    _duc_data[idx]['pd'] = pd[pd.argsort()[::-1]]
    
es_text_indexer(_duc_data, 'duc', gold=True)

301it [00:54,  5.57it/s]
100%|██████████| 301/301 [00:01<00:00, 168.07it/s]


In [47]:
_text = {
    'size' : 301,
    'query': {
        'match_all' : {}
    }
}
data = es.search(index='duc', doc_type='doc', body=_text)['hits']['hits']

In [48]:
data

[{'_id': 'psL4-GIBkVsTvEc79OwI',
  '_index': 'duc',
  '_score': 1.0,
  '_source': {'doc_id': 'AP901010-0036',
   'doc_text': "A U.S. Air Force F-111\nfighter-bomber crashed today in Saudi Arabia, killing both crew\nmembers, U.S. military officials reported.\n   It was the fourth American aircraft to crash in three days among\nthose deployed to this kingdom for Operation Desert Shield.\n   Eight Marines are missing in the crash of two helicopters in the\nnorthern Arabian Sea on Monday. An Air Force F-4 reconnaissance jet\nalso went down that day, killing both crew members.\n   Lt. Cmdr. J.D. van Sickle, a military spokesman, said the F-111\ncrashed in the ``southern Arabian peninsula'' while on a training\nmission and that the incident was under investigation.\n   The names of the flyers were withheld pending notification of\nnext of kin.\n   The aircraft was attached the 48th Tactical Fighter Wing based\nat RAF Lakenheath Air Base in Britain. The wing was sent to Turkey\nas U.S. forces

In [50]:
_duc_data['AP880217-0175'].keys()

dict_keys(['text', 'gold', 'topics', 'pd'])