# Synopsis

Create an LDA of 20news corpus using MALLET.

# Configuration

In [10]:
src_file = '20news_01.csv'

In [29]:
mallet_path = '/usr/local/Cellar/mallet/2.0.8/bin/mallet'
num_topics = 15
num_iters = 1000
show_interval = 100

# Libraries

In [12]:
import pandas as pd
import numpy as np
import sqlite3
import re
import random
import textman as tx
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Pragmas

In [13]:
%matplotlib inline

# Process

## Get corpus

In [14]:
docs = pd.read_csv(src_file, sep='\t')
docs = docs.set_index('doc_id')

## Convert corpus to tokens and vocab

We use a function from TextMan, a bespoke library that incorporates the text processing routines used in earlier notebooks.

In [15]:
tokens, vocab = tx.create_tokens_and_vocab(docs, src_col='doc_content')
tokens['token_num'] = tokens.groupby(['doc_id']).cumcount()
tokens = tokens.reset_index()[['doc_id','token_num','term_id']]
tokens = tokens[tokens.term_id.isin(vocab[vocab.go].index)]
tokens = tokens.set_index(['doc_id','token_num'])

### Add term strings

In [16]:
tokens['term_str'] = tokens.term_id.map(vocab.term)

In [17]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_id,term_str
doc_id,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1
76209,0,4557,people
76209,1,5848,sure
76209,2,4713,posts
76209,3,2671,forwarded
76209,4,5882,system


## Remove insignificant words

We use SKlearn's TFIDF vectorizor to quicky get a TFIDF vector space, which we use only to filter the words in our corpus.

In [18]:
vectorizer = TfidfVectorizer(use_idf=1, stop_words='english', token_pattern=r'[A-Za-z][A-Za-z][A-Za-z]+')
X = vectorizer.fit_transform(docs.doc_content.values.tolist())
v = pd.DataFrame(vectorizer.get_feature_names(), columns=['term_str'])
v['idf'] = vectorizer.idf_

In [19]:
v.sort_values('idf', ascending=False).head(10)

Unnamed: 0,term_str,idf
0,aaa,4.921973
3491,nicely,4.921973
3488,nhlpa,4.921973
3486,nga,4.921973
3484,newswriter,4.921973
3483,newsweek,4.921973
3481,newspaper,4.921973
3480,newsgroups,4.921973
3478,newsbytes,4.921973
3476,newly,4.921973


### Take only the most significant words

In [20]:
cutoff = 4.5
v = v[v.idf > cutoff].sort_values('idf', ascending=False).sample(1000)
my_v = v.term_str.tolist()

In [21]:
tokens = tokens[tokens.term_str.isin(my_v)]

In [22]:
# vocab = vocab[vocab.term.isin(my_v)]

## Export corpus for MALLET 

In [23]:
corpus = tx.gather_tokens(tokens, level=0, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})

In [24]:
corpus.head()

Unnamed: 0,doc_id,doc_content
0,20567,atterlep hmmm atheists atheists hmmm atheists ...
1,20607,english easter easter easter english easter en...
2,20741,deeds deeds
3,20758,orthodox adherents unity unity eternality eter...
4,20859,female desk desk desk desk piece piece female ...


In [25]:
corpus.to_csv('20news-corpus.csv', index=False)

In [26]:
!{mallet_path}

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into testing, tr

In [27]:
!{mallet_path} import-file --input 20news-corpus.csv --output 20news-corpus.mallet --keep-sequence TRUE

In [28]:
!{mallet_path} train-topics --input 20news-corpus.mallet --num-topics {num_topics} --num-iterations {num_iters} \
--output-doc-topics 20news-doc-topics.txt \
--output-topic-keys 20news-topic-keys.txt \
--word-topic-counts-file 20news-word-topic-counts-file.txt \
--topic-word-weights-file 20news-topic-word-weights-file.txt \
--xml-topic-report 20news-topic-report.xml \
--xml-topic-phrase-report 20news-topic-phrase-report.xml \
--show-topics-interval {show_interval} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file 20news-diagnostics.xml

Mallet LDA: 15 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 50
total tokens: 471
<10> LL/token: -5.27234
<20> LL/token: -5.21331
<30> LL/token: -5.07567
<40> LL/token: -5.04412
<50> LL/token: -4.94267
<60> LL/token: -4.90945
<70> LL/token: -4.87448
<80> LL/token: -4.83011
<90> LL/token: -4.83432

0	0.33333	decisions level reasoning concentrate arguing vetos senators papers authority capable powerful atheists handle representatives errey 
1	0.33333	juris date iivx documents caught hmmm wish misc usc gsfc 
2	0.33333	vay mutlu org loving columbia dbd crap success turk biggest criminals atterlep 
3	0.33333	karabag bayonetted huseyin agdam reactions village slaughter cold azerbaijan drivers 
4	0.33333	eternality persons concept criticism unity drugs orthodox adherents regard equal biblical stereo prior farzin deeds 
5	0.33333	pitching leadoff straight chimes russotto utkvx boxscores pouring min 
6	0.33333	det edm que min centris 
7	0.33333	bryn walsh aol hade essay quality

In [21]:
20

20