# Synopsis

Create an LDA of 20news corpus using MALLET.

# Configuration

In [5]:
src_file = '20news_01.csv'

In [6]:
mallet_path = '/usr/local/Cellar/mallet/2.0.8/bin/mallet'
num_topics = 15
num_iters = 1000
show_interval = 100

# Libraries

In [7]:
import pandas as pd
import numpy as np
import sqlite3
import re
import random
import textman as tx
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Pragmas

In [8]:
%matplotlib inline

# Process

## Get corpus

In [9]:
docs = pd.read_csv(src_file, sep='\t')
docs = docs.set_index('doc_id')

## Convert corpus to tokens and vocab

We use a function from TextMan, a bespoke library that incorporates the text processing routines used in earlier notebooks.

In [10]:
tokens, vocab = tx.create_tokens_and_vocab(docs, src_col='doc_content')
tokens['token_num'] = tokens.groupby(['doc_id']).cumcount()
tokens = tokens.reset_index()[['doc_id','token_num','term_id']]
tokens = tokens[tokens.term_id.isin(vocab[vocab.go].index)]
tokens = tokens.set_index(['doc_id','token_num'])

### Add term strings

In [11]:
tokens['term_str'] = tokens.term_id.map(vocab.term)

In [12]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_id,term_str
doc_id,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1
76209,0,4557,people
76209,1,5848,sure
76209,2,4713,posts
76209,3,2671,forwarded
76209,4,5882,system


## Remove insignificant words

We use SKlearn's TFIDF vectorizor to quicky get a TFIDF vector space, which we use only to filter the words in our corpus.

In [13]:
vectorizer = TfidfVectorizer(use_idf=1, stop_words='english', token_pattern=r'[A-Za-z][A-Za-z][A-Za-z]+')
X = vectorizer.fit_transform(docs.doc_content.values.tolist())
v = pd.DataFrame(vectorizer.get_feature_names(), columns=['term_str'])
v['idf'] = vectorizer.idf_

In [14]:
v.sort_values('idf', ascending=False).head(10)

Unnamed: 0,term_str,idf
0,aaa,4.921973
3491,nicely,4.921973
3488,nhlpa,4.921973
3486,nga,4.921973
3484,newswriter,4.921973
3483,newsweek,4.921973
3481,newspaper,4.921973
3480,newsgroups,4.921973
3478,newsbytes,4.921973
3476,newly,4.921973


### Take only the most significant words

In [15]:
cutoff = 4.5
v = v[v.idf > cutoff].sort_values('idf', ascending=False).sample(1000)
my_v = v.term_str.tolist()

In [16]:
tokens = tokens[tokens.term_str.isin(my_v)]

In [17]:
# vocab = vocab[vocab.term.isin(my_v)]

## Export corpus for MALLET 

In [18]:
corpus = tx.gather_tokens(tokens, level=0, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})

In [19]:
corpus.head()

Unnamed: 0,doc_id,doc_content
0,20567,vela afterlife afterlife afterlife vela
1,20607,easter easter easter easter easter easter asso...
2,20741,saved saved champions
3,20758,known theology theology visible regards orthod...
4,20859,prayers picture picture picture prayers wounde...


In [20]:
corpus.to_csv('20news-corpus.csv', index=False)

In [21]:
!{mallet_path}

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into testing, tr

In [22]:
!{mallet_path} import-file --input 20news-corpus.csv --output 20news-corpus.mallet --keep-sequence TRUE

In [23]:
mallet_config = """
num-topics = {num_topics} 
mynum-iterations = {num_iters} 
myoutput-doc-topics = 20news-doc-topics.txt 
myoutput-topic-keys = 20news-topic-keys.txt 
myword-topic-counts-file = 20news-word-topic-counts-file.txt 
mytopic-word-weights-file = 20news-topic-word-weights-file.txt 
myxml-topic-report = 20news-topic-report.xml 
myxml-topic-phrase-report = 20news-topic-phrase-report.xml 
myshow-topics-interval {show_interval} 
myuse-symmetric-alpha = false  
myoptimize-interval = 100 
mydiagnostics-file = 20news-diagnostics.xml
"""
mallet_config_file = 'config.txt'
with  open(mallet_config_file, 'w') as myfile:
    myfile.write(mallet_config)

In [25]:
!{mallet_path} train-topics --input 20news-corpus.mallet --config config.txt

Unable to process configuration file: For input string: "{num_topics}"
Mallet LDA: 10 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 72
total tokens: 425
<10> LL/token: -5.07248
<20> LL/token: -4.96296
<30> LL/token: -4.85488
<40> LL/token: -4.79794

0	0.5	mar juris russotto bounced boulder suns cnn umd regards alternative kimbark known motss abu centris 
1	0.5	essence godhead orthodox asignation gregory substance equal identical ascribed distinctions visible theology salaries 
2	0.5	lie hovig cso bayonets mode champions pouring wpi drivers 
3	0.5	edm det tor hfd phi pit stl saved 
4	0.5	yankees girls printer maddux rickert makhlouf masses abu licence fonts strip iii 
5	0.5	journalists shot religion throws messiah wounds rotation reno btr unm barry drivers visible 
6	0.5	sabo decisions bryn plenty hade hernlem greig veal harvard 
7	0.5	reasoning arguing principles senators record representatives kaldis rule afterlife nextwork belong 
8	0.5	easter khojalu born wounded pi