# Configuration

In [313]:
corpus_db = '../2019-02-28_Lab07/novels.db'
max_words = 10000

# For MALLET
num_topics = 20
num_iters = 1000
show_interval = 100

#  Libraries

In [314]:
import gensim
import pandas as pd
import sqlite3
import textman as tx

# Process

## Import novel corpus from database

We use SQL to get what we want quickly.

In [315]:
sql = """
SELECT * FROM token 
WHERE term_id IN (SELECT term_id FROM vocab WHERE stop = 0 ORDER BY tfidf_sum DESC LIMIT {})
-- AND (author = 'poe' OR author = 'austen') 
-- AND (pos LIKE 'N%' OR pos LIKE 'V%')
""".format(max_words)

In [316]:
with sqlite3.connect(corpus_db) as db:
    tokens = pd.read_sql(sql, db)

## Fix tokens dataframe

In [317]:
tokens = tokens.set_index(['author','book','chapter'])

In [318]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,genre,para_num,sent_num,token_num,pos,token_str,punc,num,term_str,term_id
author,book,chapter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
christie,secretadversary,1,d,0,1,1,NNP,YOUNG,0,0,young,27354
christie,secretadversary,1,d,0,1,2,NNP,"ADVENTURERS,",0,0,adventurers,399
christie,secretadversary,1,d,1,0,0,JJ,"“TOMMY,",0,0,tommy,24529
christie,secretadversary,1,d,1,0,1,JJ,old,0,0,old,16509
christie,secretadversary,1,d,1,0,2,NN,thing!”,0,0,thing,24202


## Convert tokens to a corpus for MALLET input

In [319]:
corpus = tx.gather_tokens(tokens, level=2, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})
corpus['doc_label'] = corpus.apply(lambda x: "doyle-{}-{}".format(x.book, x.chapter), 1)

In [320]:
corpus.head()

Unnamed: 0,author,book,chapter,doc_content,doc_label
0,austen,northangerabbey,1,seen catherine morland infancy supposed born h...,doyle-northangerabbey-1
1,austen,northangerabbey,2,addition already said catherine morlands perso...,doyle-northangerabbey-2
2,austen,northangerabbey,3,morning brought regular duties shops visited n...,doyle-northangerabbey-3
3,austen,northangerabbey,4,usual eagerness catherine hasten pump room nex...,doyle-northangerabbey-4
4,austen,northangerabbey,5,catherine engaged theatre evening returning sm...,doyle-northangerabbey-5


## Dump corpus to CSV file

In [321]:
corpus[['doc_label','doc_content']].to_csv('corpus.csv', index=False)

## MALLET Time

### Show MALLET options

In [322]:
!mallet 

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into testing, tr

### Import corpus

In [323]:
!mallet import-file --input corpus.csv --output corpus.mallet --keep-sequence TRUE

### Train topics

In [324]:
!mallet train-topics --input corpus.mallet --num-topics {num_topics} --num-iterations {num_iters} \
--output-doc-topics mallet-doc-topics.txt \
--output-topic-keys mallet-topic-keys.txt \
--word-topic-counts-file mallet-word-topic-counts-file.txt \
--topic-word-weights-file mallet-topic-word-weights-file.txt \
--xml-topic-report mallet-topic-report.xml \
--xml-topic-phrase-report mallet-topic-phrase-report.xml \
--show-topics-interval {show_interval} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file mallet-diagnostics.xml

Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
max tokens: 8264
total tokens: 589337
<10> LL/token: -9.40039
<20> LL/token: -8.9796
<30> LL/token: -8.83373
<40> LL/token: -8.7603
<50> LL/token: -8.71759
<60> LL/token: -8.68161
<70> LL/token: -8.65725
<80> LL/token: -8.63744
<90> LL/token: -8.6178

0	0.25	found body murder within whole windows darkness object known let far fact river evidence thousand dark boat less thus sound 
1	0.25	sir man watson henry moor baskerville since know across night cannot death last mortimer learned none dog strange stapleton path 
2	0.25	heard door night chamber light room castle seemed opened open length place distance hour great soon steps said followed terror 
3	0.25	tuppence tommy said julius sir know dont thats james jane right mrs girl young vandemeyer ive cynthia brown youre yes 
4	0.25	give daughter project bosom full name immediately possible hours part terms person arrived design character concealed discover take absence prep