In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from gensim import corpora, models, similarities, matutils
from data_extract_clean import stem_tokenizer
from analysis_functions import display_topics
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
chapters = pd.read_pickle('../data_files/chapter_corpus.pickle')

In [3]:
stop_words = stopwords.words('english')
stop_words.extend(['becaus', 'said'])
porter_stemmer = PorterStemmer()
stop_words_stemmed = [porter_stemmer.stem(word) for word in stop_words]

## Topic modeling: compare algorithms and models

### Vectorize

Using parameters from the tests in the notebook "doc_term_matrix_size" notebook.

In [4]:
# Chapters. Tf-IDF with stemming, min_df, max_df
tf1 = TfidfVectorizer(stop_words=stop_words_stemmed, ngram_range=(1, 1), tokenizer=stem_tokenizer, min_df=2, max_df=0.9)
chapters_tf1 = tf1.fit_transform(chapters)
chapters_dtm = pd.DataFrame(chapters_tf1.toarray(), columns=tf1.get_feature_names())

### LSA

In [5]:
lsa = TruncatedSVD(15)
doc_topic = lsa.fit_transform(chapters_dtm)
lsa.explained_variance_ratio_

array([0.00920405, 0.04420284, 0.03208759, 0.0279908 , 0.02507649,
       0.01969872, 0.0178067 , 0.01569217, 0.01390436, 0.01276063,
       0.01131965, 0.01012168, 0.00957697, 0.00955916, 0.00882688])

In [6]:
sum(lsa.explained_variance_ratio_)

0.26782868505682506

In [7]:
display_topics(lsa, tf1.get_feature_names(), 15)


Topic  0
rand, egwen, perrin, nynaev, mat, elayn, moirain, aviendha, lan, min, siuan, lord, aiel, thom, trolloc

Topic  1
nynaev, elayn, egwen, siuan, birgitt, sheriam, amyrlin, elaida, aviendha, novic, tower, sister, moghedien, ajah, romanda

Topic  2
mat, thom, rand, tuon, bloodi, dice, noal, olver, tylin, talman, jolin, luca, egeanin, selucia, juilin

Topic  3
elayn, perrin, nynaev, birgitt, fail, thom, galad, juilin, mat, luca, gaul, berelain, reann, hopper, whitecloak

Topic  4
mat, siuan, egwen, tuon, thom, perrin, amyrlin, bryne, sheriam, elaida, talman, noal, sitter, lelain, bloodi

Topic  5
nynaev, loial, moirain, egwen, lan, rand, ingtar, hurin, perrin, thom, ogier, verin, trolloc, fain, selen

Topic  6
siuan, cadsuan, min, moirain, sister, merean, nynaev, tamra, room, loial, myrel, street, logain, ladi, lean

Topic  7
cadsuan, min, egwen, nynaev, tuon, gawyn, seanchan, verin, daman, galad, rand, mat, alanna, perrin, semirhag

Topic  8
elayn, gawyn, birgitt, trolloc, androl,

In [8]:
Vt = pd.DataFrame(doc_topic.round(5),
             index = chapters.index)
Vt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.29825,-0.08435,0.00499,-0.04596,-0.01862,-0.03486,0.03860,-0.18299,-0.00795,-0.19560,0.22519,-0.00951,-0.00916,-0.15185,0.02623
1,0.26033,0.09163,-0.11884,-0.18856,0.14450,-0.01427,0.27197,-0.25863,-0.02397,-0.06780,-0.04258,0.01399,-0.02378,0.02102,-0.00441
2,0.36504,0.13653,-0.18595,-0.25632,0.21412,-0.07637,0.40004,-0.31146,-0.02121,-0.05955,-0.08361,-0.00114,0.03205,0.13082,0.01102
3,0.33236,0.06418,-0.10695,-0.16674,0.18094,-0.07357,0.33853,-0.24177,-0.00471,-0.01061,-0.02088,-0.01280,0.07003,0.05695,0.01918
4,0.33638,0.08491,-0.14892,-0.22384,0.19916,-0.05446,0.37861,-0.34055,-0.03127,-0.08366,-0.06263,-0.00115,0.04313,0.04678,0.01258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0.32942,-0.34041,-0.14458,0.19011,0.07913,0.06161,-0.11278,0.07322,0.00060,-0.12460,-0.11998,0.06262,0.01033,0.15952,-0.01684
700,0.31001,-0.20433,-0.07296,0.04439,-0.11509,-0.07030,-0.12208,0.00911,0.04162,-0.10809,-0.06221,0.08851,-0.00100,0.21042,-0.00220
701,0.22579,-0.00398,0.02998,-0.02122,-0.09334,-0.09871,-0.02736,-0.02420,0.12164,-0.11240,-0.00627,0.03522,-0.02136,0.21027,0.01544
702,0.38527,-0.27729,-0.13089,0.04401,-0.09666,0.11449,-0.04039,0.03280,-0.03083,-0.08349,-0.06343,-0.08716,-0.02096,0.15128,-0.05095


**Impression**  
There is a lot of overlap in the topics here, but it would be possible to name some of them based on the included words. I know from earlier tests that NMF comes up with better defined topics, so I am going to use that over LSA and compare to LDA below

### NMF

In [9]:
nmf_model = NMF(15)
doc_topic = nmf_model.fit_transform(chapters_dtm)

In [10]:
# Naming topics from NMF:
topics = ['rand_main', 'Nynaeve', 'mat', 'perrin_wolfbrother', 'egwene_amyrlin', 'loial_rand', 'siuan_moiraine',
         'aviendha_aiel', 'elayne_queen', 'moiraine_lan', 'tuon_seanchan', 'puppeteers_main', 'min_viewings', 'black_tower',
         'gawyn']

In [11]:
display_topics(nmf_model, tf1.get_feature_names(), 15, topic_names=topics)


Topic: ' rand_main '
rand, therin, lew, basher, dragon, lord, tam, taim, saidin, maiden, sword, citi, aiel, weiramon, kill

Topic: ' Nynaeve '
nynaev, elayn, moghedien, thom, juilin, liandrin, luca, dream, telaranrhiod, door, channel, braid, galad, ship, tanchico

Topic: ' mat '
mat, thom, bloodi, talman, dice, olver, noal, gleeman, gholam, vanin, nalesean, cauthon, inn, mayb, tylin

Topic: ' perrin_wolfbrother '
perrin, fail, berelain, gaul, hopper, wolv, elya, masema, slayer, wolf, aram, galad, whitecloak, smell, lord

Topic: ' egwene_amyrlin '
egwen, amyrlin, sheriam, elaida, tower, verin, novic, sitter, ajah, silviana, romanda, hall, dream, sister, mother

Topic: ' loial_rand '
loial, hurin, rand, ingtar, ogier, verin, selen, horn, fain, lord, sted, waygat, trolloc, sniffer, perrin

Topic: ' siuan_moiraine '
siuan, moirain, sister, sheriam, bryne, elaida, myrel, amyrlin, lelain, tower, novic, romanda, sitter, lean, tamra

Topic: ' aviendha_aiel '
aviendha, aiel, wise, ami, rhuarc,

**Impressions**  
These topics are clearly defined character arcs and can be used as such. Most topics start with the name of the central character(s) in that arc. NMF does a great job of separating them out. I will compare to LDA below.

**Note:** Increasing the number of topics still yields clearly defined plots, but I consider them more minor ones (often spanning only a single book), so I am leaving them out of scope for the larger analysis.

In [12]:
doc_topics_df = pd.DataFrame(doc_topic, columns= topics)

### LDA

Using Gensim requires the doc term matrix to be in a slightly different format. I need to keep the sparse matrix (not a Dataframe), transpose it and create a dictionary with word positions.

In [13]:
# Take the sparse term-document matrix
# Transpose it so the terms are the rows
doc_word = chapters_tf1.transpose()

##### Convert to gensim
We need to convert our sparse `scipy` matrix to a `gensim`-friendly object called a Corpus:

In [14]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word)

##### Map matrix rows to words (tokens)
We need to save a mapping (dict) of row id to word (token) for later use by gensim:

In [15]:
id2word = dict((v, k) for k, v in tf1.vocabulary_.items())

In [16]:
# Create lda model
lda = models.LdaModel(corpus=corpus, num_topics=15, id2word=id2word, passes=50)

In [17]:
lda.print_topics()

[(0,
  '0.006*"rand" + 0.005*"perrin" + 0.004*"egwen" + 0.004*"mat" + 0.004*"nynaev" + 0.004*"elayn" + 0.003*"moirain" + 0.002*"siuan" + 0.002*"aviendha" + 0.002*"min"'),
 (1,
  '0.002*"merean" + 0.001*"edeyn" + 0.001*"almen" + 0.001*"bri" + 0.001*"larel" + 0.001*"isel" + 0.001*"diryk" + 0.000*"charn" + 0.000*"aesdaishar" + 0.000*"adan"'),
 (2,
  '0.000*"mat" + 0.000*"rand" + 0.000*"thom" + 0.000*"egwen" + 0.000*"feran" + 0.000*"elayn" + 0.000*"barthan" + 0.000*"moirain" + 0.000*"nynaev" + 0.000*"lan"'),
 (3,
  '0.001*"alei" + 0.001*"shielyn" + 0.001*"alseen" + 0.000*"jac" + 0.000*"jess" + 0.000*"dimana" + 0.000*"maril" + 0.000*"adelorna" + 0.000*"wil" + 0.000*"lewin"'),
 (4,
  '0.001*"rolan" + 0.001*"lacil" + 0.001*"arrela" + 0.001*"chiad" + 0.001*"iralin" + 0.001*"jenn" + 0.001*"lewin" + 0.001*"bain" + 0.001*"zerah" + 0.000*"essand"'),
 (5,
  '0.002*"darkhound" + 0.001*"garenia" + 0.001*"talaan" + 0.001*"ewin" + 0.001*"naeff" + 0.001*"cair" + 0.001*"mishrail" + 0.001*"barim" + 0.001*

**Impressions**  
Many of these topics don't make sense to me. 

**Note:** It seems like after using Tf-IDF, LDA gets fixated on names and non-English (fictional) nouns. Trying this again with a normal CountVectorizer yields the following:

In [18]:
cv1 = CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=2, max_df=0.9)
chapters_cv1 = cv1.fit_transform(chapters)
chapters_dtm_cv = pd.DataFrame(chapters_cv1.toarray(), columns=cv1.get_feature_names())

In [19]:
doc_word = chapters_cv1.transpose()
corpus = matutils.Sparse2Corpus(doc_word)
id2word = dict((v, k) for k, v in cv1.vocabulary_.items())

In [20]:
lda = models.LdaModel(corpus=corpus, num_topics=15, id2word=id2word, passes=50)
lda.print_topics()

[(0,
  '0.022*"rand" + 0.018*"mat" + 0.009*"thom" + 0.005*"room" + 0.005*"street" + 0.005*"inn" + 0.005*"cloak" + 0.004*"master" + 0.004*"door" + 0.004*"caemlyn"'),
 (1,
  '0.017*"egwene" + 0.017*"rand" + 0.016*"aiel" + 0.012*"aviendha" + 0.012*"wise" + 0.009*"ones" + 0.007*"amys" + 0.006*"moiraine" + 0.005*"rhuarc" + 0.004*"maidens"'),
 (2,
  '0.024*"egwene" + 0.022*"siuan" + 0.012*"gawyn" + 0.009*"bryne" + 0.007*"tower" + 0.006*"amyrlin" + 0.006*"sheriam" + 0.005*"lelaine" + 0.005*"romanda" + 0.005*"sisters"'),
 (3,
  '0.017*"rand" + 0.006*"lan" + 0.006*"moiraine" + 0.004*"stone" + 0.004*"sword" + 0.004*"perrin" + 0.003*"power" + 0.003*"black" + 0.003*"dead" + 0.003*"dragon"'),
 (4,
  '0.016*"moiraine" + 0.011*"nynaeve" + 0.009*"siuan" + 0.005*"moghedien" + 0.005*"tower" + 0.003*"room" + 0.003*"accepted" + 0.003*"lan" + 0.003*"blue" + 0.003*"day"'),
 (5,
  '0.019*"rand" + 0.010*"loial" + 0.010*"perrin" + 0.006*"trollocs" + 0.005*"ogier" + 0.005*"lord" + 0.005*"mat" + 0.004*"moiraine"

**Impressions**  
These topics are starting to look like the ones NMF came up with, but there still some that have no clearly defined theme or topic that stands out. For instance the first three don't give a clear topic.

### Conclusion

I will stick with the NMF results using Tf-IDF without bi-grams for the rest of my analysis.

### Combine dataframes

In [26]:
doc_topics_df['clean_text'] = chapters

metadata = pd.read_pickle('../data_files/extracted_text.pickle')

combined = metadata.merge(doc_topics_df, how='left', on='clean_text')

combined['word_count'] = combined['clean_text'].str.count(' ') + 1

combined.to_pickle('../data_files/result.pickle')

In [27]:
columns = ['book_title', 'book_nr', 'chapter_title', 'chapter_nr', 'word_count', 
           'rand_main', 'Nynaeve', 'mat', 'perrin_wolfbrother', 'egwene_amyrlin', 'loial_rand', 
           'siuan_moiraine', 'aviendha_aiel', 'elayne_queen', 'moiraine_lan', 'tuon_seanchan', 'puppeteers_main', 
           'min_viewings', 'black_tower', 'gawyn']
for_viz = combined[columns]

In [28]:
# Output to csv for use in Tableau visualization
for_viz.to_csv('../data_files/for_viz.csv')