In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from gensim import corpora, models, similarities, matutils
from data_extract_clean import stem_tokenizer
from analysis_functions import display_topics
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
chapters = pd.read_pickle('../data_files/chapter_corpus.pickle')

In [3]:
stop_words = stopwords.words('english')
stop_words.extend(['becaus', 'said'])
porter_stemmer = PorterStemmer()
stop_words_stemmed = [porter_stemmer.stem(word) for word in stop_words]

## Comparing matrix size

Using my local machine I ran into problems when the Document-Term matrix size became too large. This notebook explored the size of different models and combinations of parameters to see what would be effective in reducing matrix size while keeping sensible topics for modeling.

### Baseline

In [4]:
# Chapters. Basic Countvectorizer, no stemming, remove stop words, uni-grams only
cv1 = CountVectorizer(stop_words=stop_words, ngram_range=(1, 1))
chapters_cv1 = cv1.fit_transform(chapters)
chapters_dtm = pd.DataFrame(chapters_cv1.toarray(), columns=cv1.get_feature_names())

In [5]:
chapters_dtm.shape

(704, 48173)

In [6]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(chapters_dtm)

display_topics(nmf_model, cv1.get_feature_names(), 10)


Topic  0
would, could, one, elayne, woman, even, nynaeve, might, sedai, aes

Topic  1
would, one, mat, could, back, men, trollocs, demandred, man, like

Topic  2
rand, one, could, mat, back, would, like, even, man, head

Topic  3
perrin, would, faile, could, one, men, two, back, man, like

Topic  4
egwene, would, one, tower, sedai, siuan, could, aes, amyrlin, elaida


In [7]:
# Chapters. Same as cv1, but with bi-grams
cv2 = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2))
chapters_cv2 = cv2.fit_transform(chapters)
chapters_dtm2 = pd.DataFrame(chapters_cv2.toarray(), columns=cv2.get_feature_names())

In [8]:
chapters_dtm2.shape

(704, 1304744)

In [9]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(chapters_dtm)

display_topics(nmf_model, cv1.get_feature_names(), 10)


Topic  0
would, could, one, elayne, woman, even, nynaeve, might, sedai, aes

Topic  1
would, one, mat, could, back, men, trollocs, demandred, man, like

Topic  2
rand, one, could, mat, back, would, like, even, man, head

Topic  3
perrin, would, faile, could, one, men, two, back, man, like

Topic  4
egwene, would, one, tower, sedai, siuan, could, aes, amyrlin, elaida


### stemming

In [10]:
# Chapters. cv1 with stemming
cv1 = CountVectorizer(stop_words=stop_words_stemmed, ngram_range=(1, 1), tokenizer=stem_tokenizer)
chapters_cv1 = cv1.fit_transform(chapters)
chapters_dtm = pd.DataFrame(chapters_cv1.toarray(), columns=cv1.get_feature_names())

In [11]:
chapters_dtm.shape

(704, 33543)

In [12]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(chapters_dtm)
display_topics(nmf_model, cv1.get_feature_names(), 10)


Topic  0
would, could, one, elayn, woman, nynaev, even, like, hand, might

Topic  1
would, one, could, mat, trolloc, back, demandr, men, fight, look

Topic  2
rand, mat, one, could, look, back, would, like, eye, man

Topic  3
perrin, would, one, fail, could, men, man, like, look, two

Topic  4
egwen, would, one, siuan, tower, sedai, could, amyrlin, ae, elaida


### CV parameters

In [13]:
# Chapters. cv1 with min_df
cv1 = CountVectorizer(stop_words=stop_words, ngram_range=(1, 1), min_df=2)
chapters_cv1 = cv1.fit_transform(chapters)
chapters_dtm = pd.DataFrame(chapters_cv1.toarray(), columns=cv1.get_feature_names())

In [14]:
chapters_dtm.shape

(704, 25804)

In [15]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(chapters_dtm)
display_topics(nmf_model, cv1.get_feature_names(), 10)


Topic  0
would, could, one, elayne, woman, even, nynaeve, might, sedai, aes

Topic  1
would, one, mat, could, back, men, trollocs, demandred, man, like

Topic  2
rand, one, could, mat, back, would, like, even, man, head

Topic  3
perrin, would, faile, could, one, men, two, back, man, like

Topic  4
egwene, would, one, tower, sedai, siuan, could, aes, amyrlin, elaida


In [16]:
# Chapters. cv1 with max_df
cv1 = CountVectorizer(stop_words=stop_words, ngram_range=(1, 1), max_df=0.9)
chapters_cv1 = cv1.fit_transform(chapters)
chapters_dtm = pd.DataFrame(chapters_cv1.toarray(), columns=cv1.get_feature_names())

In [17]:
chapters_dtm.shape

(704, 48070)

In [18]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(chapters_dtm)
display_topics(nmf_model, cv1.get_feature_names(), 10)


Topic  0
mat, trollocs, demandred, battle, toward, bloody, elayne, heights, ground, air

Topic  1
elayne, nynaeve, birgitte, aviendha, room, tower, sure, went, hair, door

Topic  2
rand, min, aiel, moiraine, lord, dragon, mat, stone, rands, sword

Topic  3
perrin, faile, lord, aiel, camp, wolves, rivers, wolf, young, gaul

Topic  4
egwene, tower, siuan, amyrlin, elaida, sisters, hall, mother, gawyn, ajah


In [19]:
# Chapters. cv1 with min_df and max_df
cv1 = CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=2, max_df=0.9)
chapters_cv1 = cv1.fit_transform(chapters)
chapters_dtm = pd.DataFrame(chapters_cv1.toarray(), columns=cv1.get_feature_names())

In [20]:
chapters_dtm.shape

(704, 25587)

In [21]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(chapters_dtm)
display_topics(nmf_model, cv1.get_feature_names(), 10)


Topic  0
mat, trollocs, demandred, battle, bloody, elayne, heights, ground, air, fight

Topic  1
egwene, tower, siuan, amyrlin, elaida, sisters, hall, mother, gawyn, ajah

Topic  2
rand, min, aiel, moiraine, lord, dragon, stone, rands, sword, mat

Topic  3
perrin, faile, lord, aiel, camp, wolves, rivers, wolf, gaul, young

Topic  4
elayne, nynaeve, birgitte, aviendha, room, tower, sure, went, door, hair


### All

In [22]:
# Chapters. cv1 with stemming, min_df, max_df
cv1 = CountVectorizer(stop_words=stop_words_stemmed, ngram_range=(1, 1), tokenizer=stem_tokenizer, min_df=2, max_df=0.9)
chapters_cv1 = cv1.fit_transform(chapters)
chapters_dtm = pd.DataFrame(chapters_cv1.toarray(), columns=cv1.get_feature_names())

In [23]:
chapters_dtm.shape

(704, 15707)

In [24]:
nmf_model = NMF(20)
doc_topic = nmf_model.fit_transform(chapters_dtm)
display_topics(nmf_model, cv1.get_feature_names(), 15)


Topic  0
trolloc, demandr, fight, mat, sharan, battl, height, power, galad, channel, toward, tam, ground, weav, air

Topic  1
lord, dragon, morgas, great, niall, year, althor, name, command, aiel, queen, andor, captain, world, high

Topic  2
rand, dragon, therin, lew, loial, sword, lord, citi, basher, maiden, tam, saidin, hurin, taim, step

Topic  3
perrin, fail, wolv, wolf, smell, gaul, hopper, whitecloak, slayer, elya, camp, river, scent, lord, berelain

Topic  4
egwen, amyrlin, tower, elaida, dream, novic, mother, verin, ajah, sitter, hall, sheriam, black, oath, meidani

Topic  5
mat, thom, bloodi, dice, villag, talman, inn, noal, pull, mayb, glanc, ill, room, burn, coin

Topic  6
nynaev, elayn, moghedien, channel, thom, lan, rememb, help, juilin, sheriam, braid, dress, weav, mouth, heal

Topic  7
elayn, birgitt, aviendha, dyelin, andor, queen, caemlyn, ladi, throne, smile, sister, palac, merilil, mother, windfind

Topic  8
siuan, sister, lean, bryne, myrel, sheriam, amyrlin, lelai



In [25]:
doc_topic.shape

(704, 20)

### Try tf-idf

In [26]:
# Chapters. Tf-IDF with stemming, min_df, max_df
tf1 = TfidfVectorizer(stop_words=stop_words_stemmed, ngram_range=(1, 1), tokenizer=stem_tokenizer, min_df=2, max_df=0.9)
chapters_tf1 = tf1.fit_transform(chapters)
chapters_dtm_tf = pd.DataFrame(chapters_tf1.toarray(), columns=tf1.get_feature_names())

In [27]:
chapters_dtm_tf.shape

(704, 15707)

In [28]:
nmf_model = NMF(15)
doc_topic = nmf_model.fit_transform(chapters_dtm_tf)
display_topics(nmf_model, tf1.get_feature_names(), 15)


Topic  0
rand, therin, lew, basher, dragon, lord, tam, taim, saidin, maiden, sword, citi, aiel, weiramon, kill

Topic  1
nynaev, elayn, moghedien, thom, juilin, liandrin, luca, dream, telaranrhiod, door, channel, braid, galad, ship, tanchico

Topic  2
mat, thom, bloodi, talman, dice, olver, noal, gleeman, gholam, vanin, nalesean, cauthon, inn, mayb, tylin

Topic  3
perrin, fail, berelain, gaul, hopper, wolv, elya, masema, slayer, wolf, aram, galad, whitecloak, smell, lord

Topic  4
egwen, amyrlin, sheriam, elaida, tower, verin, novic, sitter, ajah, silviana, romanda, hall, dream, sister, mother

Topic  5
loial, hurin, rand, ingtar, ogier, verin, selen, horn, fain, lord, sted, waygat, trolloc, sniffer, perrin

Topic  6
siuan, moirain, sister, sheriam, bryne, elaida, myrel, amyrlin, lelain, tower, novic, romanda, sitter, lean, tamra

Topic  7
aviendha, aiel, wise, ami, rhuarc, maiden, bair, spear, chief, couladin, shaido, clan, melain, gaishain, rhuidean

Topic  8
elayn, birgitt, aviend

### Conclusion

It looks like using min_df = 2 and max_df = 0.9 creates a manageable doc_term matrix. It tunes to sensible topics, especially using Tf-IDF. Using bigrams does not impact the quality of topics at all, so I will not be using them