In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
import os, glob
import pandas as pd
from collections import defaultdict
from pathlib import Path
import pandas as df
import pyLDAvis
import pyLDAvis.gensim

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/raulbag/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/raulbag/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
files_folder = f'./full_text/cleaned/cleaned_text'

results = defaultdict(list)
for file in Path(files_folder).glob('**/*.txt'):
    with open(file, "r") as file_open:
        results["file_name"].append(file.name)
        results["text"].append(file_open.read())
df = pd.DataFrame(results)

data_text = df[['text']]
data_text['index'] = data_text.index
documents = data_text

In [3]:
stemmer = SnowballStemmer(language='english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [4]:
doc_sample = documents[documents['index'] == 10].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['', 'testing', 'continuous', 'time', 'models', 'financial', 'markets', 'a', 'zur', 'erlangung', 'des', 'akademischen', 'grades', 'doctor', 'rerum', 'politicarum', 'rer', 'pol', 'fach', 'statistik', 'und', 'okonometrie', 'eingereicht', 'der', 'wirtschaftswissenschaftlichen', 'fakult', 'humboldt', 'universit', 'berlin', 'von', 'herrn', 'dipl', 'math', 'torsten', 'kleinow', 'geborem', 'potsdam', 'asident', 'der', 'humboldt', 'universit', 'berlin', 'prof', 'urgen', 'mlynek', 'dekan', 'der', 'wirtschaftswissenschaftlichen', 'fakult', 'prof', 'burda', 'gutachter', 'prof', 'wolfgang', 'ardle', 'priv', 'doz', 'helmut', 'herwartz', 'eingereicht', 'mai', 'tag', 'der', 'undlichen', 'ufung', 'juli', 'abstract', 'the', 'aim', 'the', 'thesis', 'provide', 'wide', 'range', 'statistical', 'methods', 'designed', 'test', 'parametric', 'assumptions', 'about', 'the', 'evolution', 'continuous', 'time', 'processes', 'nancial', 'markets', 'the', 'main', 'focus', 'the', 'statistical', 'met

['test', 'continu', 'time', 'model', 'financi', 'market', 'erlangung', 'akademischen', 'grade', 'doctor', 'rerum', 'politicarum', 'fach', 'statistik', 'okonometri', 'eingereicht', 'fakult', 'humboldt', 'universit', 'berlin', 'herrn', 'dipl', 'math', 'torsten', 'kleinow', 'geborem', 'potsdam', 'asid', 'humboldt', 'universit', 'berlin', 'prof', 'urgen', 'mlynek', 'dekan', 'fakult', 'prof', 'burda', 'gutacht', 'prof', 'wolfgang', 'ardl', 'priv', 'helmut', 'herwartz', 'eingereicht', 'undlichen', 'ufung', 'juli', 'abstract', 'thesi', 'provid', 'wide', 'rang', 'statist', 'method', 'design', 'test', 'parametr', 'assumpt', 'evolut', 'continu', 'time', 'process', 'nancial', 'market', 'main', 'focus', 'statist', 'methodolog', 'investig', 'properti', 'propos', 'method', 'appli', 'nite', 'sampl', 'aspect', 'particular', 'import', 'empir', 'applic', 'chapter', 'includ', 'empir', 'analysi', 'nancial', 'data', 'develop', 'method', 'keyword', 'mathemat', 'financ', 'statist', 'test', 'usion', 'process'

In [5]:
processed_docs = documents['text'].map(preprocess)
processed_docs[:10]

0    [adapt, method, risk, calibr, dissert, erlangu...
1    [dynam, cluster, visual, smart, data, applic, ...
2    [essay, learn, statist, implement, statist, so...
3    [model, financi, social, network, erlangung, a...
4    [rtat, erlangung, akademischen, grade, doctor,...
5    [statist, digit, financ, erlangung, akademisch...
6    [function, data, analysi, applic, financ, diss...
7    [weather, risk, manag, bond, weather, deriv, e...
8    [valuat, properti, econom, model, real, estat,...
9    [tail, event, drive, financi, risk, model, erl...
Name: text, dtype: object

In [6]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abbrevi
1 abil
2 abl
3 absolut
4 abus
5 accept
6 access
7 accord
8 account
9 accumu
10 accumul


In [7]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[10]

[(1, 2),
 (2, 6),
 (3, 1),
 (7, 9),
 (8, 2),
 (10, 1),
 (11, 1),
 (12, 3),
 (13, 4),
 (14, 2),
 (23, 9),
 (24, 1),
 (26, 12),
 (27, 2),
 (29, 7),
 (32, 1),
 (34, 4),
 (37, 2),
 (45, 1),
 (48, 1),
 (49, 8),
 (52, 3),
 (53, 22),
 (58, 2),
 (59, 64),
 (61, 1),
 (62, 2),
 (65, 1),
 (66, 1),
 (67, 5),
 (70, 11),
 (71, 17),
 (72, 82),
 (73, 26),
 (76, 23),
 (77, 4),
 (78, 54),
 (80, 13),
 (81, 8),
 (85, 23),
 (86, 2),
 (88, 2),
 (90, 5),
 (91, 7),
 (92, 4),
 (96, 1),
 (99, 1),
 (100, 3),
 (101, 22),
 (103, 4),
 (104, 35),
 (105, 43),
 (110, 72),
 (112, 1),
 (123, 20),
 (124, 7),
 (125, 8),
 (126, 15),
 (127, 3),
 (132, 2),
 (134, 28),
 (137, 45),
 (138, 5),
 (140, 60),
 (141, 2),
 (142, 6),
 (145, 1),
 (146, 1),
 (147, 5),
 (153, 1),
 (155, 1),
 (157, 10),
 (158, 2),
 (160, 4),
 (161, 5),
 (162, 1),
 (163, 13),
 (164, 1),
 (168, 3),
 (170, 5),
 (171, 3),
 (173, 7),
 (174, 1),
 (176, 8),
 (182, 56),
 (183, 16),
 (184, 5),
 (186, 1),
 (189, 2),
 (190, 31),
 (192, 6),
 (195, 2),
 (198, 17),
 (2

In [8]:
bow_doc_10 = bow_corpus[10]
for i in range(len(bow_doc_10)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_10[i][0], 
                                               dictionary[bow_doc_10[i][0]], 
bow_doc_10[i][1]))

Word 1 ("abil") appears 2 time.
Word 2 ("abl") appears 6 time.
Word 3 ("absolut") appears 1 time.
Word 7 ("accord") appears 9 time.
Word 8 ("account") appears 2 time.
Word 10 ("accumul") appears 1 time.
Word 11 ("accur") appears 1 time.
Word 12 ("accuraci") appears 3 time.
Word 13 ("achiev") appears 4 time.
Word 14 ("acknowledg") appears 2 time.
Word 23 ("adapt") appears 9 time.
Word 24 ("add") appears 1 time.
Word 26 ("addit") appears 12 time.
Word 27 ("address") appears 2 time.
Word 29 ("adjust") appears 7 time.
Word 32 ("adopt") appears 1 time.
Word 34 ("advantag") appears 4 time.
Word 37 ("advisor") appears 2 time.
Word 45 ("akademischen") appears 1 time.
Word 48 ("algebra") appears 1 time.
Word 49 ("algorithm") appears 8 time.
Word 52 ("allow") appears 3 time.
Word 53 ("altern") appears 22 time.
Word 58 ("analys") appears 2 time.
Word 59 ("analysi") appears 64 time.
Word 61 ("analyt") appears 1 time.
Word 62 ("analyz") appears 2 time.
Word 65 ("andigkeitserkl") appears 1 time.
Wor

In [9]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.003248349864707201),
 (1, 0.008316246912457506),
 (2, 0.0011769728615134876),
 (3, 0.00041193825800108413),
 (4, 0.004212435475679623),
 (5, 0.006490338899663914),
 (6, 0.0009985136768713714),
 (8, 0.0002496515575077928),
 (9, 0.008300045172605349),
 (10, 0.010035812419483505),
 (11, 0.00016813898021621253),
 (12, 0.0031936688315450357),
 (13, 0.0016813898021621253),
 (14, 0.00030370962049230824),
 (15, 0.0068440092204679615),
 (16, 0.0022284844527887928),
 (17, 0.0002496515575077928),
 (18, 0.005992282788765764),
 (19, 0.00021236410522117952),
 (20, 0.008300045172605349),
 (21, 0.008424870951359246),
 (22, 0.008424870951359246),
 (23, 0.029617143783722603),
 (24, 0.0006043095204351917),
 (25, 0.007369040809852362),
 (27, 0.0001248257787538964),
 (28, 0.016600090345210698),
 (29, 0.0002471629548006505),
 (30, 0.1826009937973177),
 (31, 0.0018069524097712188),
 (32, 0.0042301666430463415),
 (33, 0.13280072276168559),
 (34, 0.00016477530320043365),
 (35, 0.0021149099909579107),
 (

In [None]:
lda_model = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=10, alpha='symmetric',eta=0.61)

In [11]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.000*"expectil" + 0.000*"temperatur" + 0.000*"teda" + 0.000*"crix" + 0.000*"differ" + 0.000*"erent" + 0.000*"bitcoin" + 0.000*"crypto" + 0.000*"lemma" + 0.000*"effect"
Topic: 1 
Words: 0.000*"expectil" + 0.000*"temperatur" + 0.000*"copula" + 0.000*"seme" + 0.000*"surrog" + 0.000*"crypto" + 0.000*"differ" + 0.000*"pvar" + 0.000*"teda" + 0.000*"bitcoin"
Topic: 2 
Words: 0.000*"mortal" + 0.000*"lpxbhr" + 0.000*"prune" + 0.000*"teda" + 0.000*"tree" + 0.000*"treatment" + 0.000*"fertil" + 0.000*"comdti" + 0.000*"wind" + 0.000*"temperatur"
Topic: 3 
Words: 0.000*"mortal" + 0.000*"lpxbhr" + 0.000*"prune" + 0.000*"teda" + 0.000*"tree" + 0.000*"treatment" + 0.000*"fertil" + 0.000*"comdti" + 0.000*"wind" + 0.000*"temperatur"
Topic: 4 
Words: 0.000*"mortal" + 0.000*"lpxbhr" + 0.000*"prune" + 0.000*"teda" + 0.000*"treatment" + 0.000*"tree" + 0.000*"fertil" + 0.000*"wind" + 0.000*"comdti" + 0.000*"japan"


In [13]:
perplexity = lda_model.log_perplexity(bow_corpus)



#printing model perplexity

print(perplexity)

-10.696060679397752


In [10]:
corpus = gensim.utils.ClippedCorpus(corpus_tfidf, int(len(corpus_tfidf)*0.75))

lda_model_tfidf = gensim.models.LdaMulticore(corpus=corpus_tfidf,
                                           id2word=dictionary,
                                           num_topics=3, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha= 'symmetric',
                                           eta=0.61)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.000*"expectil" + 0.000*"wind" + 0.000*"mortal" + 0.000*"cryptopunk" + 0.000*"artwork" + 0.000*"uplift" + 0.000*"temperatur" + 0.000*"treatment" + 0.000*"client" + 0.000*"hedon"
Topic: 1 Word: 0.000*"teda" + 0.000*"copula" + 0.000*"lpxbhr" + 0.000*"pvar" + 0.000*"news" + 0.000*"prune" + 0.000*"portfolio" + 0.000*"letf" + 0.000*"club" + 0.000*"effect"
Topic: 2 Word: 0.000*"yaml" + 0.000*"phil" + 0.000*"bitcoin" + 0.000*"lemma" + 0.000*"faculti" + 0.000*"crypto" + 0.000*"blockchain" + 0.000*"vali" + 0.000*"citat" + 0.000*"surrog"


In [20]:
perplexity = lda_model_tfidf.log_perplexity(corpus_tfidf)



#printing model perplexity

print(perplexity)

-11.087542992512834


In [13]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score Simple: ', coherence_lda)

Coherence Score Simple:  0.5357200179625166


In [14]:
coherence_model_lda_tfidf = CoherenceModel(model=lda_model_tfidf, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score Simple: ', coherence_lda)

Coherence Score Simple:  0.5357200179625166


In [15]:
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
p

In [12]:
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
pyLDAvis.save_html(p, './results/ldavis_tuned_3topics.html')
p

<h1>Hyperparameter Tuning</h1>

In [16]:
# supporting function
def compute_coherence_perplexity_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    
    return coherence_model_lda.get_coherence(), lda_model.log_perplexity(corpus)

In [17]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(bow_corpus)
corpus_sets = [gensim.utils.ClippedCorpus(bow_corpus, int(num_of_docs*0.75)), 
               bow_corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': [],
                 'Perplexity': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv, perplexity = compute_coherence_perplexity_values(corpus=corpus_sets[i], dictionary=dictionary, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    model_results['Perplexity'].append(perplexity)
                    
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
    pbar.close()


  0%|                                                   | 0/540 [01:12<?, ?it/s][A

  0%|                                         | 1/540 [00:26<3:59:14, 26.63s/it][A
  0%|▏                                        | 2/540 [00:53<4:00:22, 26.81s/it][A
  1%|▏                                        | 3/540 [01:22<4:07:39, 27.67s/it][A
  1%|▎                                        | 4/540 [01:49<4:03:56, 27.31s/it][A
  1%|▍                                        | 5/540 [02:15<4:01:40, 27.10s/it][A
  1%|▍                                        | 6/540 [02:42<3:59:35, 26.92s/it][A
  1%|▌                                        | 7/540 [03:09<3:59:52, 27.00s/it][A
  1%|▌                                        | 8/540 [03:36<3:59:22, 27.00s/it][A
  2%|▋                                        | 9/540 [04:03<3:58:51, 26.99s/it][A
  2%|▋                                       | 10/540 [04:30<3:58:06, 26.96s/it][A
  2%|▊                                       | 11/540 [04:57<3:57:20, 26.9

 36%|█████████████▎                       | 194/540 [1:37:33<3:14:46, 33.78s/it][A
 36%|█████████████▎                       | 195/540 [1:38:06<3:13:33, 33.66s/it][A
 36%|█████████████▍                       | 196/540 [1:38:39<3:11:55, 33.48s/it][A
 36%|█████████████▍                       | 197/540 [1:39:13<3:12:22, 33.65s/it][A
 37%|█████████████▌                       | 198/540 [1:39:47<3:12:05, 33.70s/it][A
 37%|█████████████▋                       | 199/540 [1:40:21<3:12:17, 33.83s/it][A
 37%|█████████████▋                       | 200/540 [1:40:55<3:12:15, 33.93s/it][A
 37%|█████████████▊                       | 201/540 [1:41:29<3:10:16, 33.68s/it][A
 37%|█████████████▊                       | 202/540 [1:42:02<3:10:02, 33.74s/it][A
 38%|█████████████▉                       | 203/540 [1:42:36<3:09:26, 33.73s/it][A
 38%|█████████████▉                       | 204/540 [1:43:10<3:08:38, 33.69s/it][A
 38%|██████████████                       | 205/540 [1:43:43<3:07:29, 33.58s

 72%|██████████████████████████▌          | 388/540 [3:19:07<1:19:14, 31.28s/it][A
 72%|██████████████████████████▋          | 389/540 [3:19:38<1:18:23, 31.15s/it][A
 72%|██████████████████████████▋          | 390/540 [3:20:09<1:18:02, 31.22s/it][A
 72%|██████████████████████████▊          | 391/540 [3:20:42<1:19:16, 31.93s/it][A
 73%|██████████████████████████▊          | 392/540 [3:21:15<1:19:33, 32.25s/it][A
 73%|██████████████████████████▉          | 393/540 [3:21:48<1:19:32, 32.46s/it][A
 73%|██████████████████████████▉          | 394/540 [3:22:21<1:19:20, 32.61s/it][A
 73%|███████████████████████████          | 395/540 [3:22:55<1:19:13, 32.78s/it][A
 73%|███████████████████████████▏         | 396/540 [3:23:28<1:19:17, 33.04s/it][A
 74%|███████████████████████████▏         | 397/540 [3:24:01<1:18:54, 33.11s/it][A
 74%|███████████████████████████▎         | 398/540 [3:24:34<1:18:15, 33.07s/it][A
 74%|███████████████████████████▎         | 399/540 [3:25:08<1:17:43, 33.07s

In [30]:
new_df = pd.read_csv('./results/lda_tuning_results.csv')
new_df.max()

Validation_Set    75% Corpus
Topics                    10
Alpha              symmetric
Beta               symmetric
Coherence            0.57742
dtype: object