In [None]:
import pandas as pd
import numpy as np 
import re
import time
import matplotlib.pyplot as plt
%matplotlib inline 

from nltk.corpus import reuters

import spacy

import gensim
from gensim import corpora
from gensim.models import LdaModel, LsiModel, CoherenceModel

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, PCA, TruncatedSVD

from wordcloud import WordCloud
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook


import warnings
warnings.simplefilter("ignore")

### read.md
The project had an iterative model development, thus we still have dependencies from cells to cells.
We recomend running the project sequencialy from the begging to the end, and if wanted adding the printing and the visualizatios functions(word cloud and pyLDAvis when needed). 

For future work the use of spacy pipelines, will be used.

PS - changes in models hyperparameters is possible (n_words, n_topic and tfidf params)

In [None]:
list_aux = []

for i in reuters.fileids():
    raw_doc =  reuters.raw(i)
    raw_cat =  reuters.categories(i)
    list_aux.append([i, raw_doc, raw_cat[0], len(raw_cat)])
      
df = pd.DataFrame(list_aux, columns=['id', 'raw_text', 'category', 'n_categories'])

#doc_inference = df['raw_text'].sample(10).astype(str)
doc_complete = df['raw_text'].astype(str)

#doc_complete = df['raw_text']

## Small Data Analysis

In [None]:
df.head(10)
df.n_categories.value_counts(normalize=True).plot(kind='bar')

In [None]:
df.category.value_counts()

In [None]:
cats = list(df.category.value_counts()[:10].keys())
#in case we want to use only X categories.

# Utils

In [None]:
def print_model(model, n_words, n_topics):
    results_df = pd.DataFrame([[word for rank, (word, prob) in enumerate(words)]
                               for topic_id, words in model.show_topics(formatted=False,
                                                        num_words=n_words, num_topics=n_topics)])
    print(results_df)
    print(results_df.to_latex())
    return(results_df)

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=doc_complete):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,2), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

# EXP.1

    PP - lower, strip(4), numerics, stopwords(git)
    tf - gensim
    lsa / lda gensim ( 10 topics )

In [None]:
start = time.time()

#Pre-Process
stoplist = "asian a a's able about above according accordingly across actually after afterwards again against ain't all allow allows almost alone along already also although always am among amongst an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate are aren't around as aside ask asking associated at available away awfully b be became because become becomes becoming been before beforehand behind being believe below beside besides best better between beyond both brief but by c c'mon c's came can can't cannot cant cause causes certain certainly changes clearly co com come comes concerning consequently consider considering contain containing contains corresponding could couldn't course currently d definitely described despite did didn't different do does doesn't doing don't done down downwards during e each edu eg eight either else elsewhere enough entirely especially et etc even ever every everybody everyone everything everywhere ex exactly example except f far few fifth first five followed following follows for former formerly forth four from further furthermore g get gets getting given gives go goes going gone got gotten greetings h had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here here's hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit however i i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates inner insofar instead into inward is isn't it it'd it'll it's its itself j just k keep keeps kept know knows known l last lately later latter latterly least less lest let let's like liked likely little look looking looks ltd m mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my myself n name namely nd near nearly necessary need needs neither never nevertheless new next nine no nobody non none noone nor normally not nothing novel now nowhere o obviously of off often oh ok okay old on once one ones only onto or other others otherwise ought our ours ourselves out outside over overall own p particular particularly per perhaps placed please plus possible presumably probably provides q que quite qv r rather rd re really reasonably regarding regardless regards relatively respectively right s said same saw say saying says second secondly see seeing seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall she should shouldn't since six so some somebody somehow someone something sometime sometimes somewhat somewhere soon sorry specified specify specifying still sub such sup sure t t's take taken tell tends th than thank thanks thanx that that's thats the their theirs them themselves then thence there there's thereafter thereby therefore therein theres thereupon these they they'd they'll they're they've think third this thorough thoroughly those though three through throughout thru thus to together too took toward towards tried tries truly try trying twice two u un under unfortunately unless unlikely until unto up upon us use used useful uses using usually uucp v value various very via viz vs w want wants was wasn't way we we'd we'll we're we've welcome well went were weren't what what's whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who's whoever whole whom whose why will willing wish with within without won't wonder would would wouldn't x y yes yet you you'd you'll you're you've your yours yourself yourselves z zero"
stoplist = stoplist.split()

def remove_stop_words(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

def pre_process(text):
    text = text.lower()

    text = gensim.parsing.preprocessing.strip_short(text, minsize=4)
    text = gensim.parsing.preprocessing.strip_non_alphanum(text)
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    text = text.split()
    text = [word for word in text if word not in stoplist]
    
    #text = remove_stop_words(text)
    #stemmer = gensim.parsing.porter.PorterStemmer()
    #text = stemmer.stem(text)
    return text


doc_clean = [pre_process(doc) for doc in doc_complete]
df['clean_text'] = doc_clean


print(round(time.time() - start), 'Docs. cleaned w/ EXP.1')
id2word = corpora.Dictionary(doc_clean)
corpus = [id2word.doc2bow(doc) for doc in doc_clean]

n_topics  = 10  # Variable to Change the n. of topics
n_words = 5  # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)

print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')

print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)


print(round(time.time() - start), 'LDA Perplexity: ', lda_model.log_perplexity(corpus))  # Compute Perplexity
coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_clean, 
                                     dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()  # Compute Coherence Score
print(round(time.time() - start), 'LDA Coherence: ', coherence_lda, '\n')


# EXP.1.1 to show the pre-pross in SK and the Models in Gensim
To convert a Sklearn Vectorizer to a Gensim corpus we used gensim.matutils.Sparse2Corpus (https://radimrehurek.com/gensim/matutils.html)

    1) PP - exp1 
    2) tf-idf - sklearn
    3) lsa / lda gensim ( 10 topics )

In [None]:
start = time.time()

print(round(time.time() - start), 'Starting Exp.3')
doc_clean = [pre_process(doc) for doc in doc_complete]
doc_clean = [" ".join(doc) for doc in doc_clean]
print(round(time.time() - start), 'Docs. cleaned w/ EXP.1 as a string!')


no_features = 10000  # SKLEARN TFIDF
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=100, 
                                   max_features=no_features, 
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(pd.Series(doc_clean))
corpus = gensim.matutils.Sparse2Corpus(tfidf, documents_columns=False)
#id2word = tfidf_vectorizer.vocabulary_
id2word = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.items())


n_topics  = 10  # Variable to Change the n. of topics
n_words = 5 # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')


print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)


print(round(time.time() - start), 'Perplexity: ', lda_model.log_perplexity(corpus))  # Compute Perplexity
print(lda_results_df)
print(lsa_results_df)

# To Visualize in pyLDAvis

In [None]:
data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(data,'v3.html')

#print Word Clouds
for t in range(lda_model.num_topics):
    plt.figure()
    plt.imshow(WordCloud().fit_words(dict(lda_model.show_topic(t, 100))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()

## EXP.2
Lematization based on: https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/

    1) PP - exp1 + lem com POS TAG(Nomes, adj, verbos, adv)
    2) tf - gensim
    3) lsa / lda gensim ( 10 topics )

In [None]:
start = time.time()

nlp = spacy.load('en', disable=['parser', 'ner'])
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


#doc_clean = [pre_process(doc) for doc in doc_complete]
print(round(time.time() - start), 'Docs. cleaned w/ EXP.1')

doc_lemma =  lemmatization(doc_clean)
df['lem_pos_text'] = doc_lemma

print(round(time.time() - start), 'Docs. lemmatized and POS(w/ noun, adj, verb, adv)')

id2word = corpora.Dictionary(doc_lemma)
corpus = [id2word.doc2bow(doc) for doc in doc_lemma]

n_topics  = 10  # Variable to Change the n. of topics
n_words = 5  # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')


print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)


#print(round(time.time() - start), 'Perplexity: ', lda_model.log_perplexity(corpus))  # Compute Perplexity
#coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_lemma, dictionary=id2word, coherence='c_v')
#coherence_lda = coherence_model_lda.get_coherence()  # Compute Coherence Score
#print(round(time.time() - start), 'Coherence Score: ', coherence_lda, '\n')

# Exp. 2.1 - w/ TF-IDF(sklearn)
    1) PP - exp2
    2) tf-idf - sklearn
    3) lsa / lda - sklearn ( 10 topics )

In [None]:
start = time.time()

doc_lemma_sk = [" ".join(doc) for doc in doc_lemma]

# SKLEARN TFIDF
no_features = 10000
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=100, 
                                   max_features=no_features, 
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(pd.Series(doc_lemma_sk))
#tfidf_feature_names = tfidf_vectorizer.get_feature_names()

corpus = gensim.matutils.Sparse2Corpus(tfidf, documents_columns=False)
#id2word = tfidf_vectorizer.vocabulary_

id2word = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.items())

n_topics  = 10  # Variable to Change the n. of topics
n_words = 5 # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')


print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)


#print(round(time.time() - start), 'Perplexity: ', lda_model.log_perplexity(corpus))  # Compute Perplexity

# Inference of this Model

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, 
                                                  corpus=corpus, texts=df['clean_text'])

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
#df_dominant_topic.head(10)

df['Infered_Topic'] = df_dominant_topic['Dominant_Topic']
df['Keywords_Topic'] = df_dominant_topic['Keywords']

pd.set_option('display.max_colwidth', 5)
df[['id', 'category', 'n_categories', 'Infered_Topic', 'Keywords_Topic', 'clean_text']].head(2).to

# EXP 3 - NER

    1) LEM + POS + NER + Stopwords 
    2) TF(gensin)
    3) lsa / lda - sklearn (10 topics)


In [None]:
start = time.time()

nlp = spacy.load('en', disable=['parser'])
stoplist = "a a's able about above according accordingly across actually after afterwards again against ain't all allow allows almost alone along already also although always am among amongst an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate are aren't around as aside ask asking associated at available away awfully b be became because become becomes becoming been before beforehand behind being believe below beside besides best better between beyond both brief but by c c'mon c's came can can't cannot cant cause causes certain certainly changes clearly co com come comes concerning consequently consider considering contain containing contains corresponding could couldn't course currently d definitely described despite did didn't different do does doesn't doing don't done down downwards during e each edu eg eight either else elsewhere enough entirely especially et etc even ever every everybody everyone everything everywhere ex exactly example except f far few fifth first five followed following follows for former formerly forth four from further furthermore g get gets getting given gives go goes going gone got gotten greetings h had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here here's hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit however i i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates inner insofar instead into inward is isn't it it'd it'll it's its itself j just k keep keeps kept know knows known l last lately later latter latterly least less lest let let's like liked likely little look looking looks ltd m mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my myself n name namely nd near nearly necessary need needs neither never nevertheless new next nine no nobody non none noone nor normally not nothing novel now nowhere o obviously of off often oh ok okay old on once one ones only onto or other others otherwise ought our ours ourselves out outside over overall own p particular particularly per perhaps placed please plus possible -PRON- presumably probably provides q que quite qv r rather rd re really reasonably regarding regardless regards relatively respectively right s said same saw say saying says second secondly see seeing seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall she should shouldn't since six so some somebody somehow someone something sometime sometimes somewhat somewhere soon sorry specified specify specifying still sub such sup sure t t's take taken tell tends th than thank thanks thanx that that's thats the their theirs them themselves then thence there there's thereafter thereby therefore therein theres thereupon these they they'd they'll they're they've think third this thorough thoroughly those though three through throughout thru thus to together too took toward towards tried tries truly try trying twice two u un under unfortunately unless unlikely until unto up upon us use used useful uses using usually uucp v value various very via viz vs w want wants was wasn't way we we'd we'll we're we've welcome well went were weren't what what's whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who's whoever whole whom whose why will willing wish with within without won't wonder would would wouldn't x y yes yet you you'd you'll you're you've your yours yourself yourselves z zero"
stoplist = stoplist.split()

def ner(doc, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    text = doc
    text = gensim.parsing.preprocessing.strip_tags(text)
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
    text = [word for word in text.split()]
    
    doc = nlp(" ".join(text))
    text = [token.lemma_ for token in doc 
                if token.pos_ in allowed_postags
                or token.ent_type_]
    
    text = [word for word in text if word not in stoplist]

    return text


doc_ner = [ner(doc) for doc in doc_complete]
df['ner_text'] = doc_ner
           
print(round(time.time() - start), 'Docs. cleaned w/ EXP. NER CLEANING')
#print(round(time.time() - start), 'Docs. lemmatized and POS(w/ noun, adj, verb, adv)')

id2word = corpora.Dictionary(doc_ner)
corpus = [id2word.doc2bow(doc) for doc in doc_ner]

n_topics  = 10  # Variable to Change the n. of topics
n_words = 5  # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')


print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)


print(round(time.time() - start), 'Perplexity: ', lda_model.log_perplexity(corpus))  # Compute Perplexity
coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_ner, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()  # Compute Coherence Score
print(round(time.time() - start), 'Coherence Score: ', coherence_lda, '\n')

# EXP 3.1 - NER + SKLEARN

In [None]:
df.head(10)

In [None]:
doc_ner_sk = [" ".join(doc) for doc in doc_ner]

# SKLEARN TFIDF
no_features = 10000
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=100, 
                                   max_features=no_features, 
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(pd.Series(doc_ner_sk))
#tfidf_feature_names = tfidf_vectorizer.get_feature_names()

corpus = gensim.matutils.Sparse2Corpus(tfidf, documents_columns=False)
#id2word = tfidf_vectorizer.vocabulary_
id2word = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.items())


n_topics  = 10  # Variable to Change the n. of topics
n_words = 5 # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')


print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)


#print(round(time.time() - start), 'Perplexity: ', lda_model.log_perplexity(corpus))  # Compute Perplexity

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, 
                                                  corpus=corpus, texts=df['ner_text'])

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
#df_dominant_topic.head(10)

df['Infered_Topic'] = df_dominant_topic['Dominant_Topic']
df['Keywords_Topic'] = df_dominant_topic['Keywords']

pd.set_option('display.max_colwidth', -1)
df[['id', 'category', 'n_categories', 'Infered_Topic', 'Keywords_Topic', 'clean_text']].head(2)

# EXP 4 - BIGRAMS + POS + NER 

In [None]:
start = time.time()

nlp = spacy.load('en', disable=['parser'])
stoplist = "000 & - a a's able about above according accordingly across actually after afterwards again against ain't all allow allows almost alone along already also although always am among amongst an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate are aren't around as aside ask asking associated at available away awfully b be became because become becomes becoming been before beforehand behind being believe below beside besides best better between beyond both brief but by c c'mon c's came can can't cannot cant cause causes certain certainly changes clearly co com come comes concerning consequently consider considering contain containing contains corresponding could couldn't course currently d definitely described despite did didn't different do does doesn't doing don't done down downwards during e each edu eg eight either else elsewhere enough entirely especially et etc even ever every everybody everyone everything everywhere ex exactly example except f far few fifth first five followed following follows for former formerly forth four from further furthermore g get gets getting given gives go goes going gone got gotten greetings h had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here here's hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit however i i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates inner insofar instead into inward is isn't it it'd it'll it's its itself j just k keep keeps kept know knows known l last lately later latter latterly least less lest let let's like liked likely little look looking looks ltd m mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my myself n name namely nd near nearly necessary need needs neither never nevertheless new next nine no nobody non none noone nor normally not nothing novel now nowhere o obviously of off often oh ok okay old on once one ones only onto or other others otherwise ought our ours ourselves out outside over overall own p particular particularly per perhaps placed please plus possible -PRON- presumably probably provides q que quite qv r rather rd re really reasonably regarding regardless regards relatively respectively right s said same saw say saying says second secondly see seeing seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall she should shouldn't since six so some somebody somehow someone something sometime sometimes somewhat somewhere soon sorry specified specify specifying still sub such sup sure t t's take taken tell tends th than thank thanks thanx that that's thats the their theirs them themselves then thence there there's thereafter thereby therefore therein theres thereupon these they they'd they'll they're they've think third this thorough thoroughly those though three through throughout thru thus to together too took toward towards tried tries truly try trying twice two u un under unfortunately unless unlikely until unto up upon us use used useful uses using usually uucp v value various very via viz vs w want wants was wasn't way we we'd we'll we're we've welcome well went were weren't what what's whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who's whoever whole whom whose why will willing wish with within without won't wonder would would wouldn't x y yes yet you you'd you'll you're you've your yours yourself yourselves z zero"
stoplist = stoplist.split()

bigram = gensim.models.Phrases(doc_complete, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

def bi_ner(doc, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    text = doc
    text = gensim.parsing.preprocessing.strip_tags(text)
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
    
    text = [word for word in text.split()]    
    text = [word for word in text if word not in stoplist]
    
    text = bigram_mod[text]
    
    doc = nlp(" ".join(text))
    text = [token.lemma_ for token in doc 
                if token.pos_ in allowed_postags
                or token.ent_type_]
    return text


doc_bi_ner = [bi_ner(doc) for doc in doc_complete]
df['bi_ner_text'] = doc_bi_ner

print(round(time.time() - start), 'Docs. cleaned w/ EXP. NER CLEANING')
#print(round(time.time() - start), 'Docs. lemmatized and POS(w/ noun, adj, verb, adv)')

id2word = corpora.Dictionary(doc_bi_ner)
corpus = [id2word.doc2bow(doc) for doc in doc_bi_ner]

n_topics  = 10  # Variable to Change the n. of topics
n_words = 5  # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')


print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)


print(round(time.time() - start), 'Perplexity: ', lda_model.log_perplexity(corpus))  # Compute Perplexity
#coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_ner, dictionary=id2word, coherence='c_v')
#coherence_lda = coherence_model_lda.get_coherence()  # Compute Coherence Score
#print(round(time.time() - start), 'Coherence Score: ', coherence_lda, '\n')

In [None]:
data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(data,'vlast.html')

#print Word Clouds
for t in range(lda_model.num_topics):
    plt.figure()
    plt.imshow(WordCloud().fit_words(dict(lda_model.show_topic(t, 100))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()

# EXP 4.1 - BIGRAMS + POS + NER + TFIDF(sk)


In [None]:
start = time.time()

doc_ner_sk = [" ".join(doc) for doc in doc_ner]
# SKLEARN TFIDF
no_features = 10000
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=100, 
                                   max_features=no_features, 
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(pd.Series(doc_ner_sk))
#tfidf_feature_names = tfidf_vectorizer.get_feature_names()

corpus = gensim.matutils.Sparse2Corpus(tfidf, documents_columns=False)
#id2word = tfidf_vectorizer.vocabulary_
id2word = dict((v, k) for k, v in tfidf_vectorizer.vocabulary_.items())


n_topics  = 10  # Variable to Change the n. of topics
n_words = 5 # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')


print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)

# Exp. 5 - SKLEARN LDA
    1) PP - exp1
    2) tf-idf - sklearn
    3) lsa / lda - sklearn ( 10 topics )

In [None]:
start = time.time()

print(round(time.time() - start), 'Starting Exp.3')
doc_clean = [pre_process(doc) for doc in doc_complete]
doc_clean = [" ".join(doc) for doc in doc_clean]
print(round(time.time() - start), 'Docs. cleaned w/ EXP.1 as a string!')

n_topics  = 10  # Variable to Change the n. of topics
n_words = 5  # Represents the nº of words to show

vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )


data_vectorized = vectorizer.fit_transform(doc_clean)
lda_model = LatentDirichletAllocation(n_topics=n_topics,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1)      # compute perplexity

lda_output = lda_model.fit_transform(data_vectorized)

def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=n_words) 

df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

# Exp. BASELINE

In [None]:
start = time.time()

def pre_process(text):
    text = text.lower()
    text = gensim.parsing.preprocessing.strip_non_alphanum(text)
    text = text.split()
    
    return text


doc_clean = [pre_process(doc) for doc in doc_complete]


print(round(time.time() - start), 'Docs. cleaned w/ EXP.1')
id2word = corpora.Dictionary(doc_clean)
corpus = [id2word.doc2bow(doc) for doc in doc_clean]

n_topics  = 10  # Variable to Change the n. of topics
n_words = 5  # Represents the nº of words to show

print(round(time.time() - start), 'Creating LDA and LSA Models w/ Preprossesing EXP.1.')
lda_model = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)
lsa_model = LsiModel(corpus=corpus, num_topics=n_topics, id2word=id2word)

print(round(time.time() - start), 'Models Created w/', n_topics, 'Topics.')

print(round(time.time() - start), 'LDA Results in DF format. \n')
print_model(lda_model, n_words, n_topics)
print(round(time.time() - start), 'LSA Results in DF format. \n')
print_model(lsa_model, n_words, n_topics)


print(round(time.time() - start), 'LDA Perplexity: ', lda_model.log_perplexity(corpus))  # Compute Perplexity
coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_clean, 
                                     dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()  # Compute Coherence Score
print(round(time.time() - start), 'LDA Coherence: ', coherence_lda, '\n')