In [None]:
import nltk
import docx2txt
import gensim
import spacy
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [None]:
%matplotlib inline

In [None]:
# Read the Word document into `text`; it is fed to the tokenizer further down.
# The path is relative, so "test.docx" must be reachable from the directory
# the kernel runs in.
text = docx2txt.process("test.docx")


In [None]:
# Load the small English pipeline once. The parser and NER components are
# disabled because only POS tags and lemmas are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document is re-joined with single spaces and run through the spaCy
    pipeline, so the output tokens follow spaCy's own tokenization of that
    string rather than the incoming token boundaries.

    Args:
        texts: Iterable of documents, each an iterable of token strings.
        allowed_postags: Collection of coarse POS tags (``token.pos_``
            values, see https://spacy.io/api/annotation); tokens with any
            other tag are dropped. The default is an immutable tuple so it
            cannot be modified between calls.

    Returns:
        A list with one entry per input document, each a list of the lemmas
        (``token.lemma_``) of the retained tokens, in document order.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the stream in batches and yields Docs in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [None]:
# Tokenize on runs of word characters (the regex \w+), which drops punctuation
# and whitespace. `tokens` is the flat token list for the whole document.
tokenizer = RegexpTokenizer(r'\w+')
tokens=tokenizer.tokenize(text)


In [None]:
# The phrase models take a corpus of tokenized documents; the whole file is
# passed as a single document.
data_words = [tokens]

# Train the bigram model on the raw tokens, then train the trigram model on
# the bigram-merged corpus. A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=40)
trigram = gensim.models.Phrases(bigram[data_words], threshold=40)

# Wrap the trained models in Phraser objects: a faster way to get a sentence
# clubbed as a bigram/trigram.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)


In [None]:
# Start from NLTK's English stop words and extend them with corpus-specific
# terms. Everything is lower case because tokens are case-folded before the
# stop-word filter is applied.
stop_words = set(stopwords.words('english'))

# Generic report vocabulary plus underscore-joined boilerplate phrases
# (presumably produced by the bigram/trigram models; verify against output).
# Fixed: a doubled comma after 'activity' made this cell a SyntaxError.
new_set = [
    'etc', 'activity', 'note', 'call', 'meeting', 'author', 'strategy',
    'strategies', 'fund', 'lp', 'long', 'short',
    'manager', 'type', 'market', 'markets', 'vs', 'said', 'would', 'since',
    'within', 'across', 'ytd', 'time', 'month', 'year',
    'article', 'according', 'get', 'go', 'mtd', 'gp', 'dm',
    'author_effective_date_activity', 'e_g', 'absolute_return_institutional',
    'fund_lp',
    'absolute', 'return_institutional_fund_lp', '2019_sector_multi_strategy',
    'sub_strategy', 'non_pass_through_rv',
    'multi_strategy', 'strategy_relative_value', 'absolute_return', 'research',
    '30_2019_sector_multi', 'stock_selection', 'update', 'business', 'use',
    'look', 'need',
]

# Asset-class / factor jargon and filler words. Entries that were repeated
# within this list have been removed; the resulting stop-word set is the same.
factor_asset = [
    'multi', 'value', 'return', 'risk', 'absolute', 'rv', 'non', 'relative',
    'institutional', 'sub', 'sector', 'pass', 'equity', 'stock', 'selection',
    'gaa', 'data', 'developed', 'global', 'date', 'performance', 'gross',
    'effective', 'futures', 'arbitrage', 'management', 'trading',
    'office', 'e', 'g', 'etc', 'qtd', 'aum', 'nav', 'end', 'bn', 'billion',
    'q1', 'q2', 'q3', 'q4', 'team', 'us', 'em', 'emerging', 'looking', 'firm',
    'fixed', 'income', 'credit', 'also', 'fx', 'bond', 'go', 'going',
    'likely', 'like',
]

# Month names and their abbreviations.
month = [
    'jan', 'january', 'feb', 'february', 'mar', 'march', 'apr', 'april',
    'may', 'jun', 'june', 'jul', 'july', 'aug', 'august', 'sep', 'sept',
    'september', 'oct', 'october', 'nov', 'november', 'dec', 'december',
]

# Spelled-out small numbers (pure digit tokens are filtered separately).
number = ['first', 'two', 'three', 'one']

# set.update is idempotent, so re-running this cell is harmless.
stop_words.update(new_set)
stop_words.update(factor_asset)
stop_words.update(month)
stop_words.update(number)


In [None]:
# Merge detected phrases: bigrams first, then trigrams on top of them.
tri = trigram_mod[bigram_mod[data_words]]

data_lemmatized = lemmatization(tri, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Flatten the per-document lemma lists into one case-folded token stream.
# A separate name is used so the raw `tokens` produced by the tokenizer cell
# is not overwritten; otherwise re-running the phrase-model cell after this
# one would silently train on lemmatized, lower-cased tokens.
lemma_tokens = [lemma.casefold() for doc in data_lemmatized for lemma in doc]

# Drop stop words first, then tokens made up solely of digits.
stopped = [w for w in lemma_tokens if w not in stop_words]
nonum = [w for w in stopped if not w.isdigit()]


In [None]:
# Count how often each remaining token occurs and print every "token:count"
# pair on its own line.
freq = nltk.FreqDist(nonum)
for word, count in freq.items():
    print(f"{word}:{count}")


In [None]:
# Plot the counts for 20 tokens of the frequency distribution as plain
# (non-cumulative) values.
freq.plot(20, cumulative=False)

In [None]:
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
from wordcloud import WordCloud, STOPWORDS
import gensim.corpora as corpora

In [None]:
# Colour values of matplotlib's Tableau palette, in palette order.
# more colors: 'mcolors.XKCD_COLORS'
cols = list(mcolors.TABLEAU_COLORS.values())

# NOTE: `color_func` reads the global `i`, which is not defined in this cell.
# The name is looked up each time the function is called, so `i` must have
# been assigned by then (the plotting loop below assigns it via enumerate).
cloud = WordCloud(
    stopwords=stop_words,
    background_color='white',
    width=2500,
    height=1800,
    max_words=10,
    colormap='tab10',
    color_func=lambda *args, **kwargs: cols[i],
    prefer_horizontal=1.0,
)

# Build LDA model.
# The filtered token stream is treated as a single document: map every
# distinct token to an integer id, then encode the document as a bag of words.
data_ready = [nonum]
id2word = corpora.Dictionary(data_ready)
corpus = list(map(id2word.doc2bow, data_ready))

# random_state is fixed so repeated runs give the same topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=1,
    random_state=30,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='symmetric',
    iterations=10,
    per_word_topics=True,
)

print(lda_model.print_topics())



# With formatted=False each entry is (topic_id, [(word, weight), ...]).
topics = lda_model.show_topics(formatted=False)

# Size the grid from the topics actually returned. The previous fixed 2x2 grid
# iterated over four axes and raised IndexError on `topics[i]` as soon as the
# model had fewer than four topics (it is trained with num_topics=1).
n_topics = len(topics)
n_cols = max(1, min(2, n_topics))
n_rows = max(1, (n_topics + n_cols - 1) // n_cols)

# squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so flatten()
# always works. Five inches per cell matches the old 10x10 figure for 2x2.
fig, axes = plt.subplots(n_rows, n_cols,
                         figsize=(5 * n_cols, 5 * n_rows),
                         sharex=True, sharey=True, squeeze=False)

# The loop variable must stay named `i`: the word cloud's color_func reads the
# global `i` to pick the colour for the topic being drawn.
for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= n_topics:
        # Spare cell in the grid (odd number of topics): leave it blank.
        continue
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
    ax.margins(x=0, y=0)

fig.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.show()

