In [1]:
import pandas as pd
import os
os.chdir("C:\\Siddhartha\\Confidential Documents\\Data Science\\MT 599\\")
dataset=pd.read_csv("MT599.csv")

In [2]:
# import dependencies
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import FreqDist

import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
from gensim.models import CoherenceModel

import re

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



In [3]:
# Splitting the dataset into the Training set and Test set
df=pd.DataFrame(dataset["Text"])

In [4]:
# document length
df['TextLength'] = df['Text'].str.len()

In [5]:
mean = np.average(df['TextLength'])
sd = np.std(df['TextLength'])

data = df[df['Text'].map(len) >= mean-2*sd]
data = data[data['Text'].map(len) <= mean+2*sd]

In [6]:
def initial_clean(text):
    """
    Function to clean text of websites, email addresess and any punctuation
    We also lower case the text
    """
    text = re.sub("((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    text = re.sub("[^a-zA-Z ]", " ", text)
    text = text.lower() # lower case the text
    text = nltk.word_tokenize(text)
    return text

stop_words = pd.read_csv("stop word list - V1.csv")
stop_words = stop_words['i'].tolist()
def remove_stop_words(text):
    """
    Function that removes all stopwords from text
    """
    return [word for word in text if word not in stop_words]


stemmer = PorterStemmer()
def stem_words(text):
    """
    Function to stem words, so plural and singular are treated the same
    """
    text = [stemmer.stem(word) for word in text]
    text = [word for word in text if len(word) > 1] # make sure we have no 1 letter words
    
    return text

def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return remove_stop_words(stem_words(remove_stop_words(initial_clean(text))))

In [7]:
# clean text and title and create new column "tokenized"
data['tokenized'] = data['Text'].apply(apply_all)

In [8]:
# first get a list of all words
all_words = [word for item in list(data['tokenized']) for word in item]

# use nltk fdist to get a frequency distribution of all words
fdist = FreqDist(all_words)

In [9]:
k = 10000
# define a function only to keep words in the top k words
top_k_words,_ = zip(*fdist.most_common(k))
top_k_words = set(top_k_words)
def keep_top_k_words(text):
    return [word for word in text if word in top_k_words]

data['tokenized'] = data['tokenized'].apply(keep_top_k_words)

In [10]:
# document length
data['token_len'] = data['tokenized'].apply(lambda x: len(x))

In [11]:
# only keep articles with more than 10 tokens, otherwise too short
data = data[data['tokenized'].map(len) >= 10]
# make sure all tokenized items are lists
data = data[data['tokenized'].map(type) == list]
data.reset_index(drop=True,inplace=True)

In [12]:
#text = data['tokenized']

In [13]:
dictionary = gensim.corpora.Dictionary(data['tokenized'])
dictionary.filter_extremes(keep_n=8000)
bow_corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]

In [14]:
coherence_lda = []
for i in range(5,25,5):
    for j in range(100,600,200):
        print(i,j)
        lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics= i, id2word=dictionary, random_state=100, chunksize=j, passes=15, workers=4)
        coherence_model_lda = CoherenceModel(model=lda_model, texts=data['tokenized'], dictionary=dictionary, coherence='c_v')
        coherence_lda.append(coherence_model_lda.get_coherence())

5 100
5 300
5 500
10 100
10 300
10 500
15 100
15 300
15 500
20 100
20 300
20 500


In [15]:
Coh_DF = pd.DataFrame([5,5,5,10,10,10,15,15,15,20,20,20])
Chunk_Size = pd.DataFrame([100,300,500,100,300,500,100,300,500,100,300,500])
coherence_lda = pd.DataFrame(coherence_lda)
Coh_DF["Chunk_Size"] = Chunk_Size
Coh_DF["Coherence_Score"] = coherence_lda/100
Coh_DF

Unnamed: 0,0,Chunk_Size,Coherence_Score
0,5,100,0.004747
1,5,300,0.00501
2,5,500,0.005124
3,10,100,0.005657
4,10,300,0.006646
5,10,500,0.007388
6,15,100,0.006545
7,15,300,0.006224
8,15,500,0.00588
9,20,100,0.006693


In [16]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, random_state=100, chunksize=500, passes=15, workers=4)

In [17]:
# Compute Perplexity
#print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data['tokenized'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print ('\nCoherence Score : %.0f' % (100*coherence_lda) + r'%')


Coherence Score : 74%


In [18]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.047*"mauritiu" + 0.029*"credit" + 0.027*"amount" + 0.026*"valu" + 0.025*"favour" + 0.025*"code" + 0.024*"debit" + 0.024*"asap" + 0.024*"scblmumu" + 0.024*"arrang"
Topic: 1 
Words: 0.061*"cash" + 0.044*"per" + 0.044*"request" + 0.043*"inform" + 0.042*"advis" + 0.040*"open" + 0.026*"record" + 0.026*"deal" + 0.025*"link" + 0.025*"daili"
Topic: 2 
Words: 0.039*"custodi" + 0.031*"cash" + 0.030*"bp" + 0.027*"depositori" + 0.026*"advis" + 0.021*"id" + 0.021*"investor" + 0.021*"th" + 0.017*"winoto" + 0.017*"fmo"
Topic: 3 
Words: 0.065*"address" + 0.052*"swift" + 0.033*"custodian" + 0.030*"settlement" + 0.029*"mark" + 0.028*"twyman" + 0.023*"euroclear" + 0.021*"fedwir" + 0.018*"bkengb" + 0.018*"co"
Topic: 4 
Words: 0.060*"forward" + 0.058*"deal" + 0.043*"amount" + 0.038*"detail" + 0.038*"follow" + 0.033*"advis" + 0.032*"market" + 0.032*"request" + 0.032*"nigeria" + 0.030*"fund"
Topic: 5 
Words: 0.071*"fx" + 0.051*"amount" + 0.051*"note" + 0.028*"request" + 0.025*"remit" + 0.0

In [19]:
import pyLDAvis
import pyLDAvis.gensim

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
vis

In [20]:
sent_topics_df = pd.DataFrame()

# Get main topic in each document
for i, row in enumerate(lda_model[bow_corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            wp = lda_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
        else:
            break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

# Add original text to the end of the output
#contents = pd.Series(data)
sent_topics_df = pd.concat([sent_topics_df, data], axis=1)

In [21]:
df_dominant_topic = sent_topics_df
df_dominant_topic.columns = ['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text','TextLength','Tokens','Number of Tokens']

# Show
#df_dominant_topic.head(10)
(df_dominant_topic.iloc[:,0:5]).to_csv("Output.csv",index = False)

In [22]:
# Number of Documents for Each Topic
topic_counts = sent_topics_df['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round((topic_counts/topic_counts.sum())*100, 2)

# Topic Number and Keywords

#topic_num_keywords = sent_topics_df[['Dominant_Topic', 'Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics.to_csv("Topic_distribution.csv")

In [23]:
df_dominant_topics

Unnamed: 0,Num_Documents,Perc_Documents
9.0,323,23.47
8.0,153,11.12
1.0,142,10.32
7.0,123,8.94
5.0,118,8.58
0.0,113,8.21
3.0,108,7.85
2.0,106,7.7
6.0,99,7.19
4.0,91,6.61


In [34]:
data['tokenized']
df_data = data['tokenized']

In [45]:
import missingno as msno
msno.matrix(df_data)

IndexError: tuple index out of range

0       [ganghadharan, ananthakrishnan, andrew, liew, ...
1       [ganghadharan, ananthakrishnan, andrew, liew, ...
2       [secur, settlement, instruct, receiv, isin, cu...
3       [secur, settlement, instruct, receiv, isin, cu...
4       [secur, settlement, instruct, receiv, isin, cu...
5       [secur, take, instruct, convert, share, isin, ...
6       [secur, take, instruct, convert, share, isin, ...
7       [secur, take, instruct, convert, share, isin, ...
8       [safekeep, account, trade, credit, coupon, sec...
9       [safekeep, account, trade, credit, coupon, sec...
10      [safekeep, account, trade, credit, coupon, sec...
11      [safekeep, account, trade, credit, coupon, sec...
12      [bran, ch, repatri, amount, valu, jul, fx, rat...
13      [bran, ch, repatri, amount, valu, jul, fx, rat...
14      [bran, ch, repatri, amount, valu, jul, fx, rat...
15      [rmit, branch, account, scbsb, note, execut, t...
16      [rmit, branch, account, scbsb, note, execut, t...
17      [rmit,