In [1]:
import pandas as pd
import os
os.chdir("C:\Siddhartha\Confidential Documents\Data Science\MT 199")
dataset=pd.read_csv("Source.csv")

In [2]:
# import dependencies
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import FreqDist

import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
from gensim.models import CoherenceModel

import re



In [3]:
# Splitting the dataset into the Training set and Test set
field79=dataset["Text"]
df=pd.DataFrame(field79)

In [4]:
# document length
df['TextLength'] = df['Text'].str.len()

In [5]:
mean = np.average(df['TextLength'])
sd = np.std(df['TextLength'])

data = df[df['Text'].map(len) >= mean-2*sd]
data = data[data['Text'].map(len) <= mean+2*sd]

In [6]:
def initial_clean(text):
    """
    Function to clean text of websites, email addresess and any punctuation
    We also lower case the text
    """
    text = re.sub("((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    text = re.sub("[^a-zA-Z ]", " ", text)
    text = text.lower() # lower case the text
    text = nltk.word_tokenize(text)
    return text

stop_words = pd.read_csv("stop word list.csv")
stop_words = stop_words['i'].tolist()
def remove_stop_words(text):
    """
    Function that removes all stopwords from text
    """
    return [word for word in text if word not in stop_words]


stemmer = PorterStemmer()
def stem_words(text):
    """
    Function to stem words, so plural and singular are treated the same
    """
    text = [stemmer.stem(word) for word in text]
    text = [word for word in text if len(word) > 1] # make sure we have no 1 letter words
    
    return text

def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return remove_stop_words(stem_words(remove_stop_words(initial_clean(text))))

In [7]:
# clean text and title and create new column "tokenized"
data['tokenized'] = data['Text'].apply(apply_all)

In [8]:
# first get a list of all words
all_words = [word for item in list(data['tokenized']) for word in item]

# use nltk fdist to get a frequency distribution of all words
fdist = FreqDist(all_words)
fdist

FreqDist({'attent': 2010,
          'foreign': 450,
          'depart': 1827,
          'order': 5938,
          'field': 5035,
          'invoic': 861,
          'notifi': 117,
          'msg': 410,
          'sent': 1502,
          'currenc': 971,
          'amount': 7483,
          'period': 454,
          'inform': 5017,
          'receiv': 3545,
          'relev': 212,
          'credit': 6765,
          'entri': 311,
          'case': 4475,
          'alreadi': 571,
          'done': 122,
          'need': 599,
          'addit': 510,
          'detail': 5666,
          'contact': 1228,
          'mail': 368,
          'estrmtar': 57,
          'ubi': 91,
          'unicredit': 84,
          'eu': 975,
          'alway': 598,
          'quot': 10073,
          'co': 1071,
          'oper': 2459,
          'charg': 3358,
          'dept': 3300,
          'yr': 4218,
          'advis': 5352,
          'pymt': 917,
          'vd': 1288,
          'relat': 4928,
          'scblae': 1

In [9]:
k = 10000
# define a function only to keep words in the top k words
top_k_words,_ = zip(*fdist.most_common(k))
top_k_words = set(top_k_words)
def keep_top_k_words(text):
    return [word for word in text if word in top_k_words]

data['tokenized'] = data['tokenized'].apply(keep_top_k_words)

In [10]:
# document length
data['token_len'] = data['tokenized'].apply(lambda x: len(x))

In [11]:
# only keep articles with more than 10 tokens, otherwise too short
data = data[data['tokenized'].map(len) >= 10]
# make sure all tokenized items are lists
data = data[data['tokenized'].map(type) == list]
data.reset_index(drop=True,inplace=True)

In [12]:
text = data['tokenized']

In [13]:
dictionary = gensim.corpora.Dictionary(text)
dictionary.filter_extremes(keep_n=8000)
bow_corpus = [dictionary.doc2bow(doc) for doc in text]

In [14]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, random_state=100, chunksize=500, passes=15, workers=4)

In [15]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=text, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print ('\nCoherence Score : %.0f' % (100*coherence_lda) + r'%')


Perplexity:  -6.180502042054094

Coherence Score : 58%


In [16]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.035*"name" + 0.032*"number" + 0.029*"order" + 0.019*"address" + 0.019*"account" + 0.015*"amount" + 0.014*"attn" + 0.014*"birth" + 0.014*"detail" + 0.013*"inform"
Topic: 1 
Words: 0.071*"quot" + 0.054*"correspond" + 0.050*"futur" + 0.048*"relat" + 0.046*"case" + 0.040*"enquiri" + 0.038*"servic" + 0.034*"account" + 0.027*"request" + 0.021*"remit"
Topic: 2 
Words: 0.051*"amount" + 0.045*"fx" + 0.036*"chf" + 0.028*"return" + 0.027*"bene" + 0.025*"rate" + 0.024*"fund" + 0.022*"sellback" + 0.022*"oper" + 0.020*"origin"
Topic: 3 
Words: 0.034*"fund" + 0.026*"return" + 0.023*"investig" + 0.022*"request" + 0.016*"quot" + 0.016*"amount" + 0.016*"valu" + 0.015*"advis" + 0.014*"null" + 0.014*"receiv"
Topic: 4 
Words: 0.048*"field" + 0.039*"yr" + 0.036*"bnf" + 0.022*"dept" + 0.018*"transfer" + 0.017*"intern" + 0.016*"swift" + 0.016*"trn" + 0.016*"ft" + 0.015*"branch"
Topic: 5 
Words: 0.034*"beneficiari" + 0.030*"credit" + 0.026*"claim" + 0.025*"fund" + 0.023*"receipt" + 0.023*"in

In [17]:
input = '''WE REFER TO OUR OUR TRN 2018031500030756 DATED
3/15/2018 FOR USD 25323.52 .
.
WE HAVE BEEN ADVISED BY THE REMITTING BANK THAT
THE ULTIMATE BENEFICIARY IS CLAIMING NON RECEIPT
OF THIS PAYMENT. PLEASE URGENTLY RESPOND BY
CONFIRMING THE DATE, AMOUNT AND CREDIT REFERENCE
TO THE ULTIMATE BENEFICIARY ACCOUNT.
USP180402-001730.
REGARDS
CLIENT SERVICES
STANDARD CHARTERED BANK'''

In [18]:
bow_test = dictionary.doc2bow(apply_all(input))

In [19]:
for index, score in sorted(lda_model[bow_test], key=lambda tup: -1*tup[1]):
    
    if index==0:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Request for more information")
    if index==1:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Request for more information / Inquiry - Quote reference number while reverting")
    if index==2:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Processing request related to FX / sellback")
    if index==3:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Cancellation of swift message / Debit authority / Return of fund request / Return of fund advises")
    if index==4:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Internal Swift transfer message")
    if index==5:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Beneficiery claims non receipt of funds")
    if index==6:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Request for processing confirmation / status of a payment")
    if index==7:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Missing/incorrect information - unable to apply")
    if index==8:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Request to amend certain fields in swift message")
    if index==9:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : FATF / AML / Sanctions questionnaire")

Probability Score : 75% 
Topic : Beneficiery claims non receipt of funds
Probability Score : 21% 
Topic : Request for more information / Inquiry - Quote reference number while reverting


In [20]:
x = 1999

print(data['Text'][x])

for index, score in sorted(lda_model[bow_corpus[x]], key=lambda tup: -1*tup[1]):
    if index==0:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Request for more information")
    if index==1:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Request for more information / Inquiry - Quote reference number while reverting")
    if index==2:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Processing request related to FX / sellback")
    if index==3:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Cancellation of swift message / Debit authority / Return of fund request / Return of fund advises")
    if index==4:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Internal Swift transfer message")
    if index==5:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Beneficiery claims non receipt of funds")
    if index==6:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Request for processing confirmation / status of a payment")
    if index==7:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Missing/incorrect information - unable to apply")
    if index==8:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : Request to amend certain fields in swift message")
    if index==9:
       print("Probability Score : "+('%.0f' % (100*score) + r'%')+" \nTopic : FATF / AML / Sanctions questionnaire")

RE FX CONV P/O AZMA807800543600 VAL
UE DATE 20-MAR-18 ORIGINAL AMOUNT 3
140.00 / EUR AGAINST THE BENE DEDUC
T AMOUNT 3140.00 / EUR PAYMENT WAS
EFFECTED IN ACCORDANCE WITH INSTRUC
TIONS.HOWEVER, THE FUNDS WERE RETUR
NED BY BENE BANK FOR REASON  UTA AN
D NO AMEND RECEIVED WE HAVE PERFORM
ED SELLBACK FX USING MARKET EXCHANG
E RATE   OF 134.103957 AND OUR EUR
GPS OPERATING ACCOUNT 18502902 WILL
BE CREDITED WITH 2987.19 / EUR ON
VALUE DATE 10-APR-18 BANK CHARGES:
22.37 / EUR TOP UP REQUIRED: 130.44
/ EUR REGARDS GPS OPERATIONS SSC,
CHENNAI

Probability Score : 95% 
Topic : Processing request related to FX / sellback
Probability Score : 3% 
Topic : Request to amend certain fields in swift message


In [21]:
coherence_lda = []
for i in range(5,25,5):
    for j in range(100,600,200):
        print(i,j)
        lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics= i, id2word=dictionary, random_state=100, chunksize=j, passes=15, workers=4)
        coherence_model_lda = CoherenceModel(model=lda_model, texts=data['tokenized'], dictionary=dictionary, coherence='c_v')
        coherence_lda.append(coherence_model_lda.get_coherence())
    

Coh_DF = pd.DataFrame([5,5,5,10,10,10,15,15,15,20,20,20])
Chunk_Size = pd.DataFrame([100,300,500,100,300,500,100,300,500,100,300,500])
coherence_lda = pd.DataFrame(coherence_lda)
Coh_DF["Chunk_Size"] = Chunk_Size
Coh_DF["Coherence_Score"] = coherence_lda/100
Coh_DF

5 100
5 300
5 500
10 100
10 300
10 500
15 100
15 300
15 500
20 100
20 300
20 500


Unnamed: 0,0,Chunk_Size,Coherence_Score
0,5,100,0.005453
1,5,300,0.005311
2,5,500,0.005094
3,10,100,0.005534
4,10,300,0.005677
5,10,500,0.005788
6,15,100,0.005353
7,15,300,0.005562
8,15,500,0.005798
9,20,100,0.005094


In [22]:
sent_topics_df = pd.DataFrame()

# Get main topic in each document
for i, row in enumerate(lda_model[bow_corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            wp = lda_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
        else:
            break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

# Add original text to the end of the output
#contents = pd.Series(data)
sent_topics_df = pd.concat([sent_topics_df, data], axis=1)

In [23]:
df_dominant_topic = sent_topics_df
df_dominant_topic.columns = ['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text','TextLength','Tokens','Number of Tokens']

# Show
#df_dominant_topic.head(10)
(df_dominant_topic.iloc[:,0:5]).to_csv("Output.csv",index = False)

In [24]:
# Number of Documents for Each Topic
topic_counts = sent_topics_df['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round((topic_counts/topic_counts.sum())*100, 2)

# Topic Number and Keywords

#topic_num_keywords = sent_topics_df[['Dominant_Topic', 'Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics.to_csv("Topic_distribution.csv")