# Classify Emails beyond spams

# LDA - Latent Dirichlet Allocation

### This notebook shows the implementation of LDA on the data + Synonyms + Hypernyms. Level 3
### Final Project - Riti Chakraborty¶

In [3]:
#Riti Chakraborty

#importing the required libraries
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import CountVectorizer
import numpy
from numpy import nan

#for flattening lists
from itertools import chain

#To handle warning
import warnings
warnings.filterwarnings('ignore')

#For implementing Natural Language Processing approaches.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer

#For using Regular expression
import re

#For Handling Strings
import string

#For implementing word sense disambiguation
from nltk.corpus import wordnet as wn
from wordsegment import load, segment

#Important to call load()
load()

#For LDA Implementation # Importing Gensim
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora

#For evaluation Topic models formed
from gensim.models import CoherenceModel

#For visualising LDA Output
import pyLDAvis
import pyLDAvis.gensim 


In [4]:
#Reading the data exported from the previous file after initial preprocessing
data_subset=pd.read_csv("../exported_tables/data_subset.csv")
#converting the data in the dataframe into str type
data_subset = data_subset.astype(str) 


#retaining proper index
data_subset=data_subset.reset_index(drop=True)

num_of_rows=len(data_subset.index)
print("Total Number of rows: ",num_of_rows)

# Converting each row to list of lists
list1=data_subset.values.tolist()
list2=[]

for i in range(0, len(list1)):
    list2.append(' '.join(map(str, list1[i])))

#converting each row into vectors #Printing the count of each term in the emails #Emails on Rows and Terms on columns
vectorizer1 = CountVectorizer()
row_vectors=vectorizer1.fit_transform(list2).todense()


Total Number of rows:  1135


### Defining a Function for removing Stopwords

In [5]:
def stopword_remove(l2):
    fil_list2=[]
    for sent in l2:
        stop_words=["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "spam","yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "hibody", "body","these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
        word_tokens = word_tokenize(sent.lower())
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        filtered_sentence = []

        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)

        fil_list2.append(' '.join(filtered_sentence))
    return fil_list2


#Calling the function to remove stop words
fil_list2=stopword_remove(list2)

### Defining a function for punctuation removal

In [6]:
#Defining a function to remove punctuation
def no_punctuation(my_str):
    punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char
        else:
            no_punct = no_punct + " "

# display the unpunctuated string
    return no_punct

### Defining function for word tokenization, segmentation and addition of Synonyms from the Wordnet API

In [7]:
def seg_syn(l1):
    wordlist2=[]
    wordlist2.append(no_punctuation(l1).split())
    seg=[]
    for w in wordlist2:
        no_integers = [x for x in w if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]
        pure_string=[x for x in no_integers if not any(c.isdigit() for c in x)]
        for s in pure_string:
            seg.append(segment(s))
    flat_seg_list = [item for sublist in seg for item in sublist]
    d = {'Term':[], 'Synonyms':[], 'Hypernyms':[]}
    d1={}
    syn0=[]
    hyp0=[]
    for f in flat_seg_list:
#         print(f)
        d['Term'].append(f)        
        for syn in wn.synsets(f):
            synsets=syn
#             print("Hello")
            for l in syn.lemmas():
                syn0.append(l.name())
                d1[f]=l.name()
                d['Synonyms'].append(l.name())

            for synset in synsets.hyponyms():
                for lemma in synset.lemmas():
                    d['Hypernyms'].append(lemma.name())
                    hyp0.append(lemma.name())
#     print(d1)
#     print()
    return syn0,hyp0,flat_seg_list
#     return flat_seg_list


In [8]:

list2_syn=[]
for l in fil_list2:

    #Storing the synonyms and terms returned from the previous function
    syno,hyp,termz=seg_syn(str(l))
    list2_syn.append(syno+termz+hyp)

#flattening the list created from above
flat_list_syn_hyp=len(list(chain(*list2_syn)))

#Converting back to strings
flat_list2_syn=[]
for ls in list2_syn:
    flat_list2_syn.append(' '.join(list(set(ls))))

df_flatlist=pd.DataFrame(flat_list2_syn)
df_flatlist["Index"]=df_flatlist.index

doc_compt=[]
for text in flat_list2_syn:
    doc_compt.append(re.sub(r'\b\w{1,4}\b', '', str(text)))


### LDA on the data

In [9]:
random.seed(3425)
stop = set(stopwords.words('english'))
exclude_punct = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    
    #Removing Stop words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    #Removing Punctuation
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude_punct)
    
    #for stemming 
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    result = ''.join([i for i in normalized if not i.isdigit()])
    
    return result

doc_clean = [clean(doc).split() for doc in  doc_compt]        

#Creating a dictionary
dictionary = gensim.corpora.Dictionary(doc_clean)

#Creating a bag of words
bow_corpus = [dictionary.doc2bow(doc) for doc in doc_clean]

#LDA model
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('\n Topic: {} \nWords: {}'.format(idx, topic))        




 Topic: 0 
Words: 0.003*"grannie" + 0.003*"nanna" + 0.002*"grandma" + 0.002*"atomicnumber" + 0.002*"granny" + 0.002*"grandmother" + 0.002*"nanriver" + 0.002*"profit" + 0.001*"friday" + 0.001*"carbon"

 Topic: 1 
Words: 0.003*"state" + 0.002*"atomicnumber" + 0.002*"grandmother" + 0.002*"granny" + 0.002*"nanriver" + 0.002*"nanna" + 0.002*"grannie" + 0.002*"grandma" + 0.001*"express" + 0.001*"tuesday"

 Topic: 2 
Words: 0.005*"atomicnumber" + 0.004*"carbon" + 0.003*"cesium" + 0.003*"granny" + 0.003*"grandmother" + 0.003*"nanriver" + 0.003*"grandma" + 0.003*"nanna" + 0.003*"grannie" + 0.002*"activatedcarbon"

 Topic: 3 
Words: 0.005*"atomicnumber" + 0.003*"grandma" + 0.003*"grannie" + 0.003*"nanriver" + 0.003*"nanna" + 0.003*"granny" + 0.003*"grandmother" + 0.002*"carbon" + 0.002*"cesium" + 0.002*"tuesday"

 Topic: 4 
Words: 0.003*"grandmother" + 0.003*"granny" + 0.003*"nanna" + 0.003*"nanriver" + 0.003*"grandma" + 0.003*"grannie" + 0.003*"atomicnumber" + 0.002*"tuesday" + 0.002*"express"

### Computing Perplexity and Coherence Score

In [10]:
random.seed(3425)
# Compute Perplexity : lowest perplexity is considered the best.
perplexity=lda_model.log_perplexity(bow_corpus)
print('\n Perplexity of the Spam Classification model: ', perplexity)  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\n Coherence Score of the Spam Classification model: ', coherence_lda)


 Perplexity of the Spam Classification model:  -9.482338832093777

 Coherence Score of the Spam Classification model:  0.4211610772803847


# Visualization

In [11]:
pyLDAvis.enable_notebook()
id2word=dictionary
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis