### Import necessary libraries

In [1]:
import pandas as pd
import nltk
import re
import time
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#nltk.download('wordnet')

### Read data into dataframe

In [3]:
# Load the e-mail dataset (columns shown in the preview: Subject, Message, label).
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): the preview also shows an 'Unnamed: 0' column, which looks like a
# previously saved index being read back as data - confirm whether it should be
# dropped (e.g. with index_col=0) before it is carried into the processed file.
db=pd.read_csv('data/enron_spam_ham_email_clean.csv')
db.head(10)

Unnamed: 0,Subject,Message,label
0,christmas tree farm picture,message text,0
1,vastar resource inc .,gary production high island large block commen...,0
2,calpine daily gas nomination,calpine daily gas nomination .,0
3,issue,fyi see note already . stella forward stella l...,0
4,meter nov allocation,forward lauri allen hou ect pm kimberly vaughn...,0
5,mcmullen gas,jackie since inlet river plant shut last day f...,0
6,meter jan,george need follow jan zero receipt package id...,0
7,dun number change,fyi forward gary l payne hou ect pm antoine v ...,0
8,king ranch,two field gas difficulty unify system . cage r...,0
9,entex transistion,thanks much memo . would like reiterate suppor...,0


### Define processing functions

In [293]:
# Single WordNet lemmatizer instance, reused by lemma() for every token.
lemmatizer = WordNetLemmatizer()

def regex_processing(msg):
    """Normalise raw e-mail text ahead of sentence tokenisation.

    Steps, in order:
      1. lowercase the text and replace every character that is not a-z or
         '.' with a space;
      2. collapse each run of whitespace into a single space;
      3. collapse every chain of whitespace-separated dot groups
         (". .", ". . .", ".. .") into a single '.', so stray dots do not
         produce empty sentences when the text is split later.

    Args:
        msg (str): raw text.

    Returns:
        str: the normalised text. Leading/trailing spaces are not stripped,
        and dots with no whitespace between them (e.g. "...") are left as
        they are.
    """
    # Raw strings keep the backslash sequences as regex escapes instead of
    # invalid Python string escapes, which newer interpreters warn about.
    msg = re.sub(r'[^a-z.]', ' ', msg.lower())
    msg = re.sub(r'\s+', ' ', msg)
    # Consume the whole chain in one match: matching a single
    # "dots, space, dot" pair at a time turned ". . ." into ". .",
    # leaving a double dot behind.
    msg = re.sub(r'\.+(?:\s+\.+)+', '.', msg)
    return msg

def sentence_dropping(msg, n=2):
    """Remove very short sentences from ``msg``.

    The text is split with ``sent_tokenize`` and only sentences containing
    more than ``n`` whitespace-separated words are kept.

    Args:
        msg (str): text to filter.
        n (int): a sentence is kept only if it has more than ``n`` words.
            Defaults to 2, the threshold that used to be hard-coded.

    Returns:
        str: the surviving sentences joined by single spaces, or ``msg``
        unchanged when no sentence survives, so a short message is not
        wiped out.
    """
    kept = [sent for sent in sent_tokenize(msg) if len(sent.split()) > n]
    # Fall back to the input rather than returning an empty string.
    return ' '.join(kept) if kept else msg

# Maps the POS tags returned by nltk.pos_tag (Penn Treebank style) to the
# single-letter WordNet POS codes passed to lemmatizer.lemmatize() in lemma().
# Tags missing from this table are lemmatized without an explicit POS.
ref_tag={
    **dict.fromkeys(('NN','NNS','NNP','NNPS'),'n'),              # nouns
    **dict.fromkeys(('VB','VBD','VBG','VBN','VBP','VBZ'),'v'),   # verbs
    **dict.fromkeys(('JJ','JJR','JJS'),'a'),                     # adjectives
    **dict.fromkeys(('RB','RBR','RBS'),'r'),                     # adverbs
    # NOTE(review): 'ADJ_SAT' does not look like a Penn Treebank tag, so this
    # entry may never be hit by pos_tag output - confirm.
    'ADJ_SAT':'s',
}

# English stop words held in a set; lemma() filters tokens with `word not in stop_words`.
stop_words=set(stopwords.words('english'))

def lemma(msg,def_txt):
    """Clean, lemmatize and stop-word-filter one piece of e-mail text.

    Pipeline: ``regex_processing`` -> ``sentence_dropping`` -> then, for each
    sentence: POS-tag the tokens, lemmatize each token (with the WordNet POS
    from ``ref_tag`` when the tag is mapped, otherwise with the lemmatizer's
    default), remove stop words, and run ``sentence_dropping`` once more.

    Args:
        msg: text to process. Non-string values (e.g. the NaN pandas yields
            for an empty CSV cell, or None) are treated as missing text.
        def_txt (str): placeholder returned when there is nothing to process.

    Returns:
        str: the processed sentences joined by single spaces, or ``def_txt``
        when ``msg`` is not a string or yields no sentences.
    """
    # A NaN/None cell used to crash inside regex_processing on `.lower()`;
    # treat it the same way as text that produces no sentences.
    if not isinstance(msg, str):
        return def_txt

    sentences = sent_tokenize(sentence_dropping(regex_processing(msg)))
    if not sentences:
        return def_txt

    # Tag all sentences of the message with one batch call instead of one
    # nltk.pos_tag call per sentence.
    tagged_sentences = nltk.pos_tag_sents([word_tokenize(sentence) for sentence in sentences])

    processed = []
    for tags in tagged_sentences:
        lemmas = []
        for word, tag in tags:
            pos = ref_tag.get(tag)  # single lookup; None when the tag is unmapped
            lemmas.append(lemmatizer.lemmatize(word, pos) if pos else lemmatizer.lemmatize(word))
        lemmatized = ' '.join(lemmas)

        # Re-tokenise the joined lemmas and drop stop words; if that would
        # remove every token, keep the unfiltered sentence instead.
        without_stops = ' '.join(word for word in word_tokenize(lemmatized) if word not in stop_words)
        processed.append(sentence_dropping(without_stops or lemmatized))

    return ' '.join(processed)

### Process 'Subject' column

In [296]:
# Lemmatize every subject line in place and report how long it took.
t=time.time()

# Iterate over the column's values instead of db.loc[i, ...]: the index-based
# loop assumed the frame's index labels were exactly 0..n-1 and paid for a
# label lookup plus a scalar write on every row.
db['Subject']=[lemma(subject,'subject') for subject in db['Subject']]

# Read the clock once so minutes and seconds describe the same instant, and
# round before splitting so the seconds part can never print as "60".
minutes,seconds=divmod(round(time.time()-t),60)
print(f'Total Elapsed (Subject): {minutes} m {seconds} s')

Total Elapsed (Subject): 1 m 38 s


### Process 'Message' column

In [298]:
# Lemmatize every message body in place, report how long it took, then save
# the processed frame.
t=time.time()

# Iterate over the column's values instead of db.loc[i, ...]: the index-based
# loop assumed the frame's index labels were exactly 0..n-1 and paid for a
# label lookup plus a scalar write on every row.
db['Message']=[lemma(message,'message text') for message in db['Message']]

# Read the clock once so minutes and seconds describe the same instant, and
# round before splitting so the seconds part can never print as "60".
minutes,seconds=divmod(round(time.time()-t),60)
print(f'Total Elapsed (Message): {minutes} m {seconds} s')

# index=False keeps the DataFrame index out of the written file.
db.to_csv("data/enron_spam_ham_email_processed_v2.csv", index=False)

Total Elapsed (Message): 16 m 26 s
