In [1]:
import pandas as pd
import numpy as np
import re
import string
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS
import matplotlib.pyplot as plt


In [2]:
# Absolute location of the e-mail CSV; edit this single constant to run on another machine.
DATA_PATH = r'C:\Users\p_uli\Desktop\Columbia University\Cursos\Fall 22\Capstone\Data\data_uniques.csv'
data = pd.read_csv(DATA_PATH)

In [3]:


def parse(email_text, remove_quoted_statements=True):
    """Split a raw e-mail body into salutation, body, signature and reply text.

    Surrounding whitespace and any automated notation are removed first.
    When ``remove_quoted_statements`` is true, every double-quoted passage is
    replaced by the placeholder ``"[quote]"`` before the zones are extracted.

    Returns a dict with the keys ``salutation``, ``body``, ``signature`` and
    ``reply_text``; each value is whatever the matching ``get_*`` helper
    returns for the cleaned text.
    """
    cleaned = strip_automated_notation(email_text.strip())

    if remove_quoted_statements:
        quote_pattern = """(?P<quoted_statement>".*?")"""
        # The quotes are collected once, then masked one after another, so a
        # later replacement operates on text that already holds placeholders.
        for quoted in re.findall(quote_pattern, cleaned, re.IGNORECASE + re.DOTALL):
            cleaned = cleaned.replace(quoted, '"[quote]"')

    zones = {}
    zones["salutation"] = get_salutation(cleaned)
    zones["body"] = get_body(cleaned)
    zones["signature"] = get_signature(cleaned)
    zones["reply_text"] = get_reply_text(cleaned)
    return zones

#automated_notation could be any labels or sections in the email giving special notation for
#human readers of the email text. For example, email_text may start with "A message from your customer:"
def strip_automated_notation(email_text):
    """Return ``email_text`` with any known automated preamble removed.

    Every pattern in ``notations`` is anchored at the start of the text
    (``re.match``) and exposes the real message through a named group called
    ``email_message``. When a pattern matches, the text is replaced by that
    group and the remaining patterns are tried against the result. Text that
    matches no pattern is returned unchanged.
    """
    # Raw strings keep regex escapes such as \. away from Python's own
    # string-escape processing (a non-raw "\." is an invalid escape sequence).
    notations = [
        r"Hi, there has been a new enquiry from\..*?Enquiry:(?P<email_message>.*)",
    ]
    for notation in notations:
        match = re.match(notation, email_text, re.IGNORECASE | re.DOTALL)
        # A pattern that lacks the named group is skipped instead of raising.
        if match is not None and "email_message" in match.groupdict():
            email_text = match.group("email_message")

    return email_text
    

def get_reply_text(email_text):
    """Return the quoted or forwarded part of ``email_text``, or ``None``.

    The text is searched (case-insensitively, with ``.`` also matching
    newlines) for the first place where one of the known reply/forward
    headers starts; the returned string is the text matched from that header
    onwards. ``None`` is returned when no header is found.
    """
    #Notes on regex
    #Search for classic prefix from GMail and other mail clients "On May 16, 2011, Dave wrote:"
    #Search for prefix from outlook clients From: Some Person [some.person@domain.tld]
    #Search for prefix from outlook clients when used for sending to recipients in the same domain From: Some Person\nSent: 16/05/2011 22:42\nTo: Some Other Person
    #Search for prefix when message has been forwarded
    #Search for From: <email@domain.tld>\nTo: <email@domain.tld>\nDate:<email@domain.tld
    #Search for From:, To:, Sent:
    #Some clients use -*Original Message-*
    #
    #Every fragment is a raw string so that \w, \., \[ and friends reach the regex
    #engine untouched instead of being (invalid) Python string escapes. \n and \r\n
    #are now interpreted by the regex engine and match the same characters as before.
    pattern = (
        r"(?P<reply_text>"
        r'On ([a-zA-Z0-9, :/<>@\."\[\]]* wrote:.*)|'
        r"From: [\w@ \.]* \[mailto:[\w\.]*@[\w\.]*\].*|"
        r"From: [\w@ \.]*(\n|\r\n)+Sent: [\*\w@ \.,:/]*(\n|\r\n)+To:.*(\n|\r\n)+.*|"
        r"[- ]*Forwarded by [\w@ \.,:/]*.*|"
        r"From: [\w@ \.<>\-]*(\n|\r\n)To: [\w@ \.<>\-]*(\n|\r\n)Date: [\w@ \.<>\-:,]*\n.*|"
        r"From: [\w@ \.<>\-]*(\n|\r\n)To: [\w@ \.<>\-]*(\n|\r\n)Sent: [\*\w@ \.,:/]*(\n|\r\n).*|"
        r"From: [\w@ \.<>\-]*(\n|\r\n)To: [\w@ \.<>\-]*(\n|\r\n)Subject:.*|"
        r"(-| )*Original Message(-| )*.*)"
    )
    match = re.search(pattern, email_text, re.IGNORECASE | re.DOTALL)
    if match is None:
        return None
    # The named group wraps every alternative, so it is set whenever there is a match.
    return match.group("reply_text")
    

def get_signature(email_text):
    """Return the signature zone of ``email_text``, or ``None``.

    A leading salutation is removed first so the signature cannot start at
    the very beginning of the message. The signature is the text from the
    first sign-off phrase to the end, minus any reply text found inside it.
    If, after removing a sign-off phrase from its start, that text contains a
    further sign-off, the signature is narrowed to start at the inner one.
    ``None`` is returned when no sign-off phrase occurs or when the signature
    would equal the whole body.
    """
    greeting = get_salutation(email_text)
    if greeting:
        email_text = email_text[len(greeting):]

    # The narrowing loop below compares these against lower-cased text, so an
    # entry only takes part in it when it is itself lower case.
    # NOTE(review): "Best", "bGIF" and "thanking You" are not lower case, so they
    # match in the (case-insensitive) regex but never in the narrowing loop.
    sig_opening_statements = [
                              "warm regards",
                              "kind regards",
                              "regards",
                              "cheers",
                              "many thanks",
                              "thanks",
                              "sincerely",
                              "ciao",
                              "Best",
                              "bGIF",
                              "thank you",
                              "thankyou",
                              "talk soon",
                              "cordially",
                              "yours truly",
                              "thanking You",
                              "sent from my iphone"]
    pattern = "(?P<signature>(" + "|".join(sig_opening_statements) + ")(.)*)"
    flags = re.IGNORECASE + re.DOTALL

    signature = None
    found = re.search(pattern, email_text, flags)
    if found:
        signature = found.group("signature")

        # Quoted/forwarded text that follows the sign-off is not part of the signature.
        quoted = get_reply_text(email_text[email_text.find(signature):])
        if quoted:
            signature = signature.replace(quoted, "")

        # Look for a sign-off inside the current signature to lessen the chance
        # of accidentally stealing words from the body.
        remainder = signature
        for opener in sig_opening_statements:
            if remainder.lower().startswith(opener):
                remainder = remainder[len(opener):]
        nested = re.search(pattern, remainder, flags)
        if nested:
            signature = nested.group("signature")

    # TODO: when no standard sign-off is present, derive the signature
    # heuristically (phone numbers, names, ...). Not implemented.

    # If the signature swallowed the entire body, report no signature so the
    # body keeps the text.
    if signature == get_body(email_text, check_signature=False):
        signature = None

    return signature

#todo: complete this out (this turned out to be a bigger job than expected; it will probably need a Bayesian approach)
def is_word_likely_in_signature(word, text_before="", text_after=""):
    """Return True when ``word`` looks capitalised, otherwise False.

    A word counts as capitalised when its first character is an ASCII
    upper-case letter and its second character is an ASCII lower-case letter.
    ``text_before`` and ``text_after`` are accepted but not used yet.
    """
    # An empty slice is "in" every string, so words shorter than two
    # characters must be rejected explicitly or they would count as capitalised.
    if len(word) < 2:
        return False

    #is it capitalized?
    return word[0] in string.ascii_uppercase and word[1] in string.ascii_lowercase
    
#check_<zone> args provided so that other functions can call get_body without causing infinite recursion
def get_body(email_text, check_salutation=True, check_signature=True, check_reply_text=True):
    """Return ``email_text`` with the enabled zones removed.

    With every flag on, the salutation is cut from the front, then the text is
    truncated where the signature starts, then where the reply text starts.
    A zone whose flag is false is neither looked up nor removed.
    """
    if check_salutation:
        greeting = get_salutation(email_text)
        if greeting:
            email_text = email_text[len(greeting):]

    if check_signature:
        closing = get_signature(email_text)
        if closing:
            cut = email_text.find(closing)
            email_text = email_text[:cut]

    if check_reply_text:
        quoted = get_reply_text(email_text)
        if quoted:
            cut = email_text.find(quoted)
            email_text = email_text[:cut]

    return email_text

def get_salutation(email_text):
    """Return the greeting at the start of ``email_text``, or ``None``.

    Reply text is cut off first. The remaining text must then begin
    (case-insensitively, after optional whitespace) with one of the opening
    phrases, followed by at most five further words and a terminating
    ``.``, ``,``, ``:`` or U+00E2 character. The returned string includes the
    terminator and the whitespace after it, but not the leading whitespace.
    """
    #remove reply text first (e.g. Thanks\nFrom: email@domain.tld causes salutation to consume start of reply_text
    reply_text = get_reply_text(email_text)
    if reply_text:
        email_text = email_text[:email_text.find(reply_text)]
    #Notes on regex:
    #Max of 5 words succeeding first Hi/To etc, otherwise is probably an entire sentence
    salutation_opening_statements = [
                                     "hi",
                                     "dear",
                                     "to",
                                     "hey",
                                     "hello",
                                     "thanks",
                                     "good morning",
                                     "good afternoon",
                                     "good evening",
                                     "thankyou",
                                     "thank you"]
    #Raw strings keep \s, \w and \. from being treated as (invalid) Python string
    #escapes; \xe2 is now resolved by the regex engine to the same character.
    pattern = (
        r"\s*(?P<salutation>("
        + "|".join(salutation_opening_statements)
        + r")+(\s*\w*)(\s*\w*)(\s*\w*)(\s*\w*)(\s*\w*)[\.,\xe2:]+\s*)"
    )
    match = re.match(pattern, email_text, re.IGNORECASE)
    if match is None:
        return None
    # The named group is not optional, so it is set whenever the pattern matches.
    return match.group("salutation")
    

In [4]:
# Show the message stored under label 4 next to the zones parse() extracts from it.
sample_body = data['body'][4]
print(sample_body)
print(parse(sample_body))

Dave, 

 Here are the names of the west desk members by category.  The origination 
side is very sparse.  





Phillip

{'salutation': None, 'body': 'Dave, \n\n Here are the names of the west desk members by category.  The origination \nside is very sparse.  \n\n\n\n\n\nPhillip', 'signature': None, 'reply_text': None}


In [5]:
# Same check for the message stored under label 23499.
sample_body = data['body'][23499]
print(sample_body)
print(parse(sample_body))

See attached for this months liquids gathered from the sample run. results will follow. Thanks,Randy																																								 
{'salutation': None, 'body': 'See attached for this months liquids gathered from the sample run. results will follow. ', 'signature': 'Thanks,Randy', 'reply_text': None}


In [6]:
# Position of 'Dear' in the subject stored under label 0 (-1 means it does not occur).
data['subject'][0].find('Dear')

-1

In [7]:
def find_salutation(x):
    """Return True when ``x`` is a string that contains 'Dear' (case-sensitive).

    Any non-string value, such as the NaN pandas uses for a missing cell,
    gives False. The explicit type check replaces a bare ``except`` that also
    swallowed unrelated errors such as KeyboardInterrupt.
    """
    if not isinstance(x, str):
        return False
    return 'Dear' in x




In [8]:
# Rows whose body contains 'Dear', renumbered from 0.
salutations = data[data['body'].apply(find_salutation)].reset_index(drop=True)

In [9]:
# Preview the first rows of the 'Dear' subset.
salutations.head()

Unnamed: 0,message_id,date,from,to,subject,cc,bcc,body
0,allen-p\all_documents\239,"Mon, 26 Jun 2000 06:57:00 -0700 (PDT)",phillip.allen@enron.com,keith.holst@enron.com,Download Frogger before it hops away!,,,---------------------- Forwarded by Phillip K ...
1,allen-p\all_documents\3,"Wed, 13 Dec 2000 13:28:00 -0800 (PST)",subscriptions@intelligencepress.com,pallen@enron.com,"NGI Publications - Thursday, 14 December 2000",,,"Dear phillip,\n\n\nThis e-mail is automated no..."
2,allen-p\all_documents\307,"Fri, 3 Mar 2000 00:39:00 -0800 (PST)",phillip.allen@enron.com,matthew.lenhart@enron.com,Just Released! Exclusive new animation from St...,,,---------------------- Forwarded by Phillip K ...
3,allen-p\all_documents\355,"Mon, 14 May 2001 09:39:00 -0700 (PDT)",ei_editor@ftenergy.com,einsighthtml@spector.ftenergy.com,Texas puts reliability rules through paces,,,Dear Energy Insight Subscribers.?If you canno...
4,allen-p\all_documents\358,"Mon, 14 May 2001 07:10:00 -0700 (PDT)",announce@inbox.nytimes.com,pallen@ect.enron.com,Pre-selected NextCard Visa! As low as 2.99%,,,"Dear NYTimes.com member,\n\nYour registration ..."


## HTML examples

In [10]:
def has_html(x):
    """Return True when ``x`` is a string containing the literal tag '<html>'.

    The comparison is case-sensitive. Non-string values (for example the NaN
    of a missing body) give False instead of raising TypeError, in line with
    the other per-cell helpers in this notebook that tolerate such values.
    """
    return isinstance(x, str) and '<html>' in x

In [11]:
# Rows whose body contains an '<html>' tag, renumbered from 0.
html_data = data[data['body'].apply(has_html)].reset_index(drop=True)

In [12]:
# Share and absolute number of e-mails whose body contains '<html>'.
n_html = html_data.shape[0]
n_total = data.shape[0]
print('% of emails with html')
print(round(n_html / n_total * 100, 2))
print('# of emails with html')
print(round(n_html))

% of emails with html
0.33
# of emails with html
848


In [13]:
# Preview the first rows of the HTML subset.
html_data.head()

Unnamed: 0,message_id,date,from,to,subject,cc,bcc,body
0,allen-p\deleted_items\73,"Thu, 18 Oct 2001 00:41:45 -0700 (PDT)",oportunity@cells4free.com,pallen@enron.com,Free 2001 Cell Phones..!,,,<!--To put this html into an existing HTML doc...
1,allen-p\inbox\16,"Mon, 31 Dec 2001 15:26:05 -0800 (PST)",exclusive_offers@sportsline.com,pallen@enron.com,Ring in the New Year with Pizza Hut,,,<!--\n========================================...
2,arnold-j\deleted_items\524,"Sun, 28 Oct 2001 15:22:24 -0800 (PST)",specs@wineisit.com,jarnold@ect.enron.com,"GREAT SAVINGS FROM Spec's Wines, Spirits & Fin...",,,<html>\n<head>\n<title>WineISIT.com - Member E...
3,arnold-j\deleted_items\526,"Sun, 28 Oct 2001 01:12:24 -0800 (PST)",newsletter@nakedwomansex.com,john.arnold@enron.com,Protect your Privacy,,,<html>\n<head>\n<title>Internet Eraser</title>...
4,arnold-j\deleted_items\663,"Tue, 20 Nov 2001 14:08:17 -0800 (PST)",specials@vegasinsider.com,jarnold@ect.enron.com,Thanksgiving Bonus from Sportsbook.com,,,<!--\nSPORTSBOOK.COM - www.sportsbook.com\nT...


In [14]:
# subjects
def tolower(x):
    """Return ``x.lower()``, or '' when ``x`` cannot be lower-cased.

    Values without a usable ``lower`` method (None, NaN, numbers, ...) give
    the empty string. Only the errors that signal this are caught, so
    unrelated exceptions are no longer silently swallowed.
    """
    try:
        return x.lower()
    except (AttributeError, TypeError):
        return ''

def remove_hash_symbol(text):
    """Return ``text`` with every '#' character deleted."""
    return re.sub(r'#', '', text)
def remove_punctuation(words):
    """Return ``words`` keeping only word characters and whitespace."""
    return re.sub(r'[^\w\s]', '', words)

def clean(x):
    """Normalise ``x``: lower-case it, then strip '#' and other punctuation."""
    return remove_punctuation(remove_hash_symbol(tolower(x)))


In [18]:
# Cleaned subject line of every e-mail in the HTML subset.
subs = html_data['subject'].apply(clean)
subs


0                                  free 2001 cell phones
1                    ring in the new year with pizza hut
2      great savings from specs wines spirits  finer ...
3                                   protect your privacy
4                  thanksgiving bonus from sportsbookcom
                             ...                        
843                           the moneynet morning scoop
844     williams energy news live  todays video newscast
845                           the moneynet morning scoop
846                              air canadas websaver tm
847          extended websaver continues through january
Name: subject, Length: 848, dtype: object

In [19]:
# Concatenate all cleaned subjects into one space-separated string for the word cloud.
subs2=' '.join(subs)

In [20]:

# Word cloud of the cleaned subject lines of the HTML e-mails.
subject_cloud_options = dict(
    background_color='white',
    mode='RGB',
    max_words=500,
    #stopwords=set(['email','enroncom','enron','please','s']),
    stopwords=STOPWORDS,
    max_font_size=150,
    random_state=50,
    scale=2,
    collocations=False,
)
wc = WordCloud(**subject_cloud_options).generate(subs2)

plt.imshow(wc)
plt.axis('off')
plt.show()

NameError: name 'plt' is not defined

In [None]:

def get_domain(x):
    """Return the domain part of e-mail address ``x``, or ``None``.

    Everything after the first '@' is taken (the whole string when there is
    no '@') and every character other than ASCII letters and '.' is removed,
    so digits and hyphens inside a domain are dropped as well. Non-string
    values give ``None``; the explicit type check replaces a bare ``except``
    that also hid unrelated errors.
    """
    if not isinstance(x, str):
        return None
    after_at = x[x.find('@') + 1:]
    return re.sub("[^A-Za-z.]", "", after_at)


In [None]:
# Sender domain of every e-mail in the HTML subset.
domains = html_data['from'].apply(get_domain)

# Number of distinct domains, then the 22 most frequent ones with their counts.
print(len(set(domains)))
top_domains = domains.value_counts()[:22]
pd.DataFrame(zip(list(top_domains.index), top_domains), columns=['domain', 'count'])

In [None]:
# Word cloud of the sender domains of the HTML e-mails.
# get_domain returns None for a sender it cannot read and str.join raises
# TypeError on None, so missing domains are dropped before joining.
domain_text = ' '.join(domains.dropna())

wc = WordCloud(
    background_color='white',
    mode='RGB',
    max_words=500,
    max_font_size=150,
    random_state=50,
    scale=2,
    collocations=False,
).generate(domain_text)

plt.imshow(wc)
plt.axis('off')
plt.show()

In [None]:
# (rows, columns) of the full e-mail table loaded at the top of the notebook.
data.shape