# Import necassary libraries and files

In [7]:
from pathlib import Path
import pandas as pd
from email import policy
from email.parser import BytesParser
import os
import re
from bs4 import BeautifulSoup
import numpy as np
import nltk.corpus
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import DutchStemmer
import spacy
import dutch_words
from imap_tools import MailBox, AND
lemmaModel = spacy.load('nl_core_news_lg', disable = ['parser','ner'])

# Checked wordlist
#dutchCorpusFile = open(Path(os.getcwd() + '/opentaal-wordlist-master/elements/basiswoorden-gekeurd.txt'))

# Unchecked wordlist
dutchCorpusFile = open(Path(os.getcwd() + '/opentaal-wordlist-master/wordlist.txt'))
dutchCorpusData = dutchCorpusFile.read()
dutchCorpus = dutchCorpusData.replace('\n', '.').split(".")
dutchCorpusFile.close()

# set column width to maximum for better visibility of data
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jensk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jensk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Define cleanup/pre-processing functions

In [8]:
# Function that extracts text from HTML tags
def ExtractHTML(contentInput):
    contentOutput = BeautifulSoup(contentInput).get_text()
    return contentOutput

# Function that removes escape characters (for example newlines)
def RemoveEscapeCharacters(contentInput):
    escapes = ''.join([chr(char) for char in range(1, 32)])
    #translator = str.maketrans(escapes, ' ')
    #content = content.translate(translator)
    contentOutput = re.sub(r'[' + escapes + r']',' ', contentInput)
    return contentOutput

# Function that removes URL's from mails (maybe not necassary)
def RemoveURLs(contentInput):
    contentOutput = re.sub(r'http\S+', ' ', contentInput)
    return contentOutput
    
# Function that removes extra whitespaces
def RemoveExcessWhitespaces(contentInput):
    contentOutput = re.sub(' {2,}', ' ', contentInput)
    return contentOutput

# Function for removing excess non-alphanumeric characters (and punctuation)
def RemoveNonAlphanumeric(contentInput, removePunctuation = False):
    # with punctuation
    if removePunctuation == True:
        contentOutput = re.sub(r'[^A-Za-z0-9 ]+', ' ',contentInput)
        
    # without punctuation
    else:
        contentOutput = re.sub(r'[^A-Za-z0-9 ,?.:;!_-]+', '',contentInput)
        contentOutput = re.sub(r'([,?.:;!_-]\s*){2,}','',contentOutput)
        
    return contentOutput

# Function for additional filtering related to privacy
def PrivacyFiltering(contentInput):
    contentOutput = re.sub(r'(BIC:) [A-Z]*',' ',contentInput)
    contentOutput = re.sub(r'\w*\d\w*', ' ', contentOutput).strip()
    return contentOutput

# function for text to lower case
def toLowerCase(contentInput):
    contentOutput = contentInput.lower()
    return contentOutput

# Stopwords removal function
def StopwordRemoval(contentInput, languageCode):
    stop = stopwords.words(languageCode)
    contentOutput =  " ".join([word for word in contentInput.split() if word not in (stop)])
    return contentOutput

# Stemming function
def Stemmer(contentInput):
    tokenizedWords = word_tokenize(contentInput, language='dutch')
    stemmedContent = []
    stemmer = DutchStemmer()
    for word in tokenizedWords:
        stemmedContent.append(stemmer.stem(word))
        stemmedContent.append(" ")
    return "".join(stemmedContent)

# Lemmatization function
def Lemmatizer(contentInput):
    document = lemmaModel(contentInput)
    return " ".join([token.lemma_ for token in document])

# Function to remove words not in dictionary
def RemoveNonDictionaryWords(contentInput):
        contentOutput =  " ".join([word for word in contentInput.split() if word in (dutchCorpus)])
        return contentOutput

# Extract content from the emails

In [3]:
# define and print path to .eml files (emails)
pathString = os.getcwd() + 'path to email files'
path = Path(pathString)
pathLength = len(pathString)
print(path)

# grab every file with the extension .eml
email_files = list(path.glob('*.eml'))

# create lists for the names and content of the emails + filecounter
names = []
contents = []
counter = 1
fileCount = len(email_files)
totalCharacterCount = 0

# loop over all found files
for email in email_files:
    
    #open each file in read bytes mode
    with open(email,'rb') as filepointer:
        
        # name is original filename minus the path and extension
        name = filepointer.name[pathLength:-4]
        
        # Parse data from email to message object
        message = BytesParser(policy=policy.default).parse(filepointer)
        
    # pass the plain text from the body of the email to a string variable. If no plain text is availible, 
    # just pass everything in the body
    try:
        content = message.get_body(preferencelist=('plain')).get_content()
    except:
        content = message.get_body().get_content()
    
    # Extract text from any HTML that is present.
    content = ExtractHTML(content)
    
    # Remove escape characters (for example newlines)
    content = RemoveEscapeCharacters(content)
    
    # Remove any non-ascii characters
    # content = content.encode('ascii', errors='ignore').decode()
    
    # Remove URL's from mails (maybe not necassary)
    content = RemoveURLs(content)  
    
    # Additional filtering for privacy
    content = PrivacyFiltering(content)
    
    # Remove non-alphanumeric characters
    content = RemoveNonAlphanumeric(content)
    
    # remaining text to lower case
    #content = toLowerCase(content)
    
    # Get amount of characters in all text
    totalCharacterCount += len(content)
     
    # remove stopwords
    #content = StopwordRemoval(content, 'dutch')
    
    # Stemming or lemmatization
    #content = Stemmer(content)
    #content = Lemmatizer(content)
    
    # remove words that are not in a dictionary (dutch in this case)
    #content = RemoveNonDictionaryWords(content)
    
    # Remove extra whitespaces
    content = RemoveExcessWhitespaces(content)
    
    # add name and content of current email to their respective lists
    names.append(name)
    contents.append(content)
    
    #close the current file
    filepointer.close()
    
    # filecounter
    print("Counter: " + str(counter) + '/' + str(fileCount), end="\r")
    counter += 1
    
print('Total character count: ' + str(totalCharacterCount))

C:\Users\kerseje\Bachelerproef jupyter notebooks\BrainjarMails
Counter: 80/1855



Total character count: 1851465


### Turn lists into dataframe for easy exploration

In [4]:
dfNames = pd.DataFrame([names, contents]).T
dfNames.columns = ['names', 'contents']

### Set class index based on title

In [5]:
dfNames['classIndex'] = 0
dfNames['classIndex'] = np.where(dfNames['names'].str.contains('facturen'), 1, dfNames['classIndex'])
dfNames['classIndex'] = np.where(dfNames['names'].str.contains('aanmaningen'), 2, dfNames['classIndex'])

Class list:
- 0 = Other
- 1 = invoice
- 2 = Payement reminder

### Drop columns wich contain Nan/Null

In [6]:
print(dfNames.isna().sum().sum())
dfNames = dfNames.dropna()

0


### Display top 20 rows

In [None]:
dfNames.head(20)

### Make different dataframe for sentiment analysis

In [8]:
datasetSentiment = dfNames.drop(dfNames[dfNames['classIndex'] == 0].index,axis=0)
datasetSentiment.loc[datasetSentiment['classIndex'] == 1, 'classIndex'] = 0
datasetSentiment.loc[datasetSentiment['classIndex'] == 2, 'classIndex'] = 1

### Save dataframe to csv file

In [9]:
dfNames.to_csv('test_extraction_emails.csv')
datasetSentiment.to_csv('sentiment_exctraction_emails.csv')

# Same operations on mailbox e-mails
(see 'reading e-mails from inbox.ipynb' for reading script)

### Password function

In [10]:
def GetPasswordFromFile(filePath):
    f = open(filePath,"r")
    password = f.read()
    f.close()
    return password

### Reading script

In [None]:
mb = MailBox('<mailbox>').login('<email adress>',GetPasswordFromFile(os.getcwd() + '<password file>'))
messages = mb.fetch(criteria=AND(seen=False), mark_seen=False, bulk=True)

contents = []
for msg in messages:
    
    content = msg.text
    
    content = ExtractHTML(content)
    content = RemoveEscapeCharacters(content)
    content = RemoveURLs(content)
    content = PrivacyFiltering(content)
    content = RemoveNonAlphanumeric(content)
    content = RemoveExcessWhitespaces(content)
    
    contents.append(content)
    
contentsDf = pd.DataFrame(contents, columns=['contents'])
contentsDf

### Save to csv file

In [12]:
contentsDf.to_csv('inbox_emails.csv')

# Clean second dataset

In [15]:
def CleanNewMails(mailSet):
    contents = []
    counter = 1
    fileCount = len(mailSet)
    totalCharacterCount = 0
    
    for email in mailSet:
        with open(email,'rb') as filepointer:
            message = BytesParser(policy=policy.default).parse(filepointer)
        
        try:
            content = message.get_body(preferencelist=('plain')).get_content()
        except:
            content = message.get_body().get_content()
        
        content = ExtractHTML(content)
    
        content = RemoveEscapeCharacters(content)
    
        content = RemoveURLs(content)  
    
        content = PrivacyFiltering(content)
    
        content = RemoveNonAlphanumeric(content)
        
        totalCharacterCount += len(content)

        content = RemoveExcessWhitespaces(content)
        
        content = toLowerCase(content)
    
        contents.append(content)
    
        filepointer.close()
    
        print("Counter: " + str(counter) + '/' + str(fileCount), end="\r")
        counter += 1
    
    print('Total character count: ' + str(totalCharacterCount))
    return contents

In [16]:
primaryPathString = os.getcwd() + 'path to email files'

mailsAndere = list(Path(primaryPathString + '//Andere').glob('*.eml'))
mailsCN = list(Path(primaryPathString + '//CN').glob('*.eml'))
mailsOrders = list(Path(primaryPathString + '//Orders').glob('*.eml'))
mailsRappels = list(Path(primaryPathString + '//Rappels').glob('*.eml'))
mailsTechnoCargo = list(Path(primaryPathString + '//TechnoCargo').glob('*.eml'))

mailsAndereClean = pd.DataFrame(CleanNewMails(mailsAndere), columns=['contents'])
mailsCNClean = pd.DataFrame(CleanNewMails(mailsCN), columns=['contents'])
mailsOrdersClean = pd.DataFrame(CleanNewMails(mailsOrders), columns=['contents'])
mailsRappelsClean = pd.DataFrame(CleanNewMails(mailsRappels), columns=['contents'])
mailsTechnoCargoClean = pd.DataFrame(CleanNewMails(mailsTechnoCargo), columns=['contents'])

Total character count: 7256554
Total character count: 108205
Total character count: 8193367
Total character count: 694097
Total character count: 332180


### Save to csv files

In [17]:
savePath = 'path to save files'
mailsAndereClean.to_csv(savePath + '\Andere.csv')
mailsCNClean.to_csv(savePath + '\CN.csv')
mailsOrdersClean.to_csv(savePath + '\Orders.csv')
mailsRappelsClean.to_csv(savePath + '\Rappels.csv')
mailsTechnoCargoClean.to_csv(savePath + '\TechnoCargo.csv')

# Conclusion:
E-mails need a lot of cleaning to extract just the text and leave metacharacters (such as HTML or escape characters) out of the processed results. Beyond that we need more specific preprocessing steps, determined by the model that it will feed through. Unsupervised models will need stopword removal and lemmitazation/stemming to achieve higher performance, while transfer learning models like BERT will be hurt by these preprocessing steps.

#### Sources:
- https://stackoverflow.com/questions/8115261/how-to-remove-all-the-escape-sequences-from-a-list-of-strings
- https://enjoylifescience.com/2020/11/05/analyzing-emails-in-python/
- https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
- https://towardsdatascience.com/remove-personal-information-from-text-with-python-232cb69cf074
- https://monkeylearn.com/blog/text-cleaning/#:~:text=Text%20cleaning%20can%20be%20performed,words%20to%20their%20root%20form.&text=You'd%20need%20to%20perform,Removing%20Stopwords
- https://www.datacamp.com/tutorial/stemming-lemmatization-python
- https://www.projectpro.io/recipes/use-spacy-lemmatizer
- https://pypi.org/project/dutch-words/ (Original dictionary, replaced by Opentaal wordlist)
- https://github.com/OpenTaal/opentaal-wordlist

# License

OpenTaal wordlist: Hagen, H., Brouwer, S., Baars, R., Roeckx, K., Maryns, H., Waalboer, J., & Knubben, B. (2017, Februari 16). opentaal-wordlist. Opgehaald van Github: https://github.com/OpenTaal/opentaal-wordlist