This notebook documents the preprocessing and feature engineering of the 250 emails' dataframe.

The original emails must first be cleansed and formatted into a dataframe using the notebook 0_apache_camel_email_dataset_extraction.ipynb.

spaCy (with the "en_core_web_sm" model) and Sense2Vec must be installed. See https://spacy.io/usage and https://github.com/explosion/sense2vec for instructions.

In [None]:
# Notebook configuration: every file-system location used by the cells below is set here.
# NOTE(review): the paths are relative, presumably to the directory the notebook is run from — confirm.
import_emails_path = "data/camel_emails_final_na_clean.csv" # Input CSV path for extracted emails and labels
s2v_model_path = "helpers/s2v" # Input path to Sense2Vec folder
save_path_final_df = "data/camel_emails_emb_s2v.csv" # Output CSV path for final preprocessed emails and labels

# Imports

In [None]:
import pandas as pd
import numpy as np
import spacy
from sense2vec import Sense2VecComponent
from sense2vec import Sense2Vec
import csv
import regex as re
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from collections import Counter
from itertools import chain
import numpy as np
import copy
import json
from spacy.pipeline import Sentencizer
import csv
import random
from sklearn.metrics.cluster import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import fcluster, dendrogram, linkage
from sklearn.metrics.pairwise import cosine_similarity
import datetime

# Load the Sense2Vec vectors from the folder configured in s2v_model_path.
# The resulting `s2v` object is read as a global by the embedding functions below.
s2v = Sense2Vec().from_disk(s2v_model_path)

# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Load data

In [None]:
# Columns carried through the rest of the notebook, in the order they should appear.
cols = [
    'Email_ID', 'Date', 'From', 'To', 'Subject', 'Body_clean',
    'Workflow Label', 'Trace_ID', 'Actions', 'Accepted Traces', 'Action',
]
# Read the extracted emails and keep only the columns listed above.
emails = pd.read_csv(import_emails_path, quoting=csv.QUOTE_ALL)[cols]

In [None]:
# Preview the loaded emails ordered by their Date column.
# Date has not been parsed at this point (read_csv was called without date parsing), so the
# ordering is lexicographic on the raw string, not chronological across different UTC offsets.
emails[['Email_ID', 'Date', 'From', 'To', 'Subject', 'Body_clean', 'Trace_ID', 'Action']].sort_values('Date')

Unnamed: 0,Email_ID,Date,From,To,Subject,Body_clean,Trace_ID,Action
211,43,2017-04-14 10:15:23-07:00,revathykuberan@gmail.com,['users@camel.apache.org'],Re: Multiple from end points traversing to dif...,Subject: Re: Multiple from end points travers...,9,ask a question
0,1,2017-04-14 10:42:39+00:00,lburgazzoli@apache.org,['commits@camel.apache.org'],camel git commit: HeaderSelectorProducer to su...,Subject: camel git commit: HeaderSelectorProd...,1,commit changes
1,2,2017-04-14 10:52:55+00:00,davsclaus@apache.org,['commits@camel.apache.org'],[1/6] camel git commit: Rename catalog to runt...,Subject: [1/6] camel git commit: Rename catal...,2,commit changes
2,3,2017-04-14 10:52:56+00:00,davsclaus@apache.org,['commits@camel.apache.org'],[2/6] camel git commit: Rename catalog to runt...,Subject: [2/6] camel git commit: Rename catal...,2,commit changes
3,4,2017-04-14 10:52:57+00:00,davsclaus@apache.org,['commits@camel.apache.org'],[3/6] camel git commit: Rename catalog to runt...,Subject: [3/6] camel git commit: Rename catal...,2,commit changes
...,...,...,...,...,...,...,...,...
118,239,2017-04-19 14:21:57+02:00,lburgazzoli@gmail.com,['dev@camel.apache.org'],Re: Camel 2.19 Roadmap,Subject: Re: Camel 2.19 Roadmap To: dev@camel...,24,version release planning
244,242,2017-04-19 14:29:00+02:00,claus.ibsen@gmail.com,['users@camel.apache.org'],Re: Spring Boot > 1.5 fails when camel-swagger...,Subject: Re: Spring Boot > 1.5 fails when cam...,64,ask a question
247,247,2017-04-19 14:35:40+02:00,zoran@regvart.com,['users@camel.apache.org'],Re: Spring Boot > 1.5 fails when camel-swagger...,Subject: Re: Spring Boot > 1.5 fails when cam...,64,provide support
249,249,2017-04-19 15:27:36+02:00,claus.ibsen@gmail.com,['users@camel.apache.org'],Re: Spring Boot > 1.5 fails when camel-swagger...,Subject: Re: Spring Boot > 1.5 fails when cam...,64,ask a question


# Preprocess data

Extract domain name from sender email.

In [None]:
# Keep only the first dot-separated label of whatever follows the "@" in the sender address,
# e.g. "someone@gmail.com" -> "gmail". The intermediate host name is held in a local
# variable instead of a temporary dataframe column.
sender_host = emails['From'].str.split('@').str[1]
emails['domain'] = sender_host.str.split('.').str[0]

Replace a missing body with the subject line.

In [None]:
# Rows without a cleaned body fall back to their subject line; all other rows are left as they are.
body_missing = emails['Body_clean'].isna()
emails.loc[body_missing, 'Body_clean'] = emails.loc[body_missing, 'Subject']

Split body into sentences

In [None]:
# Load the spaCy pipeline that the sentence-splitting and named-entity cells below call through the global `nlp`.
nlp = spacy.load("en_core_web_sm")

stop_chars = ['\n', '\n\n', '!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '！', '．', '？', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '｡', '。']
# Configure a rule-based sentencizer that treats every entry of stop_chars as a sentence terminator.
config = {"punct_chars": stop_chars}
# NOTE(review): add_pipe without a position appends the component at the end of the pipeline,
# i.e. after the components shipped with en_core_web_sm. Confirm that the custom boundaries
# actually take effect once sentence starts have already been set upstream (the spaCy
# sentencizer exposes an `overwrite` setting and add_pipe accepts `before=`/`first=`).
nlp.add_pipe("sentencizer", config=config)

def split_sentences(x):
    '''
        Segment a text into sentences with the global `nlp` pipeline.
            x: the text from which sentences should be extracted

        Returns a list with one item per entry of the processed document's `sents`.
    '''
    return list(nlp(x).sents)

In [None]:
def _sentence_strings(text):
    """Split `text` with split_sentences and return each sentence as a plain string, whitespace included."""
    return [''.join(tok.text_with_ws for tok in sent) for sent in split_sentences(text)]


# Everything before the first 'wrote:\n>' marker is kept as the unquoted part of the body.
emails['Body_unquoted'] = emails['Body_clean'].str.split('wrote:\n>').str[0]

# (source text column, derived sentence-list column); the order fixes the resulting column order.
sentence_columns = (
    ('Body_clean', 'body_sentences'),
    ('Body_unquoted', 'body_unquoted_sentences'),
    ('Subject', 'subject_sentences'),
)
for text_col, sentences_col in sentence_columns:
    emails[sentences_col] = emails[text_col].map(_sentence_strings)

# Subject sentences first, followed by the sentences of the full body.
emails['all_sentences'] = emails['subject_sentences'] + emails['body_sentences']

Extract named entities

In [None]:
def extract_named_entities(x, nlp):
    '''
        Run a pipeline over a text and hand back the entities it found.
            x: the text to analyse
            nlp: callable applied to `x`; its result must expose an `ents` attribute

        Returns the `ents` attribute of `nlp(x)` unchanged.
    '''
    return nlp(x).ents

In [None]:
# Run named-entity extraction separately on the subject line and on the cleaned body of every email.
ner_columns = (
    ('Subject', 'named_entities_subject'),
    ('Body_clean', 'named_entities_body'),
)
for text_col, entities_col in ner_columns:
    emails[entities_col] = emails[text_col].map(lambda text: extract_named_entities(text, nlp))

# Concatenate the string forms of both entity collections into a single text column.
subject_entities_text = emails['named_entities_subject'].astype(str)
body_entities_text = emails['named_entities_body'].astype(str)
emails['named_entities'] = subject_entities_text + body_entities_text

Compute the email embeddings for the instance discovery baseline.

In [None]:
# Rebind the global `nlp` to a freshly loaded pipeline. This replaces the earlier instance that
# had the custom sentencizer added; functions that read the global name (split_sentences and the
# embedding functions below) use this new pipeline from here on.
nlp = spacy.load("en_core_web_sm")

def s2v_sentence_embedding(sentence):
    """
        Average the Sense2Vec vectors of a sentence's tokens.
            sentence: the sentence to be embedded.

        Every token is looked up under the key "<lemma_>|<pos_>". Tokens with an
        empty lemma, or whose key yields no vector, are skipped. Returns the
        element-wise mean of the vectors found, or np.nan when none was found.
    """
    token_vectors = []
    for token in nlp(sentence):
        lemma = str(token.lemma_)
        if not lemma:
            continue
        vector = s2v[lemma + "|" + str(token.pos_)]
        if vector is not None:  # the lookup yields None for words it cannot embed
            token_vectors.append(vector)

    if not token_vectors:  # nothing in this sentence could be embedded
        return np.nan
    return np.mean(np.array(token_vectors), axis=0)
    

def email_embedding(sentences):
    """
        Embed an email as the mean of its sentence embeddings.
            sentences: iterable of sentences; each one is converted with str()
                       and embedded with s2v_sentence_embedding.

        Sentence embeddings that are NaN, or contain any NaN, are left out.
        Returns the element-wise mean of the remaining embeddings, or np.nan
        when none remain.
    """
    candidates = (s2v_sentence_embedding(str(sentence)) for sentence in sentences)
    usable = [vector for vector in candidates if not np.any(np.isnan(vector))]

    if len(usable) == 0:
        return np.nan
    return np.mean(np.array(usable), axis=0)
    

def s2v_sentence_ne_embedding(sentence):
    """
        Embed a sentence's named entities using Sense2Vec model.
            sentence: the sentence from which NE need to be embedded.

        Every entity in the processed sentence's `ents` is looked up under the
        key "<lemma_>|<label_>", with spaces in the lemma replaced by
        underscores. Entities labelled "CARDINAL", entities with an empty
        lemma and entities without a vector are skipped. Returns the
        element-wise mean of the vectors found, or np.nan when none was found.
    """
    doc = nlp(sentence)
    entity_vectors = []
    for entity in doc.ents:
        entity_text = str(entity.lemma_)
        entity_label = str(entity.label_)
        if not entity_text or entity_label == "CARDINAL":  # Embed only string NE
            continue
        # Sense2Vec keys join the words of a phrase with underscores, so a
        # multi-word entity looked up with spaces would never be found.
        key = entity_text.replace(" ", "_") + "|" + entity_label
        vector = s2v[key]
        if vector is not None:  # Some NE may not be embeddable.
            entity_vectors.append(vector)

    if not entity_vectors:  # None of the sentence's NE could be embedded
        return np.nan
    return np.mean(np.array(entity_vectors), axis=0)
    

def email_ne_embedding(sentences):
    """
        Embed an email as the mean of its sentences' named-entity embeddings.
            sentences: iterable of sentences; each one is converted with str()
                       and embedded with s2v_sentence_ne_embedding.

        Sentences whose named-entity embedding is NaN (or contains any NaN) are
        left out. Returns the element-wise mean of the remaining embeddings, or
        np.nan when no sentence yields one.
    """
    sentences_embeddings = []
    for sentence in sentences:
        # Use the named-entity embedder here: calling s2v_sentence_embedding
        # would make this function an exact duplicate of email_embedding.
        sentence_emb = s2v_sentence_ne_embedding(str(sentence))
        if not np.any(np.isnan(sentence_emb)):
            sentences_embeddings.append(sentence_emb)

    if len(sentences_embeddings) == 0:
        return np.nan
    return np.mean(np.array(sentences_embeddings), axis=0)

In [None]:
# Text embeddings: each sentence list is reduced to one vector by email_embedding (NaN when nothing is embeddable).
emails["body_embedding"] = emails.body_sentences.map(email_embedding)
emails["body_unquoted_embedding"] = emails.body_unquoted_sentences.map(email_embedding)
emails["subject_embedding"] = emails.subject_sentences.map(email_embedding)

# Date feature: seconds since the Unix epoch, as a float.
# The value is computed by explicit epoch arithmetic rather than `astype(int) / 10**9`, so it
# depends neither on the platform's default integer width nor on the datetime column having
# nanosecond resolution.
sent_at_utc = pd.to_datetime(emails.Date, utc=True)
emails['date_embedding'] = (sent_at_utc - pd.Timestamp("1970-01-01", tz="UTC")) / pd.Timedelta(seconds=1)

# Named-entity embeddings over all sentences, the unquoted body only, and the subject only.
emails['named_entities_embedding'] = emails.all_sentences.map(email_ne_embedding)
emails['named_entities_body_unquoted_embedding'] = emails.body_unquoted_sentences.map(email_ne_embedding)
emails['named_entities_subject_embedding'] = emails.subject_sentences.map(email_ne_embedding)

# Impute missing named-entity embeddings with the column average.
# Each entry pairs an embedding column with the message printed next to its imputed-row count.
imputation_targets = (
    ('named_entities_embedding', "All named entities embeddings imputed with average: "),
    ('named_entities_body_unquoted_embedding', "Unquoted body named entities embeddings imputed with average: "),
    ('named_entities_subject_embedding', "Subject line named entities embeddings imputed with average: "),
)

for ne_column, report_label in imputation_targets:
    avg_ne_emb = emails[ne_column].mean()
    missing_rows = emails.index[emails[ne_column].isnull()]
    nan_count = 0
    for row in missing_rows:
        # .at writes the whole average vector into a single cell
        emails.at[row, ne_column] = avg_ne_emb
        nan_count += 1
    print(report_label, str(nan_count))

All named entities embeddings imputed with average:  0
Unquoted body named entities embeddings imputed with average:  0
Subject line named entities embeddings imputed with average:  0


Compute emails embedding for the activity discovery baseline.

# Save and sanity check

In [None]:
# Write the preprocessed dataframe to the configured output path (no index column, every field quoted).
emails.to_csv(save_path_final_df, index=False, quoting=csv.QUOTE_ALL)
# Read the file straight back as a round-trip sanity check.
# NOTE(review): columns that hold lists or arrays (sentence lists, embeddings) presumably come
# back as their string representation — confirm how downstream notebooks parse them.
df = pd.read_csv(save_path_final_df, quoting=csv.QUOTE_ALL)

# Report (rows, columns) of the reloaded dataframe.
print(df.shape)

(250, 26)


In [None]:
# Preview the reloaded dataframe ordered by its Date column.
# Date is read back without date parsing, so the ordering is lexicographic on the raw string,
# not chronological across different UTC offsets.
df[['Email_ID', 'Date', 'From', 'To', 'Subject', 'Body_clean', 'Trace_ID', 'Action']].sort_values('Date')

Unnamed: 0,Email_ID,Date,From,To,Subject,Body_clean,Trace_ID,Action
211,43,2017-04-14 10:15:23-07:00,revathykuberan@gmail.com,['users@camel.apache.org'],Re: Multiple from end points traversing to dif...,Subject: Re: Multiple from end points travers...,9,ask a question
0,1,2017-04-14 10:42:39+00:00,lburgazzoli@apache.org,['commits@camel.apache.org'],camel git commit: HeaderSelectorProducer to su...,Subject: camel git commit: HeaderSelectorProd...,1,commit changes
1,2,2017-04-14 10:52:55+00:00,davsclaus@apache.org,['commits@camel.apache.org'],[1/6] camel git commit: Rename catalog to runt...,Subject: [1/6] camel git commit: Rename catal...,2,commit changes
2,3,2017-04-14 10:52:56+00:00,davsclaus@apache.org,['commits@camel.apache.org'],[2/6] camel git commit: Rename catalog to runt...,Subject: [2/6] camel git commit: Rename catal...,2,commit changes
3,4,2017-04-14 10:52:57+00:00,davsclaus@apache.org,['commits@camel.apache.org'],[3/6] camel git commit: Rename catalog to runt...,Subject: [3/6] camel git commit: Rename catal...,2,commit changes
...,...,...,...,...,...,...,...,...
118,239,2017-04-19 14:21:57+02:00,lburgazzoli@gmail.com,['dev@camel.apache.org'],Re: Camel 2.19 Roadmap,Subject: Re: Camel 2.19 Roadmap To: dev@camel...,24,version release planning
244,242,2017-04-19 14:29:00+02:00,claus.ibsen@gmail.com,['users@camel.apache.org'],Re: Spring Boot > 1.5 fails when camel-swagger...,Subject: Re: Spring Boot > 1.5 fails when cam...,64,ask a question
247,247,2017-04-19 14:35:40+02:00,zoran@regvart.com,['users@camel.apache.org'],Re: Spring Boot > 1.5 fails when camel-swagger...,Subject: Re: Spring Boot > 1.5 fails when cam...,64,provide support
249,249,2017-04-19 15:27:36+02:00,claus.ibsen@gmail.com,['users@camel.apache.org'],Re: Spring Boot > 1.5 fails when camel-swagger...,Subject: Re: Spring Boot > 1.5 fails when cam...,64,ask a question
