In [1]:
import pandas as pd
import pickle

from nltk.corpus import stopwords
import re
import spacy
import gensim
from gensim.utils import simple_preprocess

# Start from NLTK's English stopword list and extend it with extra words to
# ignore in this corpus: mail boilerplate ("from", "subject", "re"), address and
# URL fragments ("edu", "com", "http") and generic verbs/adverbs.
# Kept as a list (not a set) so later cells that expect a list keep working.
stop_words = stopwords.words("english")
stop_words.extend(
    [
        "from",
        "subject",
        "re",
        "edu",
        "use",
        "not",
        "would",
        "say",
        "could",
        "_",
        "be",
        "know",
        "good",
        "go",
        "get",
        "do",
        "done",
        "try",
        "many",
        "some",
        "nice",
        "thank",
        "think",
        "see",
        "rather",
        "easy",
        "easily",
        "lot",
        "lack",
        "make",
        "want",
        "seem",
        "run",
        "need",
        "even",
        "right",
        "line",
        "also",
        "may",
        "take",
        "come",
        "com",
        "http",
        "mail",
        "pm",
    ]
)


### Load data


In [10]:
# Load the raw mail export; later cells read its `date` and `body` columns.
df = pd.read_csv(r"../data/raw_mail_all.csv")


### Cleaning dataset


Parse the `date` column into datetime values.


In [11]:
# Parse the `date` column into pandas datetimes. `infer_datetime_format` is
# intentionally not passed: it is deprecated since pandas 2.0 (consistent
# format inference is the default there) and only triggers a warning.
df["date"] = pd.to_datetime(df["date"])


Tokenize each mail body into a list of words (bag of words).


In [12]:
def sent_to_words(sentences):
    """Lazily turn raw texts into token lists.

    For every item, e-mail addresses are stripped, each run of whitespace
    (newlines included) is collapsed to a single space and single quotes are
    removed; the result is tokenised with ``gensim.utils.simple_preprocess``
    using ``deacc=True``.

    Parameters
    ----------
    sentences : iterable
        Raw texts, one per document. Missing values (``None`` / NaN, e.g. the
        NaN pandas uses for an empty CSV field) are treated as empty text;
        any other non-string value is converted with ``str``.

    Yields
    ------
    list of str
        The tokens of one document. Exactly one list is yielded per input
        item, so the output stays aligned with the input order.
    """
    for sent in sentences:
        if not isinstance(sent, str):
            # re.sub only accepts strings; without this guard a single
            # missing body would abort the whole run with a TypeError.
            sent = "" if pd.isna(sent) else str(sent)
        # Raw strings keep the regex escapes (\S, \s) away from Python's own
        # string-escape processing, which warns on unknown escapes.
        sent = re.sub(r"\S*@\S*\s?", "", sent)  # remove emails
        sent = re.sub(r"\s+", " ", sent)  # collapse whitespace incl. newlines
        sent = re.sub(r"'", "", sent)  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)


# Mail bodies as a plain Python list, then one token list per body
# (the generator is consumed eagerly so the result can be reused).
data = df["body"].tolist()
data_words = [tokens for tokens in sent_to_words(data)]


In [15]:
# Persist the raw mail bodies. The context manager closes the file handle
# deterministically instead of leaving it to garbage collection.
with open("../data/data.pickle", "wb") as fh:
    pickle.dump(data, fh)

In [13]:
# Phrase models over the tokenised mails. The bigram model is fitted on the
# plain tokens; the trigram model is fitted on the bigram-transformed corpus.
# The *_mod wrappers are the objects process_words applies to each document.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Needs the spaCy model loaded below; install it once from a terminal:
#   python3 -m spacy download en_core_web_sm
def process_words(
    texts, stop_words=stop_words, allowed_postags=("NOUN", "ADJ", "VERB")
):
    """Remove stopwords, merge bigrams/trigrams, lemmatize and filter by POS.

    Parameters
    ----------
    texts : iterable
        Documents to clean. Each one goes through ``str`` and
        ``simple_preprocess``, so raw strings and token lists are both
        accepted.
    stop_words : iterable of str
        Words to drop, both before phrase merging and again after
        lemmatization. Defaults to the notebook-level ``stop_words`` list.
    allowed_postags : collection of str
        Values of spaCy's ``token.pos_`` to keep; tokens with any other tag
        are discarded.

    Returns
    -------
    list of list of str
        One cleaned token list per input document, in input order.

    Notes
    -----
    Relies on the notebook-level ``bigram_mod`` and ``trigram_mod`` objects
    and loads the ``en_core_web_sm`` spaCy model on every call.
    """
    # Build the lookup once: set membership is O(1), whereas scanning the
    # stopword list for every token costs O(len(stop_words)).
    stop_set = frozenset(stop_words)

    def drop_stopwords(doc):
        """Re-tokenise ``doc`` and keep only the non-stopword tokens."""
        return [
            word for word in simple_preprocess(str(doc)) if word not in stop_set
        ]

    texts = [drop_stopwords(doc) for doc in texts]
    # NOTE(review): bigram_mod is applied here and again inside the trigram
    # step below; kept as in the original so results do not change — confirm
    # whether the second application is intended.
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Parser and NER are disabled because only lemma and POS are read below.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = [
        [
            token.lemma_
            for token in nlp(" ".join(sent))
            if token.pos_ in allowed_postags
        ]
        for sent in texts
    ]
    # Remove stopwords once more after lemmatization.
    # NOTE(review): this pass re-tokenises with simple_preprocess, which
    # presumably applies its own token-length limits — verify that long
    # merged phrases survive it.
    return [drop_stopwords(doc) for doc in texts_out]


# Clean every tokenised mail using the default stopwords and POS filter.
data_ready = process_words(data_words)


In [14]:
# Persist the cleaned token lists. The context manager closes the file handle
# deterministically instead of leaving it to garbage collection.
with open("../data/clean_words.pickle", "wb") as fh:
    pickle.dump(data_ready, fh)
