# Imports


In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk
from nltk.corpus import stopwords
import re
import string

# Reading data


In [3]:
# Relative path to the raw mail dataset (resolved against the current working directory).
DATA_PATH = "../data/emails.csv"
# Load the whole CSV into a pandas DataFrame for the cells below.
data = pd.read_csv(DATA_PATH)

In [4]:
# Bare expression at the end of the cell so the notebook renders the loaded frame.
data

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


# Preprocessing

Suggested preprocessing steps:

*   For text column:
    -   Remove non-alphabetic characters (digits and punctuation)
    -   Separate the subject part from the mail body
    -   Check whether there is a "re :" in the subject (denotes that the mail is a reply to a previous mail)
    -   Convert all text to lowercase
    -   Lemmatize
    -   Remove stopwords and non-ASCII (Unicode) characters


In [5]:
def startstrip(text: str, subtext: str):
    """Drop one leading occurrence of ``subtext`` from ``text``.

    Only an exact prefix match is removed, and only once; when ``text`` does
    not begin with ``subtext`` it is returned unchanged.
    """
    has_prefix = text.startswith(subtext)
    return text[len(subtext) :] if has_prefix else text


# Lazily-populated cache so the stopword corpus is read once, not once per mail.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text: str):
    """Normalise one raw mail and report whether it is a reply.

    Steps, in order: lowercase the text, drop the leading "subject:" marker,
    detect and drop a leading "re :" marker, remove digits, ASCII punctuation
    and non-ASCII characters, then remove English stopwords. Lemmatisation is
    not performed here.

    Args:
        text: The raw mail text, expected to start with "Subject: ".

    Returns:
        A ``(cleaned_text, is_reply)`` tuple where ``cleaned_text`` is the
        remaining words joined by single spaces and ``is_reply`` is 1 when
        the subject started with "re :" and 0 otherwise.
    """
    # Lowercase first so the prefix checks below are case-insensitive.
    text = text.lower()

    # Remove only the literal "subject:" prefix, then surrounding whitespace.
    # NOTE: str.lstrip(chars) treats its argument as a *set of characters*,
    # so lstrip("subject: ") would also eat the first letters of the real
    # subject (e.g. "best deal" -> "deal") and could fabricate a leading
    # "re :". Plain lstrip() strips whitespace only.
    text = startstrip(text, "subject:").lstrip()

    # A subject that begins with "re :" marks a reply to an earlier mail.
    is_reply = 1 if text.startswith("re :") else 0
    # Strip the same marker that was tested above, with or without a trailing space.
    text = startstrip(text, "re :").lstrip()

    # Drop digits, ASCII punctuation and any non-ASCII characters.
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"[^\x00-\x7F]+", "", text)

    # Set membership keeps the per-word stopword test O(1).
    stop_words = _english_stopwords()
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text, is_reply

In [6]:
# Clean every mail; each result is a (cleaned text, reply flag) pair.
cleaned_rows = [clean_text(mail) for mail in data["text"]]
cleaned_data = pd.DataFrame(cleaned_rows, columns=["text", "is_reply"])
# Carry the original spam label over next to the cleaned features.
cleaned_data["spam"] = data["spam"]

## Saving the preprocessed dataset


In [8]:
# Write the cleaned frame to disk; index=False keeps the row index out of the CSV.
cleaned_data.to_csv("../data/cleaned-data.csv", index=False)