# A notebook on the Enron Dataset from Kaggle
This notebook was found through the following [link](https://www.kaggle.com/code/conniedeng/nlp-eron-dataset?fbclid=IwAR3k6TfBRz842eBrj3l3pOY9a3qSiO3r1JqhI2UeLCx9slJU4RrQvrt-D0w).

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Extracting each email's body and subject from the Enron dataset into a pandas DataFrame

* new dataset
* separate headers
* https://www.kaggle.com/nagasai524/spam-email-classification-using-word2vec

In [None]:
# Input data files are available under the "/kaggle/input/" directory.
filepath = "/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv"

# Read the data into a pandas dataframe called emails.
# The path is taken from `filepath` so it is defined in exactly one place.
emails = pd.read_csv(filepath)

print("Successfully loaded {} rows and {} columns!".format(emails.shape[0], emails.shape[1]))
emails.head()

I have noticed that these emails are all lowercase. I've looked at some other datasets as well (at least the ones that have spam/ham labels), and they also seem to be lowercase.

I honestly think keeping the original capitalization would be amazing, because I'm sure spam emails include a lot more capital letters, but this dataset does not preserve it.

In [None]:
def get_email_subject(email):
    """Return the subject line of a raw email string.

    The subject is everything before the first CRLF line break, with a
    leading 'Subject: ' prefix removed when present.

    :param email: str - full email text, subject line first
    :return: str - the subject without the 'Subject: ' prefix
    """
    prefix = 'Subject: '
    # partition() returns the whole string as the first part when there is no
    # line break, so a subject-only email is not truncated by one character
    # (str.find would return -1 and the slice would drop the last character).
    subject, _, _ = email.partition('\r\n')
    # Strip the header prefix only; a 'Subject: ' occurring later in the line
    # is part of the subject text and is kept.
    if subject.startswith(prefix):
        subject = subject[len(prefix):]
    return subject

def get_email_body(email):
    """Return the body of a raw email string.

    The body is everything after the first CRLF line break. When the email
    contains no line break (subject line only) the body is the empty string.

    :param email: str - full email text, subject line first
    :return: str - the text following the first line
    """
    # partition() yields an empty tail when the separator is missing; the old
    # find()-based slice computed -1 + 2 and returned email[1:] in that case.
    _, _, body = email.partition('\r\n')
    return body

In [None]:
# cleaning of columns: drop the exported index column and the numeric label
email_df = emails.drop(columns=["Unnamed: 0", "label_num"])

# get the subject and body of each email (the functions are passed directly,
# no wrapping lambda is needed)
email_df["subject"] = email_df["text"].apply(get_email_subject)
email_df["body"] = email_df["text"].apply(get_email_body)

# ridding of the text column (unless we need it)
email_df = email_df.drop(columns=["text"])

# expand default pandas display options to make emails more clearly visible when printed
pd.set_option('display.max_colwidth', 200)

# from here email_df is our dataframe
email_df.head() # you could do print(email_df.head()), but Jupyter displays this nicer for pandas DataFrames

# Text/Data Pre-processing

In [None]:
# hyperparameters
maxtokens = 200 # the maximum number of tokens per document (the tokenizers below truncate to this many)
# NOTE(review): maxtokenlen is not referenced in the cells visible here — confirm it is used elsewhere
maxtokenlen = 100 # the maximum length of each token

**Tokenization** (Maybe we will have multiple tokenization methods; you can put how you want to tokenize down here)

In [None]:
# Tokenization method 1
# this is tokenization split by white space (single space characters only)
def tokenize_1(row):
    """Split ``row`` on single spaces and keep at most ``maxtokens`` tokens.

    :param row: text to tokenize; non-string values are converted with str()
    :return: list of tokens; an empty list for None or the empty string
    """
    # Compare by value, not identity: `row is ''` depends on string interning
    # and raises a SyntaxWarning on Python 3.8+.
    if row is None or (isinstance(row, str) and not row):
        # Return a list, like the other branch, so callers always get the
        # same type back.
        return []
    return str(row).split(" ")[:maxtokens]

In [None]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize

In [None]:
# Tokenization method 2
# split on white space AND punctuation, e.g. $3.88 --> '$', '3', '.', '88'
def tokenize_2(row):
    """Tokenize ``str(row)`` with NLTK's wordpunct_tokenize, capped at ``maxtokens`` tokens."""
    all_tokens = wordpunct_tokenize(str(row))
    return all_tokens[:maxtokens]

**Regular expressions to remove unnecessary characters** (removing \n new lines, symbols?, this could also include links)

In [None]:
import re

# Normalizes line breaks only; this function does not lowercase the text.
def reg_expressions(row):
    """Replace every run of carriage returns / newlines in ``row`` with one space.

    Line breaks are replaced by a space rather than deleted so that the last
    word of one line and the first word of the next stay separate tokens
    ("hello\\r\\nworld" becomes "hello world", not "helloworld").

    :param row: str - text to clean
    :return: str - the text with line breaks turned into single spaces
    """
    return re.sub(r'[\r\n]+', " ", row)

**Stop-word removal** (removing unimportant words)


In [None]:
import nltk
# Load NLTK's English stop-word list.
# NOTE(review): assumes the 'stopwords' corpus is already available in this environment — confirm,
# otherwise it has to be fetched with nltk.download('stopwords') first.
stopwords = nltk.corpus.stopwords.words('english')
# Peek at the first ten entries as a sanity check
print(stopwords[:10])

def stop_word_removal(row):
    """Return the tokens of ``row`` that are not in the module-level ``stopwords`` list.

    :param row: iterable of string tokens
    :return: list of the tokens that are not stop words, in their original order
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a linear scan over the whole stop-word list.
    stopword_set = set(stopwords)
    return [token for token in row if token not in stopword_set]

**Stemming** (removing endings of words, -ing, -ly...)

In [None]:
def stemming(row):
    """Run NLTK's Porter stemmer over every token in ``row`` and return the stems as a list."""
    stem = nltk.stem.porter.PorterStemmer().stem
    return list(map(stem, row))

**Lemmatization** (convert into root word)

In [None]:
def lemmatization(row):
    """Run NLTK's WordNet lemmatizer over every token in ``row`` and return the results as a list."""
    lemmatize = nltk.stem.wordnet.WordNetLemmatizer().lemmatize
    lemmas = []
    for word in row:
        lemmas.append(lemmatize(word))
    return lemmas

**Final utility in preprocessing data connecting all these preprocessing techniques**

In [None]:
def utils_preprocess_text(text, flg_tokenize=1,flg_stemm=False, flg_lemm=True, flg_stopwords=True):
    '''
    Preprocess a single text string.
    :parameter
        :param text: string - the raw text to preprocess
        :param flg_tokenize: int - tokenizer to use: 1 for tokenize_1, 2 for tokenize_2
        :param flg_stemm: bool - whether stemming is to be applied
        :param flg_lemm: bool - whether lemmatisation is to be applied
        :param flg_stopwords: bool - whether stop words are to be removed
    :return
        cleaned text: the processed tokens joined back together with single spaces
    :raise
        ValueError if flg_tokenize is neither 1 nor 2
    '''
    ## clean (character-level clean-up is delegated to reg_expressions)
    text = reg_expressions(text)

    ## Tokenize (convert from string to list)
    if flg_tokenize == 1:
        tokens = tokenize_1(text)
    elif flg_tokenize == 2:
        tokens = tokenize_2(text)
    else:
        # Without a tokenizer the later steps would iterate over the string
        # character by character and silently return garbage, so fail loudly.
        raise ValueError("flg_tokenize must be 1 or 2, got {!r}".format(flg_tokenize))

    ## remove Stopwords
    if flg_stopwords:
        tokens = stop_word_removal(tokens)

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        tokens = stemming(tokens)

    ## Lemmatisation (convert the word into root word); when stemming is also
    ## enabled this runs on the already-stemmed tokens
    if flg_lemm:
        tokens = lemmatization(tokens)

    ## back to string from list
    return " ".join(tokens)

In [None]:
# Preprocess every email body with tokenizer 2, stop-word removal, stemming and lemmatization,
# and store the cleaned string in a new "text_clean" column.
# Note: stemming and lemmatization are both enabled here.
email_df["text_clean"] = email_df["body"].apply(lambda x: utils_preprocess_text(x, flg_tokenize=2, flg_stemm=True, flg_lemm=True, flg_stopwords=True))
email_df

# Getting Training and Test Set

In [None]:
import seaborn as sns
# Bar chart of the number of emails per label in the full dataset, bars ordered spam then ham
sns.countplot(x="label",data=email_df,order=['spam','ham'])

In [None]:
# Exact number of emails per label in the full dataset
email_df["label"].value_counts()

The ratio between spam and ham is **1499:3672** in the complete dataset. We will maintain this ratio between spam and ham for the training and test dataset.

We will also split the dataset 80%:20%, where the training set is 80% and the test set is 20%.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. stratify=email_df["label"] keeps the label proportions
# the same in both splits.
# random_state 0 makes sure that the data split is consistently the same (so the random sampling does not keep changing)
train, test = train_test_split(email_df, test_size=0.20, stratify=email_df["label"], random_state=0)

**Training data set**

In [None]:
# Label distribution of the training split: bar chart plus the exact counts
sns.countplot(x="label",data=train, order=['spam','ham'])
print(train["label"].value_counts())

In [None]:
# Label distribution of the test split: bar chart plus the exact counts
sns.countplot(x="label",data=test, order=['spam','ham'])
print(test["label"].value_counts())

In [None]:
# Alias the splits under more descriptive names. These are plain assignments, not copies:
# email_train_df / email_test_df refer to the same DataFrame objects as train / test.
email_train_df = train
email_test_df = test

In [None]:
# Display the training DataFrame
email_train_df

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words and TF-IDF vectorizers over word tokens, using scikit-learn's built-in English stop-word list
countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')

# Fit the vocabularies on the training text only and build the document-term matrices
count_wm = countvectorizer.fit_transform(email_train_df["text_clean"])
tfidf_wm = tfidfvectorizer.fit_transform(email_train_df["text_clean"])


def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of ``vectorizer`` as a list.

    get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    get_feature_names_out() replaces it. Fall back to the old method only on
    releases that do not have the new one.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


count_tokens = _feature_names(countvectorizer)
tfidf_tokens = _feature_names(tfidfvectorizer)


# Dense DataFrames with one column per vocabulary term.
# NOTE(review): toarray() materializes the full documents x vocabulary matrix in memory.
df_countvect = pd.DataFrame(data = count_wm.toarray(),columns = count_tokens)
df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)
