# Classification with Linear Regression

Experimental notebook for classification of phishing emails using basic Linear Regression

In [54]:
# Install requirements - uncomment this line the first time you run this notebook
#!pip install -r ./requirements.txt

In [46]:
import pandas as pd
import numpy as np
from functools import reduce

In [47]:
# Column labels passed as `names=` to the read_csv calls in this notebook
spam_col_names = [
    'id',
    'Body',
    'Label',
]

In [48]:
# SpamAssassin corpus, with columns labelled by spam_col_names.
# NOTE(review): with names= given and no header=, pandas reads the file's first
# line as a data row -- confirm whether this CSV starts with a header line.
dfSA = pd.read_csv(
    filepath_or_buffer='./kaggle-datasets/Email-Spam-Dataset/completeSpamAssassin.csv',
    names=spam_col_names,
)

In [49]:
# Enron subset, with columns labelled by spam_col_names.
# NOTE(review): with names= given and no header=, pandas reads the file's first
# line as a data row -- confirm whether this CSV starts with a header line.
dfEnron = pd.read_csv(
    filepath_or_buffer='./kaggle-datasets/Email-Spam-Dataset/enronSpamSubset.csv',
    names=spam_col_names,
)

In [50]:
# Ling-Spam corpus, with columns labelled by spam_col_names.
# NOTE(review): with names= given and no header=, pandas reads the file's first
# line as a data row -- confirm whether this CSV starts with a header line.
dfLing = pd.read_csv(
    filepath_or_buffer='./kaggle-datasets/Email-Spam-Dataset/lingSpam.csv',
    names=spam_col_names,
)

In [51]:
# Merge the three corpora into a single frame.
dfs = [dfSA, dfEnron, dfLing]
# ignore_index=True assigns fresh, unique row labels to the merged frame. Without
# it every source frame keeps its own labels, so labels can repeat across sources
# and a label-based lookup such as dfSpam.loc[i] may return several rows.
# dropna(axis=0) then removes every row that has a missing value in any column.
dfSpam = pd.concat(dfs, ignore_index=True).dropna(axis=0)

In [52]:
# Preview the first five rows of the merged frame
dfSpam.head(n=5)

Unnamed: 0,id,Body,Label
1.0,0.0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
2.0,1.0,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3.0,2.0,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
4.0,3.0,##############################################...,1
5.0,4.0,I thought you might like these:\n1) Slim Down ...,1


# Preprocessing

To avoid self-plagiarism: much of the code for the `Dataset` class is adapted from my Text Processing Sentiment Analysis assignment.

In [53]:
#nltk
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as NLTK_STOP
import string

In [56]:
#download the NLTK resources used by this notebook (e.g. wordnet for lemmatization)
#uncomment the appropriate line if you get an error such as "Resource wordnet not found." or "Resource punkt not found."

# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

In [57]:
#spacy imports
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOP

#the spacy English pipeline has to be installed before it can be loaded
#uncomment the line below if you get error "Can't find model 'en_core_web_sm'"
#! spacy download en_core_web_sm

#shared spacy pipeline used for lemmatization
nlp = spacy.load(name='en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
class Dataset:
    """Container for a text dataset plus the NLP helpers used to process it.

    On construction it records the preprocessing / feature-selection
    configuration, builds the stemmer, lemmatizer, tokenizers and stop-word set,
    and finally calls ``self.process_phrases()``.

    NOTE(review): ``process_phrases`` is not defined in this class body; it has
    to be provided elsewhere before an instance is created -- confirm.
    """

    def __init__(self, df, preprocessing=None, feature_selection=None):
        """Store the data and configuration, build the helpers, run processing.

        Args:
            df: the data to process; kept on the instance as ``self.df``.
            preprocessing: collection of preprocessing option names.
                Defaults to ``["lower"]``.
            feature_selection: collection of feature-selection option names.
                Defaults to ``["alltokens"]``.
        """
        # keep a handle on the data so that processing methods can reach it
        # (previously the argument was accepted and then discarded)
        self.df = df

        #read preprocessing and feature_selection configuration
        # None sentinels give each instance its own default list; list literals
        # in the signature would be shared (and mutated) across all instances
        self.preprocessing = ["lower"] if preprocessing is None else preprocessing
        self.feature_selection = (
            ["alltokens"] if feature_selection is None else feature_selection
        )

        # === define various processors and regexes for various preprocessing/feature selection methods ===

        # NLTK Stemming Engine
        self.porter = PorterStemmer()

        #NLTK Lemmatizing Engine
        self.wn_lt = WordNetLemmatizer()

        #tokenizers

        #words regex - matches runs of word characters, so punctuation is excluded
        self.word_tokenizer = RegexpTokenizer(r'\w+')

        #create tokenizer based on NLTK-provided regex from Labs
        nltk_pat = r'''(?x) # set flag to allow verbose regexps
            (?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
            | \w+(?:-\w+)* # words with optional internal hyphens
            | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
            | \.\.\. # ellipsis
            | [][.,;"'?():_`-]
            | [>]?[:;][\']?[\(\)\[\]]+ # these are separate tokens; includes ], [
            '''
        self.nltk_tokenizer = RegexpTokenizer(nltk_pat)

        #create tokenizer based on custom regex based on above, with less features
        custom_pat = r'''(?x)
                \w+(?:-\w+)*
                |\$?\d+(?:\.\d+)?%?
                |\.\.\.'''
        self.custom_tokenizer = RegexpTokenizer(custom_pat)

        # stop words - SPACY_STOP defined above in imports

        self.NLTK_ENGLISH_STOP = set(NLTK_STOP.words('english'))

        # === apply preprocessing and feature selection ===

        self.process_phrases()

In [None]:
def preprocess_phrase(self, phrase):
    """Apply the configured preprocessing steps to one phrase.

    The steps are chosen by the option names found in ``self.preprocessing`` and
    always run in this order: "lower", "punc", "stemming", "nltk_lemmatize",
    "spacy_lemmatize". Any number of them can be enabled - however, some may not
    combine well.

    Args:
        self: object providing ``preprocessing`` (collection of option names),
            ``porter`` (stemmer exposing ``stem(word)``) and ``wn_lt``
            (lemmatizer exposing ``lemmatize(word)``).
        phrase: the text to process.

    Returns:
        The processed phrase. The stemming and lemmatization steps rebuild the
        phrase from individual words joined by single spaces.
    """
    options = self.preprocessing

    if not options:
        #no preprocessing
        return phrase
    if "lower" in options:
        #lowercase
        phrase = phrase.lower()
    if "punc" in options:
        #remove punctuation
        phrase = phrase.translate(str.maketrans('', '', string.punctuation))
    if "stemming" in options:
        #use NLTK stemming - the stemmer works on a single word, so stem each
        #whitespace-separated word rather than the phrase as a whole
        phrase = " ".join(self.porter.stem(word) for word in phrase.split())
    if "nltk_lemmatize" in options:
        #use NLTK's lemmatization method - join with spaces so that the
        #lemmatized words are not fused into one long token
        phrase = " ".join(
            self.wn_lt.lemmatize(word) for word in nltk.word_tokenize(phrase)
        )
    if "spacy_lemmatize" in options:
        #use spacy's lemmatization method via the module-level `nlp` pipeline
        phrase = " ".join(token.lemma_ for token in nlp(phrase))

    return phrase

# Comments



# Todo

- [x] Remove NaN id from merged `dfSpam`
- [ ] Tokenise body data