# Classification with Linear Regression

Experimental notebook for classification of phishing emails using basic Linear Regression

In [29]:
# Install requirements - uncomment the line below the first time you run this notebook
# (%pip installs into the environment of the running kernel)
# %pip install -r ./requirements.txt

In [30]:
import pandas as pd
import numpy as np
from functools import reduce
from tqdm import tqdm
# from .autonotebook import tqdm as notebook_tqdm
import itertools

In [31]:
# Column names passed to every pd.read_csv call that loads a spam corpus below.
spam_col_names = [
    'id',
    'Body',
    'Label',
]

In [32]:
# SpamAssassin corpus; column names come from spam_col_names.
# NOTE(review): with `names=` given and no `header=`, pandas treats the first line
# of the file as data - confirm whether the CSV has a header row.
dfSA = pd.read_csv(
    './kaggle-datasets/Email-Spam-Dataset/completeSpamAssassin.csv',
    names=spam_col_names,
)

In [33]:
# Enron spam subset; column names come from spam_col_names.
# NOTE(review): with `names=` given and no `header=`, pandas treats the first line
# of the file as data - confirm whether the CSV has a header row.
dfEnron = pd.read_csv(
    './kaggle-datasets/Email-Spam-Dataset/enronSpamSubset.csv',
    names=spam_col_names,
)

In [34]:
# Ling-Spam corpus; column names come from spam_col_names.
# NOTE(review): with `names=` given and no `header=`, pandas treats the first line
# of the file as data - confirm whether the CSV has a header row.
dfLing = pd.read_csv(
    './kaggle-datasets/Email-Spam-Dataset/lingSpam.csv',
    names=spam_col_names,
)

In [35]:
# Stack the three corpora row-wise, then discard every row that contains a missing value.
# The original row labels of each frame are kept, so labels may repeat across corpora.
dfs = [dfSA, dfEnron, dfLing]
dfSpam = pd.concat(dfs, axis=0).dropna(how='any')

In [36]:
# Preview the first five rows of the merged frame.
dfSpam.head(n=5)

Unnamed: 0,id,Body,Label
1.0,0.0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
2.0,1.0,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3.0,2.0,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
4.0,3.0,##############################################...,1
5.0,4.0,I thought you might like these:\n1) Slim Down ...,1


# Preprocessing

To avoid any appearance of self-plagiarism: much of the code for the `Dataset` class is adapted from my own Text Processing Sentiment Analysis assignment.

In [37]:
#nltk
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as NLTK_STOP
import string

In [38]:
#download wordnet for lemmatization
#uncomment appropriate line if you get error: "Resource wordnet not found.", "Resource punkt not found.", etc...

# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

In [39]:
#spacy imports
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOP

# Download the spaCy English model (a trained pipeline, not a dataset).
# Uncomment the line below if you get the error "Can't find model 'en_core_web_sm'"
# ! python -m spacy download en_core_web_sm

# Module-level spaCy pipeline; Dataset.preprocess_phrase calls it for the "spacy_lemmatize" option.
nlp = spacy.load('en_core_web_sm')

In [47]:
class Dataset:
    """Preprocess and tokenise the ``Body`` column of an e-mail DataFrame.

    The constructor stores the configuration, builds the NLTK helpers and then
    immediately runs :meth:`process_phrases`, which populates:

    * ``bodies`` - the raw ``Body`` column of ``df``
    * ``preprocessed_phrases`` - one cleaned string per body
    * ``features`` - one token list per body
    * ``pp_df`` - DataFrame with columns ``id``, ``ppBody`` and ``label``

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``Body`` and ``Label``; every ``Body``
        value is expected to be a string.
    preprocessing : list of str, optional
        Any of ``"lower"``, ``"newlines"``, ``"punc"``, ``"stemming"``,
        ``"nltk_lemmatize"``, ``"spacy_lemmatize"``. Defaults to ``["lower"]``.
        An empty list disables preprocessing.
    feature_selection : list of str, optional
        Any of ``"alltokens"``, ``"nltk_tokenize"``, ``"nltk_tokenize_2"``,
        ``"custom_tokenize"``, ``"nltk_stoplist"``, ``"spacy_stoplist"``,
        ``"nltk_pos_tag"``, ``"negation_bigrams"``, ``"intensifier_bigrams"``.
        Defaults to ``["alltokens"]``. An empty list falls back to a plain
        split on spaces.

    Options are always applied in the fixed order listed above, regardless of
    the order in which they are passed.
    """

    # Words that trigger a "<word> <successor>" feature in the bigram steps.
    NEGATION_WORDS = frozenset([
        "neither", "never", "no", "nobody", "none", "noone", "nor", "not",
        "nothing", "nowhere",
    ])
    INTENSIFIER_WORDS = frozenset([
        "absolutely", "completely", "extremely", "highly", "rather", "really",
        "so", "too", "totally", "utterly", "very",
    ])

    def __init__(self, df, preprocessing=None, feature_selection=None):
        # Read the configuration. None stands for the documented defaults, which
        # avoids sharing one mutable default list between instances.
        self.preprocessing = self._normalise_options(preprocessing, ["lower"])
        self.feature_selection = self._normalise_options(feature_selection, ["alltokens"])
        self.data = df

        # === define processors and regexes for the preprocessing/feature selection methods ===

        # NLTK stemming engine
        self.porter = PorterStemmer()

        # NLTK lemmatizing engine
        self.wn_lt = WordNetLemmatizer()

        # tokenizers

        # words regex - matches runs of word characters, so punctuation is dropped
        self.word_tokenizer = RegexpTokenizer(r'\w+')

        # tokenizer based on the NLTK-provided regex from Labs
        nltk_pat = r'''(?x) # set flag to allow verbose regexps
            (?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
            | \w+(?:-\w+)* # words with optional internal hyphens
            | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
            | \.\.\. # ellipsis
            | [][.,;"'?():_`-]
            | [>]?[:;][\']?[\(\)\[\]]+ # these are separate tokens; includes ], [
            '''
        self.nltk_tokenizer = RegexpTokenizer(nltk_pat)

        # tokenizer based on a cut-down version of the regex above
        custom_pat = r'''(?x)
                \w+(?:-\w+)*
                |\$?\d+(?:\.\d+)?%?
                |\.\.\.'''
        self.custom_tokenizer = RegexpTokenizer(custom_pat)

        # stop words - materialised once as sets so membership tests are cheap
        self.NLTK_ENGLISH_STOP = set(NLTK_STOP.words('english'))
        self.SPACY_ENGLISH_STOP = set(SPACY_STOP)

        # === apply preprocessing and feature selection ===

        self.process_phrases()

    @staticmethod
    def _normalise_options(options, default):
        """Return ``options`` as a fresh list; ``None`` selects a copy of ``default``."""
        if options is None:
            return list(default)
        if isinstance(options, str):
            # A bare string would otherwise be exploded into characters by list().
            return [options]
        return list(options)

    @staticmethod
    def _as_tokens(phrase):
        """Return ``phrase`` unchanged if it is already a token list, else split it on spaces."""
        if isinstance(phrase, list):
            return phrase
        return phrase.split(" ")

    @staticmethod
    def _tokenize_with(tokenizer, phrase):
        """Tokenise a string, or tokenise every element of a list and flatten the result."""
        if isinstance(phrase, list):
            return list(itertools.chain.from_iterable(
                tokenizer.tokenize(partial_phrase) for partial_phrase in phrase))
        return tokenizer.tokenize(phrase)

    @classmethod
    def _remove_stop_words(cls, phrase, stop_words):
        """Return the tokens of ``phrase`` that are not in ``stop_words``."""
        return [word for word in cls._as_tokens(phrase) if word not in stop_words]

    @classmethod
    def _attach_successors(cls, phrase, trigger_words):
        """Replace each trigger word with ``"<trigger> <next token>"``.

        The successor is also kept as its own token, and a trigger word in the
        final position is kept unchanged. Elements of an existing list are
        treated as whole tokens, so bigrams created by an earlier pass are not
        split apart again.
        """
        tokens = cls._as_tokens(phrase)
        last_index = len(tokens) - 1
        features = []
        for index, token in enumerate(tokens):
            if index != last_index and token in trigger_words:
                features.append(token + " " + tokens[index + 1])
            else:
                features.append(token)
        return features

    def preprocess_phrase(self, phrase):
        """Clean a single body string according to ``self.preprocessing``.

        Any number of options may be combined - however, some may not combine
        well. Returns the processed string; with an empty configuration the
        input is returned untouched.
        """
        if not self.preprocessing:
            # no preprocessing
            return phrase
        if "lower" in self.preprocessing:
            # lowercase
            phrase = phrase.lower()
        if "newlines" in self.preprocessing:
            # join the non-empty lines with single spaces
            phrase = " ".join(filter(None, phrase.split("\n")))
        if "punc" in self.preprocessing:
            # remove punctuation
            phrase = phrase.translate(str.maketrans('', '', string.punctuation))
        if "stemming" in self.preprocessing:
            # The stemmer works on single words, so stem word by word instead of
            # handing it the whole phrase as if it were one word.
            phrase = " ".join(self.porter.stem(word) for word in phrase.split(" "))
        if "nltk_lemmatize" in self.preprocessing:
            # NLTK lemmatization; lemmas are re-joined with spaces so that word
            # boundaries survive for the tokenisation step.
            phrase = " ".join(self.wn_lt.lemmatize(word) for word in nltk.word_tokenize(phrase))
        if "spacy_lemmatize" in self.preprocessing:
            # spaCy lemmatization; joining avoids a trailing space, which would
            # become an empty-string feature after splitting on spaces.
            phrase = " ".join(token.lemma_ for token in nlp(phrase))

        return phrase

    def extract_features_from_phrase(self, phrase):
        """Turn one preprocessed body string into its list of features.

        Each selected step receives the output of the previous one. Whether a
        step is looking at a raw string or at an existing token list is decided
        from the value itself, so every combination of options is handled the
        same way. Returns a list of strings, or a list of ``(token, tag)``
        tuples when ``"nltk_pos_tag"`` is selected.
        """
        if not self.feature_selection:
            # we have to do some feature selection, absolute minimum is alltokens
            return phrase.split(" ")
        if "alltokens" in self.feature_selection:
            # split based on single spaces
            phrase = phrase.split(" ")
        if "nltk_tokenize" in self.feature_selection:
            # tokenize using the NLTK tokenizer with the words regex
            phrase = self._tokenize_with(self.word_tokenizer, phrase)
        if "nltk_tokenize_2" in self.feature_selection:
            # tokenize using NLTK's regex
            phrase = self._tokenize_with(self.nltk_tokenizer, phrase)
        if "custom_tokenize" in self.feature_selection:
            # tokenize with the custom regex
            phrase = self._tokenize_with(self.custom_tokenizer, phrase)
        if "nltk_stoplist" in self.feature_selection:
            # drop NLTK stop words
            phrase = self._remove_stop_words(phrase, self.NLTK_ENGLISH_STOP)
        if "spacy_stoplist" in self.feature_selection:
            # drop spaCy stop words
            phrase = self._remove_stop_words(phrase, self.SPACY_ENGLISH_STOP)
        if "nltk_pos_tag" in self.feature_selection:
            # POS tagging needs a token list, so split on spaces if not already done
            phrase = nltk.pos_tag(self._as_tokens(phrase))
        if "negation_bigrams" in self.feature_selection:
            # create bigrams from negation words + their successors
            # (tokens that are not plain trigger strings, e.g. POS tuples, pass through)
            phrase = self._attach_successors(phrase, self.NEGATION_WORDS)
        if "intensifier_bigrams" in self.feature_selection:
            # create bigrams from intensifier words + their successors
            phrase = self._attach_successors(phrase, self.INTENSIFIER_WORDS)

        return phrase

    def process_phrases(self):
        """Preprocess and tokenise every body, then assemble ``self.pp_df``."""

        self.bodies = self.data['Body']

        # apply preprocessing, then feature extraction, to every body
        self.preprocessed_phrases = [self.preprocess_phrase(phrase) for phrase in self.bodies]
        self.features = [self.extract_features_from_phrase(phrase) for phrase in self.preprocessed_phrases]

        data = {'id': self.data['id'], 'ppBody': self.features, 'label': self.data['Label']}
        self.pp_df = pd.DataFrame(data=data)

In [48]:
# Build the processed dataset: lowercase each body and join its lines with spaces,
# then preview the first five processed rows.
d = Dataset(
    dfSpam,
    preprocessing=["lower", "newlines"],
)
d.pp_df.head(n=5)

Unnamed: 0,id,ppBody,label
1.0,0.0,"[save, up, to, 70%, on, life, insurance., why,...",1
2.0,1.0,"[1), fight, the, risk, of, cancer!, http://www...",1
3.0,2.0,"[1), fight, the, risk, of, cancer!, http://www...",1
4.0,3.0,[#############################################...,1
5.0,4.0,"[i, thought, you, might, like, these:, 1), sli...",1


# Comments



# Todo

- [x] Remove NaN id from merged `dfSpam`
- [ ] Tokenise body data