# Table of Contents
### 1. Loading the Dataset
### 2. Pre-processing the Dataset
### 3. Feature Engineering and Model Building
> ##### a. Creating Meta Features
> ##### b. Counting Nouns and Verbs
> ##### c. Model Building for Meta Features
> ##### d. Tf-Idf Features
> ##### e. Model Building for Complete Feature Set

In [None]:
import pandas as pd
import numpy as np
import string

In [None]:
# load dataset
data = pd.read_csv('spamdata.csv')
data.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# two classes are present: spam and ham
print(data['label'].unique())

['ham' 'spam']


In [None]:
print(data['label'].value_counts())
print(data['label'].value_counts(normalize=True))

ham     4825
spam     747
Name: label, dtype: int64
ham     0.865937
spam    0.134063
Name: label, dtype: float64


## 2. Pre-Processing the dataset

In [None]:
# sample text
# ham
print(data['text'][0])
print('\n')
# spam
print(data['text'][2])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


In [None]:
# sample document 
cleaned = data['text'][0].lower()
cleaned

'go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...'

In [None]:
# pre-initialize list of punctuations
punctuations = string.punctuation
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# remove punctuation from sample text
cleaned = "".join(character for character in cleaned if character not in punctuations)
cleaned

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [None]:
# import spacy
from spacy.lang.en import English
# Create a blank English pipeline: tokenizer and language data (e.g. stop words) only, no tagger, parser or NER
nlp = English()

# spacy document
my_doc = nlp(cleaned)

In [None]:
# collect the text of every token in the spaCy document
token_list = [tok.text for tok in my_doc]
token_list

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [None]:
# import stop words
from spacy.lang.en.stop_words import STOP_WORDS

# keep only the tokens whose vocabulary entry is not flagged as a stop word
filtered_sentence = [word for word in token_list if not nlp.vocab[word].is_stop]

filtered_sentence
# stop word is removed 

['jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [None]:
# joining the tokenised words in sample documents
cleaned = filtered_sentence
cleaned = " ".join(cleaned)
cleaned

'jurong point crazy available bugis n great world la e buffet cine got amore wat'

### Converting the steps above into a function

In [None]:
# preprocessing function
def clean_text(text):
    """Lower-case ``text``, strip punctuation and drop stop words.

    The text is lower-cased, every character found in
    ``string.punctuation`` is deleted, the result is tokenised with the
    module-level spaCy pipeline ``nlp``, and tokens whose vocabulary entry
    is flagged as a stop word are discarded.

    Args:
        text: The raw document as a string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Delete all punctuation in one pass; str.translate avoids testing each
    # character against the punctuation string in a Python-level loop.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)

    # Tokenise, then keep only the tokens that are not stop words.
    kept_tokens = [
        token.text
        for token in nlp(cleaned)
        if not nlp.vocab[token.text].is_stop
    ]

    return " ".join(kept_tokens)

In [None]:
# applying the preprocessing function
data['cleaned'] = data['text'].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


## 3. Feature Engineering and Model Building.

### a. Creating Meta Features

> 1. Number of words in original text
> 2. Number of words in cleaned text
> 3. Number of characters including spaces in the cleaned text
> 4. Number of characters excluding spaces in the cleaned text
> 5. Number of all-digit tokens (numbers) in the cleaned text

In [None]:
# creating meta features

# whitespace-separated word count of the original message
data['word_count'] = data['text'].apply(lambda doc: len(doc.split()))
# whitespace-separated word count of the cleaned message
data['word_count_cleaned'] = data['cleaned'].apply(lambda doc: len(doc.split()))
# length of the cleaned message, spaces included
data["char_count"] = data["cleaned"].apply(len)
# length of the cleaned message once every space is removed
data["char_count_without_spaces"] = data["cleaned"].apply(lambda doc: len(doc.replace(" ", "")))

# number of whitespace-separated tokens in the cleaned message that consist only of digits
data["num_dig"] = data["cleaned"].apply(lambda doc: sum(1 for word in doc.split() if word.isdigit()))


In [None]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_dig
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


### b. Counting Nouns and Verbs

In [None]:
# import spacy
import spacy
nlp = spacy.load('en_core_web_sm')
document = nlp(data['cleaned'][0])

document

jurong point crazy available bugis n great world la e buffet cine got amore wat

In [None]:
# fine-grained POS tag (token.tag_) of every token in the document
all_tags = [tok.tag_ for tok in document]
all_tags

['NNP',
 'NNP',
 'NNP',
 'JJ',
 'NNP',
 'CC',
 'JJ',
 'NNP',
 'NNP',
 'NNP',
 'NNP',
 'NNP',
 'VBD',
 'RB',
 'NN']

In [None]:
# POS tags (as compared against token.tag_) grouped into the two families
# that are counted as features.
# NOTE(review): 'VBP' is not listed under 'verb' -- confirm whether that
# omission is intentional.
pos_dic = {
    "noun": ['NNP', 'NN', 'NNS', 'NNPS'],
    'verb': ["VBZ", "VB", "VBD", "VBG", "VBN"],
}

In [None]:
# Sample document: how many of its tags belong to the noun family
count = sum(1 for tag in all_tags if tag in pos_dic['noun'])


In [None]:
count

10

In [None]:
# Function for noun and verb counts
def pos_check(txt, family):
    """Count the tokens of ``txt`` whose POS tag belongs to ``family``.

    ``txt`` is parsed with the module-level spaCy pipeline ``nlp`` and each
    token's ``tag_`` is checked against the tag list stored under
    ``pos_dic[family]``.

    Args:
        txt: The text to analyse.
        family: Key into ``pos_dic`` selecting the tag list to match.

    Returns:
        The number of tokens whose tag is in the selected list.
    """
    # The family lookup stays inside the condition so it is only evaluated
    # when there is at least one token to test.
    return sum(1 for token in nlp(txt) if token.tag_ in pos_dic[family])

In [None]:
# Sample document: Verb count
pos_check("They are playing in the ground", "verb")

1

In [None]:
# Sample document: Noun count
pos_check("They are playing in the ground", "noun")

1

In [None]:
# Applying the function
data["noun_count"] = data["cleaned"].apply(lambda x : pos_check(x, "noun"))
data["verb_count"] = data["cleaned"].apply(lambda x : pos_check(x, "verb"))


In [None]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleaned,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0,10,1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0,5,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3,12,1
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0,5,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0,1,2


### c. Model Building for Meta Features

In [None]:
# label encoding target variable
from sklearn.preprocessing import LabelEncoder

target = data['label'].values
target = LabelEncoder().fit_transform(target)


In [None]:
target

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
# List of features
train = data[['word_count', 'word_count_cleaned', 'char_count', 
              'char_count_without_spaces', 'num_dig', 'noun_count', 
              'verb_count']]

In [None]:
# Train- Validation Split
from sklearn.model_selection import train_test_split as tts
x_train, x_valid, y_train, y_valid = tts(train, target, random_state=20, stratify=target)


In [None]:
# Shapes of the train and validation datasets
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((4179, 7), (4179,)), ((1393, 7), (1393,)))

In [None]:
# naive bayes
from sklearn import naive_bayes

In [None]:
# multinomial naive bayes
model = naive_bayes.MultinomialNB()

In [None]:
# Fit model on training data
model.fit(x_train,y_train)

MultinomialNB()

In [None]:
# prediction on training data
pred_train = model.predict(x_train)

pred_valid = model.predict(x_valid)


In [None]:
# Accuracy
from sklearn.metrics import accuracy_score

In [None]:
# training accuracy
accuracy_score(y_train,pred_train)

0.9420914094280929

In [None]:
# validation accuracy
accuracy_score(y_valid,pred_valid)

0.9375448671931084

Hence the accuracy score on the training data is about 94%.

And the accuracy score on the held-out validation data is also about 94% (0.9375).

Let's see if we can improve it by adding Tf-Idf features.

In [None]:
# Import Tf-Idf Vectoriser
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Tf-idf Vectoriser, limited to 500 features via max_features
word_tfidf = TfidfVectorizer(max_features=500)

# NOTE(review): the vectoriser is fitted on every row of data['cleaned'],
# and the train/validation split only happens afterwards, so the fitted
# vocabulary and IDF weights also see the validation messages. Consider
# fitting on the training rows only.
word_tfidf.fit(data['cleaned'].values)

TfidfVectorizer(max_features=500)

In [None]:
# Transform
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)
word_vectors_tfidf

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 21920 stored elements in Compressed Sparse Row format>

In [None]:
# Combining meta features and Tf-Idf features
from scipy.sparse import hstack, csr_matrix


In [None]:
# Names of the hand-crafted (meta) feature columns
meta_features = [
    'word_count',
    'word_count_cleaned',
    'char_count',
    'char_count_without_spaces',
    'num_dig',
    'noun_count',
    'verb_count',
]
  
# Meta features
feature_set1 = data[meta_features]

In [None]:
# combined features
train = hstack([word_vectors_tfidf, csr_matrix(feature_set1)],"csr")

In [None]:
# Train and Validation datasets
x_train, x_valid, y_train, y_valid = tts(train, target, random_state=20, stratify=target)


In [None]:
# Multinomial Naive Bayes Model
model = naive_bayes.MultinomialNB()

In [None]:
# Fit model
model.fit(x_train, y_train)

MultinomialNB()

In [None]:
# Prediction on Training data
pred_train = model.predict(x_train)
# Prediction on Validation data
pred_valid = model.predict(x_valid)

In [None]:
# Training accuracy
accuracy_score(y_train, pred_train)

0.9676956209619526

In [None]:
# Validation accuracy
accuracy_score(y_valid, pred_valid)

0.9612347451543432