In [71]:
import pandas as pd
import string

In [72]:
data = pd.read_csv(r"C:\Users\sejal\OneDrive\Desktop\spamdata.csv")
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
# distribution of class
data['label'].value_counts(normalize=True)

label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64

In [6]:
#preprocessing the datasets

In [7]:
# Sample document: lowercase
cleaned = data['text'][0].lower()

In [8]:
cleaned

'go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...'

In [9]:
punctuations = string.punctuation #preinitialised list of punctuations

In [10]:
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
cleaned = "".join(character for character in cleaned if character not in punctuations)    

In [12]:
cleaned

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [13]:
# Import spacy
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

In [14]:
# spacy document
my_doc = nlp(cleaned)

In [15]:
# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)

In [16]:
# Import stop words
from spacy.lang.en.stop_words import STOP_WORDS

In [17]:
# Create list of word tokens after removing stopwords
filtered_sentence =[] 

# Iterate over the tokens
for word in token_list:
    # Get token text
    lexeme = nlp.vocab[word]
    # Check if stopword or not
    if lexeme.is_stop == False:
        filtered_sentence.append(word) 

# Print tokens and filtered sentence
print(token_list)
print(filtered_sentence)
cleaned = filtered_sentence

['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


In [18]:
cleaned = " ".join(cleaned)
cleaned

'jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [19]:
def clean_text(text):
    ## lower case 
    cleaned = text.lower()
    
    ## remove punctuations
    punctuations = string.punctuation
    cleaned = "".join(character for character in cleaned if character not in punctuations)
    
    ## remove stopwords 
    my_doc = nlp(cleaned)
    token_list = []
    for token in my_doc:
        token_list.append(token.text)
    
    ## remove stop words
    filtered_sentence =[] 

    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word)
    
    ## Store cleaned document
    cleaned = filtered_sentence
    cleaned = " ".join(cleaned)
    
    return cleaned

In [20]:
# Applying the preprocessin function
data["cleaned"] = data["text"].apply(lambda x : clean_text(x))
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


In [22]:
#feature engineering work rn

In [23]:
# Creating meta features

# Number of words in original text
data["word_count"] = data["text"].apply(lambda x : len(x.split()))
# Number of words in cleaned text
data["word_count_cleand"] = data["cleaned"].apply(lambda x : len(x.split()))

# Number of characters including spaces in the cleaned text
data["char_count"] = data["cleaned"].apply(lambda x : len(x))
# number of characters excluding spaces in the cleaned text
data["char_count_without_spaces"] = data["cleaned"].apply(lambda x : len(x.replace(" ","")))

# Number of digits in the cleaned text
data["num_dig"] = data["cleaned"].apply(lambda x :  sum([1 if w.isdigit() else 0 for w in x.split()]))

In [24]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


In [25]:
#counting noun and verb.

In [26]:
# Import spacy English language model
import spacy
nlp = spacy.load("en_core_web_sm")

In [27]:
# spacy document
document = nlp(data['cleaned'][0])

In [28]:
document

jurong point crazy available bugis n great world la e buffet cine got amore wat

In [29]:
# POS tags
all_tags = []
for w in document:
    all_tags.append(w.tag_)

In [31]:
all_tags

['NNP',
 'NNP',
 'NNP',
 'JJ',
 'NNP',
 'CC',
 'JJ',
 'NN',
 'NNP',
 'NNP',
 'NNP',
 'NNP',
 'VBD',
 'NNP',
 'NNP']

In [33]:
# Dictionary of noun and verb POS tags
pos_dic = {"noun" : ["NNP", "NN", "NNS", "NNPS"], "verb" : ["VBZ", "VB", "VBD","VBG", "VBN"]}

In [34]:
# Sample document: Noun count
count = 0
for tag in all_tags:
    if tag in pos_dic['noun']:
        count += 1

In [35]:
count

11

In [36]:
# Function for noun and verb counts
def pos_check(txt, family):

    # spacy document
    txt = nlp(txt)
    
    all_tags = []

    # Get pos tag
    for w in txt:
        all_tags.append(w.tag_)
    
    count = 0

    # Count number of nouns and verbs
    for tag in all_tags:
        if tag in pos_dic[family]:
            count += 1

    return count

In [37]:
# Sample document: Noun count
pos_check("They are playing in the ground", "verb")

1

In [38]:
# Sample document: Noun count
pos_check("They are playing in the ground", "noun")

1

In [43]:
# Applying the function
data["noun_count"] = data["cleaned"].apply(lambda x : pos_check(x, "noun"))
data["verb_count"] = data["cleaned"].apply(lambda x : pos_check(x, "verb"))

In [44]:
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0,11,1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0,5,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3,12,0
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0,6,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0,3,2


In [40]:
#model building for meta features

In [45]:
# Label encoding target variable
from sklearn.preprocessing import LabelEncoder 

target = data["label"].values
target = LabelEncoder().fit_transform(target)

In [46]:
# List of features
train = data[['word_count', 'word_count_cleand', 'char_count', 
              'char_count_without_spaces', 'num_dig', 'noun_count', 
              'verb_count']]

In [47]:
# Train-Validation split
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(train, target, random_state=20, stratify=target)

In [48]:
# Train and Validatio dataset
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((4179, 7), (4179,)), ((1393, 7), (1393,)))

In [50]:
# Naive bayes
from sklearn import naive_bayes

In [51]:
# Multinomial naive bayes
model = naive_bayes.MultinomialNB()

In [52]:
# Fit model on training data
model.fit(x_train, y_train)

In [53]:
# Prediction on training data
pred_train = model.predict(x_train)
# Prediction on validation data
pred_valid = model.predict(x_valid)

In [54]:
# Accuracy
from sklearn.metrics import accuracy_score

In [55]:
# Training accuracy
accuracy_score(y_train, pred_train)

0.9440057430007178

In [56]:
# Validation accuracy
accuracy_score(y_valid, pred_valid)

0.9382627422828428

In [57]:
#improving accuracy by using tf-df features.

In [58]:
# Import Tf-Idf Vectoriser
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
# Tf-Idf Vectoriser
word_tfidf = TfidfVectorizer(max_features=500)

In [60]:
# Fit Tf-Idf Vectoriser
word_tfidf.fit(data["cleaned"].values)

In [61]:
# Transform
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)

In [62]:
word_vectors_tfidf

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 21920 stored elements in Compressed Sparse Row format>

In [63]:
# Combining meta features and Tf-Idf features
from scipy.sparse import hstack, csr_matrix

# List of meta features
meta_features = ['word_count', 'word_count_cleand',
       'char_count', 'char_count_without_spaces', 'num_dig', 'noun_count',
       'verb_count']

# Meta features
feature_set1 = data[meta_features]

# Combined features
train = hstack([word_vectors_tfidf, csr_matrix(feature_set1)], "csr")

In [64]:
# Train and Validation datasets
x_train, x_valid, y_train, y_valid = train_test_split(train, target, random_state=20, stratify=target)

In [65]:
# Train and Validation datasets
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((4179, 507), (4179,)), ((1393, 507), (1393,)))

In [66]:
# Multinomial Naive Bayes Model
model = naive_bayes.MultinomialNB()

In [67]:
model.fit(x_train, y_train)

In [68]:
# Predcition on Training data
pred_train = model.predict(x_train)
# Predcition on Validation data
pred_valid = model.predict(x_valid)

In [69]:
# Training accuracy
accuracy_score(y_train, pred_train)

0.9693706628379996

In [70]:
# Validation accuracy
accuracy_score(y_valid, pred_valid)

0.9605168700646087