In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd 
data =  pd.read_csv('/content/drive/My Drive/SMSSpamCollection.csv', sep = '\t', names = ['label', 'message'])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
text = data['message']
label = data['label']

In [5]:
#Number of Words
#x = lambda a : a + 10
#print(x(5))
data['word_count'] = data['message'].apply(lambda x: len(str(x).split(" ")))
data[['message','word_count']].head()

Unnamed: 0,message,word_count
0,"Go until jurong point, crazy.. Available only ...",20
1,Ok lar... Joking wif u oni...,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,28
3,U dun say so early hor... U c already then say...,11
4,"Nah I don't think he goes to usf, he lives aro...",13


In [6]:
#Number of characters
data['char_count'] = data['message'].str.len() ## this also includes spaces
data[['message','char_count']].head()

Unnamed: 0,message,char_count
0,"Go until jurong point, crazy.. Available only ...",111
1,Ok lar... Joking wif u oni...,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,U dun say so early hor... U c already then say...,49
4,"Nah I don't think he goes to usf, he lives aro...",61


In [7]:
#Average Word Length
def avg_word(sentence):
  words = sentence.split()
  #print(words)
  return (sum(len(word) for word in words)/len(words))

data['avg_word'] = data['message'].apply(lambda x: avg_word(x))
data[['message','avg_word']].head()

Unnamed: 0,message,avg_word
0,"Go until jurong point, crazy.. Available only ...",4.6
1,Ok lar... Joking wif u oni...,4.0
2,Free entry in 2 a wkly comp to win FA Cup fina...,4.571429
3,U dun say so early hor... U c already then say...,3.545455
4,"Nah I don't think he goes to usf, he lives aro...",3.769231


In [8]:
#Number of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

data['stopwords'] = data['message'].apply(lambda x: len([x for x in x.split() if x in stop]))
data[['message','stopwords']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,message,stopwords
0,"Go until jurong point, crazy.. Available only ...",4
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,5
3,U dun say so early hor... U c already then say...,2
4,"Nah I don't think he goes to usf, he lives aro...",5


In [9]:
#Number of special characters
data['hastags'] = data['message'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
data[['message','hastags']].head()

Unnamed: 0,message,hastags
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
#Number of numerics
data['numerics'] = data['message'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data[['message','numerics']].head()

Unnamed: 0,message,numerics
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
#Number of Uppercase words
data['upper'] = data['message'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data[['message','upper']].head()

Unnamed: 0,message,upper
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,U dun say so early hor... U c already then say...,2
4,"Nah I don't think he goes to usf, he lives aro...",1


In [13]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
from textblob import TextBlob, Word, Blobber
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

data['noun_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'noun'))
data['verb_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'verb'))
data['adj_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'adj'))
data['adv_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'adv'))
data['pron_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'pron'))
data[['message','noun_count','verb_count','adj_count', 'adv_count', 'pron_count' ]].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0,message,noun_count,verb_count,adj_count,adv_count,pron_count
0,"Go until jurong point, crazy.. Available only ...",9,1,3,3,0
1,Ok lar... Joking wif u oni...,4,1,1,0,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,14,3,4,0,0
3,U dun say so early hor... U c already then say...,3,3,2,3,0
4,"Nah I don't think he goes to usf, he lives aro...",1,5,0,3,3


In [14]:
data[['message','word_count','char_count','avg_word','stopwords','hastags','numerics','upper','noun_count','verb_count','adj_count', 'adv_count', 'pron_count','label' ]].head()

Unnamed: 0,message,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,noun_count,verb_count,adj_count,adv_count,pron_count,label
0,"Go until jurong point, crazy.. Available only ...",20,111,4.6,4,0,0,0,9,1,3,3,0,ham
1,Ok lar... Joking wif u oni...,6,29,4.0,0,0,0,0,4,1,1,0,0,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,28,155,4.571429,5,0,2,2,14,3,4,0,0,spam
3,U dun say so early hor... U c already then say...,11,49,3.545455,2,0,0,2,3,3,2,3,0,ham
4,"Nah I don't think he goes to usf, he lives aro...",13,61,3.769231,5,0,0,1,1,5,0,3,3,ham


In [15]:
features = data[['word_count','char_count','avg_word','stopwords','hastags','numerics','upper','noun_count','verb_count','adj_count', 'adv_count', 'pron_count']]

#label = data['label']

import numpy as np
classes_list = ["ham","spam"]
label_index = data['label'].apply(classes_list.index)
label = np.asarray(label_index)

import numpy as np
features_array = np.asarray(features)

features_array.shape

(5572, 12)

In [16]:
# data split into train and text
import numpy as np
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_array, label, test_size=0.33, random_state=42)

In [17]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.svm import SVC
model_SVM = SVC()
model_SVM.fit(x_train, y_train)
y_pred_SVM = model_SVM.predict(x_test)
print("SVM")
print("Accuracy score =", accuracy_score(y_test, y_pred_SVM))
print(metrics.classification_report(y_test, y_pred_SVM))



from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train,y_train)
y_pred_naive = naive.predict(x_test)
print("Naive Bayes")
print("Accuracy score =", accuracy_score(y_test, y_pred_naive))
print(metrics.classification_report(y_test, y_pred_naive ))

SVM
Accuracy score = 0.933115823817292
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1593
           1       0.83      0.63      0.71       246

    accuracy                           0.93      1839
   macro avg       0.89      0.80      0.84      1839
weighted avg       0.93      0.93      0.93      1839

Naive Bayes
Accuracy score = 0.9320282762370854
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1593
           1       0.72      0.80      0.76       246

    accuracy                           0.93      1839
   macro avg       0.84      0.88      0.86      1839
weighted avg       0.94      0.93      0.93      1839



In [18]:
# data split into train and text
import numpy as np
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_array, label, test_size=0.33, random_state=42)

x_train.shape


(3733, 12)

In [22]:
data = pd.read_csv('/content/drive/My Drive/SMSSpamCollection.csv', sep = '\t', names = ['label','message'])

text = data['message']
class_label = data['label']

import numpy as np
classes_list = ["ham","spam"]
label_index = class_label.apply(classes_list.index)
label = np.asarray(label_index)

import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.33, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range = (1,1))
x_train = vectorizer.fit_transform(X_train)
x_test = vectorizer.transform(X_test)

x_train.shape

(3733, 7082)

In [23]:
vectorizer.get_feature_names()

['00',
 '000',
 '000pes',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '07801543489',
 '07808247860',
 '07815296484',
 '07821230901',
 '07880867867',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700435505150p',
 '08700621170150p',
 '08701213186',
 '087012373

In [24]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.svm import SVC
model_SVM = SVC()
model_SVM.fit(x_train, y_train)
y_pred_SVM = model_SVM.predict(x_test)
print("SVM")
print("Accuracy score =", accuracy_score(y_test, y_pred_SVM))
print(metrics.classification_report(y_test, y_pred_SVM))


from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train.toarray(),y_train)
y_pred_naive = naive.predict(x_test.toarray())
print("Naive Bayes")
print("Accuracy score =", accuracy_score(y_test, y_pred_naive))
print(metrics.classification_report(y_test, y_pred_naive ))

SVM
Accuracy score = 0.9869494290375204
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1593
           1       1.00      0.90      0.95       246

    accuracy                           0.99      1839
   macro avg       0.99      0.95      0.97      1839
weighted avg       0.99      0.99      0.99      1839

Naive Bayes
Accuracy score = 0.9059271343121261
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      1593
           1       0.60      0.91      0.72       246

    accuracy                           0.91      1839
   macro avg       0.79      0.91      0.83      1839
weighted avg       0.93      0.91      0.91      1839

