# **Bag of Words model implementation**

In [91]:
import pandas as pd
# Tab-separated SMS dataset. Column names are supplied explicitly, so pandas
# treats the first line of the file as data rather than as a header.
messages = pd.read_csv(
    'Data/smsspamclassification',
    sep='\t',
    names=['label', 'message'],
)

In [92]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [93]:
messages.label.value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

## Data Cleaning And Preprocessing

In [94]:
import re
import nltk
# The cleaning loop below lemmatizes with WordNetLemmatizer, which needs the
# 'wordnet' corpus in addition to the stop-word list; without it a fresh
# environment raises LookupError on the first lemmatize() call.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Pavilion\OneDriv
[nltk_data]     e\Desktop\UdemyMLCourse\venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# NOTE(review): `ps` is not referenced by any other cell visible in this
# notebook; the cleaning loop uses the lemmatizer `wl` instead.
ps =PorterStemmer()
from nltk.stem import WordNetLemmatizer
# Lemmatizer applied to every token in the corpus-building loop.
wl = WordNetLemmatizer()

In [96]:
# Build the cleaned corpus: one normalized, lemmatized string per SMS message.
# The stop-word list is materialized once as a set; re-reading it from NLTK for
# every token is loop-invariant work followed by a linear list scan.
stop_words = set(stopwords.words('english'))

corpus = []
for message in messages['message']:
    # Replace everything except ASCII letters with a space, then lowercase and tokenize.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words and reduce each remaining token to its verb lemma.
    lemmas = [wl.lemmatize(token, pos='v') for token in tokens if token not in stop_words]
    corpus.append(' '.join(lemmas))

In [97]:
corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine get amore wat',
 'ok lar joke wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf live around though']

In [98]:
## Binary bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records presence/absence (0/1) of a term instead of its count;
# max_features=200 keeps only the 200 most frequent terms of the corpus.
cv = CountVectorizer(binary=True, max_features=200)

# fit_transform returns a sparse matrix; densify it so rows can be inspected directly.
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()

In [99]:
X.shape

(5572, 200)

In [100]:
# Binary bag-of-words vector of the first message.
X[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int64)

## **N-Grams**

In [101]:
cv.vocabulary_ # Maps each of the 200 retained terms (max_features=200) to its column index in X

{'go': 56,
 'great': 59,
 'get': 54,
 'wat': 184,
 'ok': 119,
 'free': 50,
 'win': 189,
 'st': 154,
 'may': 97,
 'text': 163,
 'txt': 175,
 'dun': 38,
 'say': 138,
 'already': 0,
 'think': 167,
 'live': 87,
 'around': 5,
 'hey': 70,
 'week': 187,
 'word': 191,
 'back': 9,
 'like': 85,
 'still': 156,
 'send': 140,
 'even': 42,
 'speak': 152,
 'per': 122,
 'friends': 52,
 'customer': 30,
 'prize': 133,
 'claim': 20,
 'call': 14,
 'mobile': 104,
 'co': 22,
 'gonna': 57,
 'home': 73,
 'soon': 150,
 'want': 183,
 'talk': 161,
 'stuff': 158,
 'tonight': 173,
 'today': 170,
 'cash': 17,
 'cost': 27,
 'day': 32,
 'days': 33,
 'reply': 136,
 'urgent': 178,
 'www': 194,
 'right': 137,
 'thank': 164,
 'take': 160,
 'help': 69,
 'time': 169,
 'use': 180,
 'next': 112,
 'message': 100,
 'com': 23,
 'oh': 118,
 'watch': 185,
 'name': 109,
 'yes': 198,
 'make': 94,
 'fine': 46,
 'way': 186,
 'feel': 44,
 'dont': 36,
 'miss': 103,
 'ur': 177,
 'try': 174,
 'first': 48,
 'da': 31,
 'finish': 47,
 'lor'

In [102]:
len(cv.vocabulary_)

200

In [103]:
## Binary bag-of-words model over n-grams
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(1,3) makes unigrams, bigrams and trigrams candidate features;
# max_features=2500 keeps the 2500 most frequent of them, and binary=True
# stores presence/absence instead of counts.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split further down, so feature selection also sees test messages.
cv = CountVectorizer(ngram_range=(1,3), binary=True, max_features=2500)

ngram_sparse = cv.fit_transform(corpus)
X = ngram_sparse.toarray()

In [104]:
cv.vocabulary_

{'go': 815,
 'point': 1631,
 'crazy': 454,
 'available': 106,
 'bugis': 213,
 'great': 862,
 'world': 2437,
 'la': 1093,
 'cine': 344,
 'get': 773,
 'wat': 2339,
 'ok': 1489,
 'lar': 1106,
 'joke': 1062,
 'wif': 2395,
 'free': 714,
 'entry': 611,
 'wkly': 2425,
 'comp': 397,
 'win': 2401,
 'cup': 470,
 'final': 678,
 'st': 1982,
 'may': 1276,
 'text': 2089,
 'receive': 1725,
 'question': 1691,
 'std': 1998,
 'txt': 2192,
 'rate': 1705,
 'apply': 70,
 'free entry': 723,
 'entry wkly': 614,
 'std txt': 1999,
 'rate apply': 1706,
 'free entry wkly': 725,
 'std txt rate': 2000,
 'dun': 578,
 'say': 1813,
 'early': 585,
 'already': 45,
 'nah': 1399,
 'think': 2106,
 'usf': 2272,
 'live': 1175,
 'around': 80,
 'though': 2116,
 'think go': 2109,
 'freemsg': 736,
 'hey': 928,
 'week': 2359,
 'word': 2432,
 'back': 130,
 'like': 1160,
 'fun': 755,
 'still': 2001,
 'xxx': 2469,
 'send': 1853,
 'even': 621,
 'brother': 205,
 'speak': 1968,
 'treat': 2168,
 'per': 1552,
 'request': 1758,
 'set': 1

In [105]:
# One 0/1 indicator column per label value found in messages['label'].
label_dummies = pd.get_dummies(messages['label'])
y = label_dummies.astype('int')

In [106]:
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [107]:
y['ham'].value_counts()

ham
1    4825
0     747
Name: count, dtype: int64

In [108]:
y.iloc[:,0].values.shape

(5572,)

In [109]:
# Binary target as a 1-D array: 1 = ham, 0 = spam (so class "0" in the
# classification reports is spam). It is derived straight from `messages`
# rather than from the previous `y`, which keeps this cell safe to re-run:
# `y.iloc` fails once `y` has already been replaced by an ndarray.
y = (messages['label'] == 'ham').astype('int').values

In [110]:
y.shape

(5572,)

In [111]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [112]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the bag-of-words training features.
spam_detect_model = MultinomialNB()
spam_detect_model = spam_detect_model.fit(X_train, y_train)

In [113]:
# Predicted labels for the held-out messages from the bag-of-words model.
y_preds = spam_detect_model.predict(X_test)

In [114]:
from sklearn.metrics import classification_report, accuracy_score

In [115]:
accuracy_score(y_test,y_preds)

0.9847533632286996

In [116]:
# Labels come from the 'ham' indicator column, so row "1" is ham and row "0" is spam.
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94       149
           1       0.99      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over unigrams and bigrams, capped at the 2500 most frequent terms.
# NOTE(review): fitted on the whole corpus before splitting, so the IDF
# statistics also include the messages that later end up in the test set.
tv = TfidfVectorizer(ngram_range=(1,2), max_features=2500)

# X is rebound here: from this cell on it holds TF-IDF features, not bag-of-words.
tfidf_sparse = tv.fit_transform(corpus)
X = tfidf_sparse.toarray()

X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [118]:
tv.vocabulary_

{'go': 816,
 'point': 1620,
 'crazy': 449,
 'available': 110,
 'bugis': 224,
 'great': 858,
 'world': 2436,
 'la': 1103,
 'cine': 347,
 'get': 778,
 'wat': 2346,
 'ok': 1483,
 'lar': 1114,
 'joke': 1072,
 'wif': 2402,
 'oni': 1504,
 'free': 724,
 'entry': 605,
 'wkly': 2425,
 'comp': 398,
 'win': 2407,
 'fa': 645,
 'cup': 464,
 'final': 682,
 'st': 1965,
 'may': 1272,
 'text': 2077,
 'receive': 1716,
 'question': 1676,
 'std': 1980,
 'txt': 2194,
 'rate': 1691,
 'apply': 78,
 'free entry': 729,
 'entry wkly': 607,
 'std txt': 1981,
 'txt rate': 2198,
 'rate apply': 1692,
 'dun': 568,
 'say': 1798,
 'early': 575,
 'already': 51,
 'nah': 1394,
 'think': 2100,
 'usf': 2271,
 'live': 1180,
 'around': 89,
 'though': 2110,
 'think go': 2103,
 'freemsg': 738,
 'hey': 927,
 'darling': 482,
 'week': 2371,
 'word': 2432,
 'back': 129,
 'like': 1167,
 'fun': 759,
 'still': 1984,
 'xxx': 2463,
 'send': 1835,
 'even': 618,
 'brother': 215,
 'speak': 1946,
 'treat': 2171,
 'per': 1555,
 'request': 1

In [119]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as for the bag-of-words model, applied to the current X.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [120]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the current (TF-IDF) training features.
spam_tfidv_model = MultinomialNB()
spam_tfidv_model = spam_tfidv_model.fit(X_train, y_train)

In [121]:
# Predicted labels for the held-out messages from the TF-IDF model (overwrites the earlier y_preds).
y_preds = spam_tfidv_model.predict(X_test)

In [122]:
from sklearn.metrics import classification_report, accuracy_score

In [123]:
accuracy_score(y_test,y_preds)

0.9829596412556054

In [82]:
# Labels come from the 'ham' indicator column, so row "1" is ham and row "0" is spam.
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       1.00      0.87      0.93       149
           1       0.98      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

