In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
df = pd.read_csv('data/spam.csv',encoding='latin-1')

In [4]:
df['v2'] = [str.lower(x) for x in df['v2']]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"go until jurong point, crazy.. available only ...",,,
1,ham,ok lar... joking wif u oni...,,,
2,spam,free entry in 2 a wkly comp to win fa cup fina...,,,
3,ham,u dun say so early hor... u c already then say...,,,
4,ham,"nah i don't think he goes to usf, he lives aro...",,,


In [7]:
source = df['v2']
type(source)

pandas.core.series.Series

In [8]:
source.head()

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [9]:
target = df['v1']
type(target)

pandas.core.series.Series

In [10]:
target.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

In [11]:
# 0:ham, 1:spam
target = pd.get_dummies(target,drop_first=True)
target.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [12]:
cv = CountVectorizer(stop_words='english')

In [13]:
cv.fit(source)

CountVectorizer(stop_words='english')

In [14]:
cv.vocabulary_

{'jurong': 4224,
 'point': 5741,
 'crazy': 2271,
 'available': 1271,
 'bugis': 1703,
 'great': 3534,
 'world': 8227,
 'la': 4349,
 'buffet': 1701,
 'cine': 1994,
 'got': 3494,
 'amore': 1051,
 'wat': 8026,
 'ok': 5343,
 'lar': 4385,
 'joking': 4192,
 'wif': 8134,
 'oni': 5369,
 'free': 3265,
 'entry': 2875,
 'wkly': 8185,
 'comp': 2110,
 'win': 8146,
 'fa': 3005,
 'cup': 2329,
 'final': 3121,
 'tkts': 7519,
 '21st': 411,
 '2005': 402,
 'text': 7388,
 '87121': 784,
 'receive': 6115,
 'question': 6010,
 'std': 7028,
 'txt': 7701,
 'rate': 6062,
 'apply': 1128,
 '08452810075over18': 77,
 'dun': 2738,
 'say': 6450,
 'early': 2757,
 'hor': 3815,
 'nah': 5092,
 'don': 2651,
 'think': 7443,
 'goes': 3458,
 'usf': 7837,
 'lives': 4535,
 'freemsg': 3272,
 'hey': 3732,
 'darling': 2386,
 'week': 8071,
 'word': 8218,
 'like': 4485,
 'fun': 3323,
 'tb': 7323,
 'xxx': 8292,
 'chgs': 1948,
 'send': 6536,
 '50': 607,
 'rcv': 6074,
 'brother': 1674,
 'speak': 6910,
 'treat': 7634,
 'aids': 985,
 'pate

In [15]:
cv_transformed = cv.transform(source)
cv_array = cv_transformed.toarray()

In [16]:
cv_array.shape

(5572, 8404)

In [17]:
from scipy import sparse

In [18]:
a0 = sparse.csr_matrix(cv_array[0])
print(a0)

  (0, 1051)	1
  (0, 1271)	1
  (0, 1701)	1
  (0, 1703)	1
  (0, 1994)	1
  (0, 2271)	1
  (0, 3494)	1
  (0, 3534)	1
  (0, 4224)	1
  (0, 4349)	1
  (0, 5741)	1
  (0, 8026)	1
  (0, 8227)	1


In [19]:
# Import tfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Instantiate TfidfVectorizer
tv = TfidfVectorizer(max_features=500, stop_words='english')
tv

TfidfVectorizer(max_features=500, stop_words='english')

In [20]:
tv.fit_transform(source)

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [21]:
print(tv.get_feature_names())

['000', '10', '100', '1000', '10p', '11', '12hrs', '150', '150p', '150ppm', '16', '18', '1st', '2000', '250', '2nd', '50', '500', '5000', '750', '800', '8007', '86688', 'able', 'abt', 'account', 'actually', 'address', 'aft', 'afternoon', 'ah', 'aight', 'alright', 'amp', 'angry', 'answer', 'apply', 'ard', 'ask', 'asked', 'attempt', 'auction', 'available', 'await', 'award', 'awarded', 'away', 'awesome', 'babe', 'baby', 'bad', 'beautiful', 'bed', 'believe', 'best', 'better', 'big', 'birthday', 'bit', 'bonus', 'book', 'bored', 'box', 'boy', 'bring', 'brother', 'bt', 'bus', 'busy', 'buy', 'called', 'calling', 'calls', 'came', 'camera', 'car', 'care', 'cash', 'cause', 'chance', 'change', 'charge', 'chat', 'check', 'chikku', 'choose', 'claim', 'class', 'close', 'club', 'code', 'collect', 'collection', 'colour', 'com', 'come', 'comes', 'coming', 'congrats', 'contact', 'cool', 'cos', 'cost', 'coz', 'cs', 'customer', 'da', 'dad', 'dat', 'date', 'day', 'days', 'dear', 'decimal', 'delivery', 'den'

In [22]:
tv.vocabulary_

{'available': 42,
 'great': 177,
 'world': 484,
 'got': 175,
 'wat': 461,
 'ok': 301,
 'lar': 224,
 'wif': 472,
 'free': 157,
 'entry': 143,
 'win': 475,
 'final': 150,
 'text': 405,
 'receive': 345,
 'question': 338,
 'txt': 441,
 'rate': 340,
 'apply': 36,
 'dun': 135,
 'say': 360,
 'early': 137,
 'don': 125,
 'think': 413,
 'goes': 170,
 'hey': 199,
 'week': 465,
 'word': 480,
 'like': 238,
 'fun': 162,
 'xxx': 489,
 'send': 367,
 '50': 16,
 'brother': 65,
 'speak': 390,
 'set': 371,
 'friends': 159,
 'network': 287,
 'customer': 105,
 'selected': 366,
 'prize': 334,
 'claim': 86,
 'code': 90,
 'valid': 448,
 'hours': 206,
 'mobile': 276,
 '11': 5,
 'update': 444,
 'latest': 227,
 'colour': 93,
 'camera': 74,
 'gonna': 173,
 'home': 202,
 'soon': 387,
 'want': 458,
 'talk': 402,
 'stuff': 398,
 'tonight': 430,
 've': 449,
 'today': 423,
 'cash': 77,
 '100': 2,
 '000': 0,
 'pounds': 330,
 'cost': 102,
 '150p': 8,
 'day': 110,
 '16': 10,
 'reply': 347,
 'urgent': 446,
 'won': 478,
 'w

In [23]:
tv_transformed = tv.transform(source)
tv_array = tv_transformed.toarray()

In [24]:
tv_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
tv_df = pd.DataFrame(tv_array,columns=tv.get_feature_names()).add_prefix('TFIDF_')

In [26]:
tv_df.head()

Unnamed: 0,TFIDF_000,TFIDF_10,TFIDF_100,TFIDF_1000,TFIDF_10p,TFIDF_11,TFIDF_12hrs,TFIDF_150,TFIDF_150p,TFIDF_150ppm,...,TFIDF_ya,TFIDF_yeah,TFIDF_year,TFIDF_years,TFIDF_yes,TFIDF_yesterday,TFIDF_yo,TFIDF_yup,TFIDF_ì_,TFIDF_ìï
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
examine_row = tv_df.iloc[0]
print(examine_row.sort_values(ascending=False).head())

TFIDF_available    0.549238
TFIDF_world        0.496702
TFIDF_wat          0.410286
TFIDF_great        0.405632
TFIDF_got          0.344604
Name: 0, dtype: float64


In [28]:
# Tfidf & N-grams

In [29]:
tv_bi_gram_vec = TfidfVectorizer(ngram_range=(1,2),stop_words='english')
tv_bi_gram = tv_bi_gram_vec.fit_transform(source)

In [30]:
print(tv_bi_gram_vec.get_feature_names())



In [31]:
tv_bi_gram_vec.vocabulary_

{'jurong': 16991,
 'point': 24832,
 'crazy': 7794,
 'available': 3587,
 'bugis': 5262,
 'great': 13756,
 'world': 36201,
 'la': 17808,
 'buffet': 5257,
 'cine': 6501,
 'got': 13476,
 'amore': 2805,
 'wat': 34993,
 'jurong point': 16992,
 'point crazy': 24833,
 'crazy available': 7796,
 'available bugis': 3589,
 'bugis great': 5264,
 'great world': 13820,
 'world la': 36214,
 'la buffet': 17809,
 'buffet cine': 5258,
 'cine got': 6507,
 'got amore': 13482,
 'amore wat': 2806,
 'ok': 23058,
 'lar': 17917,
 'joking': 16908,
 'wif': 35615,
 'oni': 23306,
 'ok lar': 23122,
 'lar joking': 17927,
 'joking wif': 16911,
 'wif oni': 35628,
 'free': 12068,
 'entry': 10658,
 'wkly': 35884,
 'comp': 7173,
 'win': 35687,
 'fa': 11066,
 'cup': 7965,
 'final': 11538,
 'tkts': 32392,
 '21st': 989,
 '2005': 966,
 'text': 31311,
 '87121': 1918,
 'receive': 26189,
 'question': 25689,
 'std': 29971,
 'txt': 33275,
 'rate': 25885,
 'apply': 3072,
 '08452810075over18': 177,
 'free entry': 12107,
 'entry wkly

In [32]:
# Create a DataFrame with the Count features
tv_df_tfidf = pd.DataFrame(tv_bi_gram.toarray(), columns=tv_bi_gram_vec.get_feature_names()).add_prefix('Count_')

In [33]:
tv_df_tfidf.head()

Unnamed: 0,Count_00,Count_00 easter,Count_00 sub,Count_00 subs,Count_000,Count_000 bonus,Count_000 cash,Count_000 homeowners,Count_000 pounds,Count_000 price,...,Count_ûò entertaining,Count_ûò especially,Count_ûò favour,Count_ûò getting,Count_ûò hope,Count_ûò limping,Count_ûò sound,Count_ûò stick,Count_ûówell,Count_ûówell û_
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
tv_sums = tv_df_tfidf.sum()

In [35]:
tv_sums

Count_00            1.540984
Count_00 easter     0.168684
Count_00 sub        0.838168
Count_00 subs       0.346519
Count_000           4.507586
                      ...   
Count_ûò limping    0.212502
Count_ûò sound      0.313509
Count_ûò stick      0.171044
Count_ûówell        0.271743
Count_ûówell û_     0.271743
Length: 37249, dtype: float64

In [36]:
print(tv_sums.head())

Count_00           1.540984
Count_00 easter    0.168684
Count_00 sub       0.838168
Count_00 subs      0.346519
Count_000          4.507586
dtype: float64


# **Dont using feature (CountVectorizer)

In [37]:
df.head(2)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"go until jurong point, crazy.. available only ...",,,
1,ham,ok lar... joking wif u oni...,,,


In [38]:
X = df['v2'].values # Convert to array
y = df['v1'].values # Convert to array
X = [str.lower(x) for x in X]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [39]:
# Model 1: Naive Bayes

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [41]:
cv = CountVectorizer(max_features=500) # feature that accure more than one time

In [42]:
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.fit_transform(X_test).toarray()
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
clf = MultinomialNB()
clf

MultinomialNB()

In [44]:
clf.fit(X_train,y_train)

MultinomialNB()

In [45]:
y_pred = clf.predict(X_test)

In [46]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test,y_pred)
print(cm)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[840 109]
 [119  47]]
0.7955156950672646
              precision    recall  f1-score   support

         ham       0.88      0.89      0.88       949
        spam       0.30      0.28      0.29       166

    accuracy                           0.80      1115
   macro avg       0.59      0.58      0.59      1115
weighted avg       0.79      0.80      0.79      1115



# **Using feature (CountVectorizer)

In [47]:
tv_df.head()

Unnamed: 0,TFIDF_000,TFIDF_10,TFIDF_100,TFIDF_1000,TFIDF_10p,TFIDF_11,TFIDF_12hrs,TFIDF_150,TFIDF_150p,TFIDF_150ppm,...,TFIDF_ya,TFIDF_yeah,TFIDF_year,TFIDF_years,TFIDF_yes,TFIDF_yesterday,TFIDF_yo,TFIDF_yup,TFIDF_ì_,TFIDF_ìï
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
df_ = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1)
df_.head(2)

Unnamed: 0,v1,v2
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...


In [49]:
df_train = pd.concat([df_,tv_df],axis=1,sort=False)
df_train.head()

Unnamed: 0,v1,v2,TFIDF_000,TFIDF_10,TFIDF_100,TFIDF_1000,TFIDF_10p,TFIDF_11,TFIDF_12hrs,TFIDF_150,...,TFIDF_ya,TFIDF_yeah,TFIDF_year,TFIDF_years,TFIDF_yes,TFIDF_yesterday,TFIDF_yo,TFIDF_yup,TFIDF_ì_,TFIDF_ìï
0,ham,"go until jurong point, crazy.. available only ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ham,ok lar... joking wif u oni...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ham,u dun say so early hor... u c already then say...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ham,"nah i don't think he goes to usf, he lives aro...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# 0:ham, 1:spam
df_train['v1_'] = pd.get_dummies(df_train['v1'],drop_first=True)
df_train.head()
columns = df_train.columns
columns = columns.drop(['v1','v2','v1_'])
X = df_train[columns].values # Convert to array
y = df_train['v1_'].values # Convert to array

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [51]:
clf = MultinomialNB()
clf

MultinomialNB()

In [52]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm = confusion_matrix(y_test,y_pred)
print(cm)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[940   9]
 [ 26 140]]
0.968609865470852
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       949
           1       0.94      0.84      0.89       166

    accuracy                           0.97      1115
   macro avg       0.96      0.92      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [53]:
# Du ma ~97% ?????????????????? Tfidf is the best