In [1]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

This dataset contains a collection of email text messages, labeled as either spam or not spam. Each email message is associated with a binary label, where "1" indicates that the email is spam, and "0" indicates that it is not spam. The dataset is intended for use in training and evaluating spam email classification models.


In [2]:
# Load the labelled emails; the preview below shows a "text" column and a "spam" label column.
data = pd.read_csv("emails.csv",encoding="utf8")
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
# Read the custom stopword file into a list of raw lines.
with open("NLTK's list of english stopwords", encoding="utf8") as Stopwords:
    # Iterating a text file yields its lines with the trailing newline kept.
    stopwords = list(Stopwords)

# Remove the newline characters so each entry is a bare word.
st = [line.replace("\n", '') for line in stopwords]
st[:5]

['i', 'me', 'my', 'myself', 'we']

In [4]:
# Number of entries read from the custom stopword file.
len (st)

1817

In [5]:
# Fetch NLTK's stopword corpus so it can be loaded in the next cell.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Sina's
[nltk_data]     Pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# English stopword list shipped with NLTK.
nltk_stopwords = nltk.corpus.stopwords.words("english")

In [7]:
# Combine both sources into one list: NLTK's entries first, then the custom file's.
# Words present in both lists are kept twice.
allstopwords = [*nltk_stopwords, *st]
len(allstopwords)

2015

In [8]:
# Show the distinct stopwords.
# NOTE(review): this renders the entire set as cell output; a count or short slice
# (e.g. len(set(allstopwords))) would keep the notebook readable.
set(allstopwords)

{'',
 'bist',
 'sj',
 'detail',
 'small',
 'herseâ€\x9d',
 'downwards',
 'woulding',
 'havent',
 'nowhere',
 'wherein',
 'beside',
 'good',
 'below',
 'oughtnt',
 'afterwards',
 'inside',
 'nz',
 'hae',
 'parting',
 'each',
 "we're",
 'sg',
 'website',
 'opened',
 'ml',
 'hello',
 'interest',
 'wells',
 'after',
 'co',
 'tj',
 'years',
 'saved',
 'index',
 'greatest',
 'anymore',
 'mn',
 'as',
 'our',
 'certainly',
 'towards',
 'plenties',
 'nonetheless',
 'becomes',
 'seeing',
 'particularly',
 'merely',
 'outside',
 'beginnings',
 'formerly',
 'turning',
 'om',
 'msie',
 'afore',
 'ly',
 'old',
 'fewer',
 'tilled',
 'gs',
 'happens',
 'tried',
 'jo',
 'noone',
 'overaller',
 'oftenest',
 'wish',
 'sa',
 'p',
 'research',
 'whatever',
 'invention',
 'gq',
 'ifs',
 'gf',
 'd',
 'number',
 'though',
 'needing',
 'and',
 'sine',
 'backed',
 'whilst',
 'theyve',
 'something',
 'thises',
 'they',
 'accordingly',
 'forbye',
 'apparently',
 "couldn't",
 'latterly',
 'yet',
 'amongst',
 'form

In [9]:
# NOTE(review): wraps the list in another list and displays all of it; a slice such as
# allstopwords[:10] would give a preview without the extra nesting.
[allstopwords]

[['a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  "he'd",
  "he'll",
  'her',
  'here',
  'hers',
  'herself',
  "he's",
  'him',
  'himself',
  'his',
  'how',
  'i',
  "i'd",
  'if',
  "i'll",
  "i'm",
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it'd",
  "it'll",
  "it's",
  'its',
  'itself',
  "i've",
  'just',
  'll',
  'm',
  'ma',
  'me',
  'mightn',
  "mightn't",
  'more',
  'most',
  'mustn',
  "mustn't",
  'my',
  'mys

In [10]:
# Size of the combined stopword list (duplicates included).
len(allstopwords)

2015

In [11]:
# Empty frame with the two columns used downstream: cleaned email text and its label.
dataset = pd.DataFrame(columns=["body","category"])

In [12]:

# Download the Punkt tokenizer resources used by NLTK's word_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to C:\Users\Sina's
[nltk_data]     Pc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Sina's
[nltk_data]     Pc/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
# Strip stopwords from every email and pair the cleaned text with its label.
# A set gives constant-time membership tests; the combined stopword list has
# about two thousand entries and was previously scanned once per token.
stopword_set = set(allstopwords)

records = []
for body, label in zip(data["text"], data["spam"]):
    body_token = word_tokenize(body)
    body_token_filter = [w for w in body_token if w not in stopword_set]
    records.append({
        'body': ' '.join(body_token_filter),
        # Labels are stored as strings, as before.
        'category': str(label).replace("\n", ""),
    })

# Build the frame in one go instead of enlarging it row by row with .loc,
# which copies the data on every insert. The original row index is kept.
dataset = pd.DataFrame(records, columns=["body", "category"], index=data.index)

In [14]:
# Preview the cleaned text alongside its label.
dataset.head()

Unnamed: 0,body,category
0,: naturally irresistible corporate identity ha...,1
1,: stock trading gunslinger fanny merrill muzo ...,1
2,: unbelievable homes easy homeowner pre - appr...,1
3,: 4 color printing special request additional ...,1
4,": money , software cds ! software compatibilit...",1


In [15]:
# Fit a TF-IDF vectorizer on the cleaned email bodies.
# NOTE(review): the vectorizer is fitted on every row before the train/test split
# that happens in a later cell, so held-out text influences the features. Consider
# fitting on the training portion only.
vect = TfidfVectorizer()
vect.fit(dataset["body"])

In [16]:
# Turn each cleaned body into one row of a sparse TF-IDF matrix.
X=vect.transform(dataset["body"])
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 402834 stored elements and shape (5728, 36191)>

In [17]:
# Encode the string labels as integers for the classifier.
# LabelEncoder is imported with the other dependencies in the first cell.
le = LabelEncoder()
y=le.fit_transform(dataset["category"])
y

array([1, 1, 1, ..., 0, 0, 0], shape=(5728,))

In [18]:
# Original labels, positioned at the index of their encoded integer.
le.classes_

array(['0', '1'], dtype=object)

In [19]:

# Hold out 20% of the emails for evaluation.
# random_state pins the shuffle so the metrics below can be reproduced on re-run.
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [20]:
# Train a support vector classifier with scikit-learn's default settings.
svmc = svm.SVC()
svmc.fit(X_train,Y_train)

In [21]:
# Predicted labels for the held-out emails.
y_=svmc.predict(X_test)

In [22]:
# Per-class precision, recall and F1 on the held-out set.
report=classification_report(Y_test,y_)
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       863
           1       0.99      0.93      0.96       283

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146



In [23]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(Y_test,y_)

array([[861,   2],
       [ 20, 263]])

In [24]:
# Mean accuracy on the held-out set.
svmc.score(X_test,Y_test)

0.9808027923211169

In [25]:
# Sample email text used to try out the trained classifier.
new_email="""
Staff Picks
Hi there. When it comes to showing customers that you care, there’s no “right” way. Sometimes you let them know with words. Sometimes it’s with numbers. 

And sometimes, it can be as simple as convenient options at the right moment. Let’s check out some ways you can delight customers without extra effort.
"""

In [26]:

# Classify one new email with the same preprocessing used for the training data:
# tokenize, drop stopwords, re-join, then vectorize with the fitted TF-IDF model.
body = new_email
body_token = word_tokenize(body)
stopword_set = set(allstopwords)  # set lookup instead of scanning the list per token
body_token_filter = [w for w in body_token if w not in stopword_set]
body_token_clear = ' '.join(body_token_filter)
list_new_email = [body_token_clear]  # transform expects an iterable of documents
X_new = vect.transform(list_new_email)
res = svmc.predict(X_new)
# predict returns one entry per document. Read the single prediction explicitly
# instead of truth-testing the whole array, which only works for one element.
# Encoded class 1 corresponds to the "1" (spam) label.
if res[0] == 1:
    print("It is Spam!")
else:
    print("It is Not Spam!")


It is Not Spam!
