In [7]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

In [8]:

# Read the labelled messages (v1 holds the label, v2 the text) with latin-1 decoding.
dataset = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')

# Bare expression so the notebook renders a preview of the frame.
dataset

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [9]:
# Show the full text of the third message (positional index 2).
dataset['v2'].iloc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [10]:
def clean_message(msg):
    """Normalize a raw message for bag-of-words tokenization.

    The text is lower-cased, every non-word character (anything that is not a
    letter, digit or underscore) is replaced by a space, runs of whitespace
    are collapsed to a single space, and the ends are trimmed.

    Parameters
    ----------
    msg : str
        Raw message text.

    Returns
    -------
    str
        Cleaned text with no leading or trailing whitespace.
    """
    msg = re.sub(r'\W', ' ', msg.lower())
    msg = re.sub(r'\s+', ' ', msg)
    # Trim last: punctuation at either end has only just been turned into
    # spaces, so trimming before the substitutions would leave a stray space.
    return msg.strip()

# Replace each message text with its normalized form, element by element.
dataset['v2'] = dataset['v2'].map(clean_message)

In [11]:
# Number of distinct whitespace-separated tokens across all cleaned messages;
# it is passed to the vectorizer as the cap on vocabulary size.
numer_of_unique_words = len({
    token
    for text in dataset['v2']
    for token in text.split()
})

# Learn the vocabulary and build the dense document-term count matrix.
vect = CountVectorizer(max_features=numer_of_unique_words)
tokenized_words = vect.fit_transform(dataset['v2']).toarray()

In [12]:
# Feature matrix: one row per message, one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and later removed in
# favour of get_feature_names_out(); use the new accessor when it exists and
# fall back to the old one so older installs keep working.
X = pd.DataFrame(
    data=tokenized_words,
    columns=(
        vect.get_feature_names_out()
        if hasattr(vect, 'get_feature_names_out')
        else vect.get_feature_names()
    ),
)
# Target: the label column ('ham' / 'spam').
y = dataset['v1']

In [13]:
# Hold out 25% of the messages for evaluation, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=15,
)

In [14]:
# Multinomial naive Bayes fitted on the training counts; fit() returns the
# estimator, and the bare name below keeps its repr as the cell output.
multinomial_nbc = MultinomialNB().fit(X_train, y_train)
multinomial_nbc


MultinomialNB()

In [15]:
# Accuracy of the multinomial model on the held-out split.
y_pred_multinomial = multinomial_nbc.predict(X_test)
print(
    'MultinomialNB: ',
    accuracy_score(y_true=y_test, y_pred=y_pred_multinomial),
)

MultinomialNB:  0.9770279971284996


In [16]:
# Gaussian naive Bayes on the same features; a plain assignment emits no cell output.
gaussian_nb_clf = GaussianNB().fit(X_train, y_train)

In [17]:
# Accuracy of the Gaussian model on the held-out split.
y_pred_gaussian = gaussian_nb_clf.predict(X_test)
print(
    'GaussianNB: ',
    accuracy_score(y_true=y_test, y_pred=y_pred_gaussian),
)

GaussianNB:  0.9009332376166547


In [18]:
# Bernoulli naive Bayes on the same features; a plain assignment emits no cell output.
bernoulli_nbc = BernoulliNB().fit(X_train, y_train)

In [19]:
# Accuracy of the Bernoulli model on the held-out split.
y_pred_bernoulli = bernoulli_nbc.predict(X_test)
print(
    'BernoulliNB: ',
    accuracy_score(y_true=y_test, y_pred=y_pred_bernoulli),
)

BernoulliNB:  0.9798994974874372


In [20]:
# Linear support-vector classifier on the count features.
# The solver draws random numbers while fitting, so pin the seed (same value
# as the train/test split) to make the fitted model repeatable across runs.
linearSVC = LinearSVC(random_state=15)
linearSVC.fit(X_train, y_train);

In [24]:
# Accuracy of the linear SVC on the held-out split.
y_pred_svc = linearSVC.predict(X_test)
print(
    'LinearSVC: ',
    accuracy_score(y_true=y_test, y_pred=y_pred_svc),
)

LinearSVC:  0.9834888729361091


Real message test:

In [25]:
# Hand-pasted sample text (an extortion-style e-mail) used to try the trained
# classifier on a single message. The literal is runtime data: keep it verbatim.
message = '''I know mrwhatisthis is one of your password on day of hack..
Lets get directly to the point.
Not one person has paid me to check about you.
You do not know me and you're probably thinking why you are getting this email?
in fact, i actually placed a malware on the adult vids (adult porn) website and you know what, you visited this site to experience fun (you know what i mean).

When you were viewing videos, your browser started out operating as a RDP having a key logger which provided me with accessibility to your display and web cam.

immediately after that, my malware obtained every one of your contacts from your Messenger, FB, as well as email account.

after that i created a double-screen video. 1st part shows the video you were viewing (you have a nice taste omg), and 2nd part displays the recording of your cam, and its you.

Best solution would be to pay me $5599.
We are going to refer to it as a donation. in this situation, i most certainly will without delay remove your video.


Bitcoin address: 15Ti1AK3zNntkSJGB8dnJs1rQsJepwF85o

[case SeNSiTiVe, copy & paste it]

You could go on your life like this never happened and you will not ever hear back again from me.

You'll make the payment via Bitcoin (if you do not know this, search 'how to buy bitcoin' in Google).
if you are planning on going to the law, surely, this e-mail can not be traced back to me, because it's hacked too.

I have taken care of my actions. i am not looking to ask you for a lot, i simply want to be paid.


if i do not receive the bitcoin;, i definitely will send out your video recording to all of your contacts including friends and family, co-workers, and so on.


Nevertheless, if i do get paid, i will destroy the recording immediately.

If you need proof, reply with Yeah then i will send out your video recording to your 8 friends.
it's a nonnegotiable offer and thus please don't waste mine time & yours by replying to this message.'''
# Apply the same normalization that was applied to the dataset's v2 column,
# rebinding the name to the cleaned text.
message = clean_message(message)
# Bare expression so the notebook displays the cleaned text.
message

'i know mrwhatisthis is one of your password on day of hack lets get directly to the point not one person has paid me to check about you you do not know me and you re probably thinking why you are getting this email in fact i actually placed a malware on the adult vids adult porn website and you know what you visited this site to experience fun you know what i mean when you were viewing videos your browser started out operating as a rdp having a key logger which provided me with accessibility to your display and web cam immediately after that my malware obtained every one of your contacts from your messenger fb as well as email account after that i created a double screen video 1st part shows the video you were viewing you have a nice taste omg and 2nd part displays the recording of your cam and its you best solution would be to pay me 5599 we are going to refer to it as a donation in this situation i most certainly will without delay remove your video bitcoin address 15ti1ak3znntksjgb

Prediction:

In [26]:
# Vectorize the cleaned sample with the fitted vocabulary and classify it.
# The model was fitted on a DataFrame with named columns, so pass the sample
# with the same column labels; a bare ndarray makes recent scikit-learn warn
# about missing feature names and relies on implicit column order.
linearSVC.predict(
    pd.DataFrame(vect.transform([message]).toarray(), columns=X.columns)
)[0]

'spam'