In [1]:
import pandas as pd

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from preprocessing.prepare_data import DataPreprocessor


### Loading data

In [2]:
# Read the raw dataset from its relative path; the file is decoded as latin-1.
spam_csv_path = 'data/spam.csv'
messages_data = pd.read_csv(spam_csv_path, encoding='latin-1')

### Review the data

In [3]:
# Report how many rows were loaded, then preview the first three of them.
sample_count = len(messages_data)
print('Number of samples: ', sample_count)
messages_data.head(n=3)

Number of samples:  5572


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


### Cleaning messages and extracting vectorized words

In [4]:
# Keep a single preprocessor instance around: it is used again further down
# to transform a hand-written message before classification.
data_preprocessor = DataPreprocessor()

# get_x_y yields the feature table and the labels as a pair.
features_and_labels = data_preprocessor.get_x_y(messages_data)
X, y = features_and_labels

In [5]:
# Show the dimensions of the feature table and the first row.
feature_table_shape = X.shape
print('X shape: ', feature_table_shape)
X.iloc[0]

X shape:  (5572, 7791)


____           0
aa             0
aah            0
aaniye         0
aaooooright    0
              ..
ûªve           0
ûï             0
ûïharry        0
ûò             0
ûówell         0
Name: 0, Length: 7791, dtype: int64

### Splitting data

In [6]:
# Hold out 25% of the samples for evaluation.
# The shuffle seed is fixed so that the split -- and therefore every accuracy
# figure computed from it -- is the same on each Restart & Run All. Without
# it, re-running the notebook produces different numbers every time.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SPLIT_SEED
)

In [7]:
# Train a multinomial Naive Bayes model on the training split.
# fit() is left as the last expression so the fitted estimator is displayed.
multinomial_nbc = MultinomialNB()
multinomial_nbc.fit(X=X_train, y=y_train)


MultinomialNB()

In [8]:
# Score the multinomial model on the held-out split.
y_pred_multinomial = multinomial_nbc.predict(X_test)
multinomial_accuracy = accuracy_score(y_test, y_pred_multinomial)
print('MultinomialNB accuracy: ', multinomial_accuracy)

MultinomialNB accuracy:  0.9763101220387652


In [9]:
# Train a Gaussian Naive Bayes model on the same training split.
# The trailing semicolon hides the estimator repr.
gaussian_nb_clf = GaussianNB()
gaussian_nb_clf.fit(X=X_train, y=y_train);

In [10]:
# Score the Gaussian model on the held-out split.
y_pred_gaussian = gaussian_nb_clf.predict(X_test)
gaussian_accuracy = accuracy_score(y_test, y_pred_gaussian)
print('GaussianNB accuracy: ', gaussian_accuracy)

GaussianNB accuracy:  0.8930366116295765


In [11]:
# Train a Bernoulli Naive Bayes model on the same training split.
# The trailing semicolon hides the estimator repr.
bernoulli_nbc = BernoulliNB()
bernoulli_nbc.fit(X=X_train, y=y_train);

In [12]:
# Score the Bernoulli model on the held-out split.
y_pred_bernoulli = bernoulli_nbc.predict(X_test)
bernoulli_accuracy = accuracy_score(y_test, y_pred_bernoulli)
print('BernoulliNB accuracy: ', bernoulli_accuracy)

BernoulliNB accuracy:  0.9784637473079684


### Testing the BernoulliNB classifier on a real message

In [13]:
# Sample extortion-style scam email used as a hand-picked test input.
# The text is kept verbatim (original typos, punctuation and blank lines
# included); later cells wrap it in a DataFrame, run it through
# data_preprocessor, and classify it with the BernoulliNB model.
message = '''I know mrwhatisthis is one of your password on day of hack..
Lets get directly to the point.
Not one person has paid me to check about you.
You do not know me and you're probably thinking why you are getting this email?
in fact, i actually placed a malware on the adult vids (adult porn) website and you know what, you visited this site to experience fun (you know what i mean).

When you were viewing videos, your browser started out operating as a RDP having a key logger which provided me with accessibility to your display and web cam.

immediately after that, my malware obtained every one of your contacts from your Messenger, FB, as well as email account.

after that i created a double-screen video. 1st part shows the video you were viewing (you have a nice taste omg), and 2nd part displays the recording of your cam, and its you.

Best solution would be to pay me $5599.
We are going to refer to it as a donation. in this situation, i most certainly will without delay remove your video.


Bitcoin address: 15Ti1AK3zNntkSJGB8dnJs1rQsJepwF85o

[case SeNSiTiVe, copy & paste it]

You could go on your life like this never happened and you will not ever hear back again from me.

You'll make the payment via Bitcoin (if you do not know this, search 'how to buy bitcoin' in Google).
if you are planning on going to the law, surely, this e-mail can not be traced back to me, because it's hacked too.

I have taken care of my actions. i am not looking to ask you for a lot, i simply want to be paid.


if i do not receive the bitcoin;, i definitely will send out your video recording to all of your contacts including friends and family, co-workers, and so on.


Nevertheless, if i do get paid, i will destroy the recording immediately.

If you need proof, reply with Yeah then i will send out your video recording to your 8 friends.
it's a nonnegotiable offer and thus please don't waste mine time & yours by replying to this message.'''


In [14]:
# Echo the raw string as the cell output (shown in repr form, with line breaks
# rendered as \n) to inspect it before any cleaning is applied.
message

"I know mrwhatisthis is one of your password on day of hack..\nLets get directly to the point.\nNot one person has paid me to check about you.\nYou do not know me and you're probably thinking why you are getting this email?\nin fact, i actually placed a malware on the adult vids (adult porn) website and you know what, you visited this site to experience fun (you know what i mean).\n\nWhen you were viewing videos, your browser started out operating as a RDP having a key logger which provided me with accessibility to your display and web cam.\n\nimmediately after that, my malware obtained every one of your contacts from your Messenger, FB, as well as email account.\n\nafter that i created a double-screen video. 1st part shows the video you were viewing (you have a nice taste omg), and 2nd part displays the recording of your cam, and its you.\n\nBest solution would be to pay me $5599.\nWe are going to refer to it as a donation. in this situation, i most certainly will without delay remove

In [15]:
# Wrap the raw text in a one-row frame under the 'v2' column, then push it
# through the preprocessor's cleaning and tokenizing steps in turn.
message_frame = pd.DataFrame(data=[message], columns=['v2'])
message_cleaned = data_preprocessor.clean_messages(message_frame)
single_message = data_preprocessor.get_tokenized(message_cleaned)

In [16]:
# The input was built from a single message, so the first prediction is the
# one that gets reported.
predicted_label = bernoulli_nbc.predict(single_message)[0]
print('Message was classified as: ', predicted_label)





Message was classified as:  spam
