In [27]:
import pandas as pd
import numpy as np

**Loading the dataset into the pandas data frame**

In [28]:
df = pd.read_csv('spam.csv')

df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Checking how many Spam and Non-Spam values this specific dataset contains**

In [29]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

Creating a new column in the DataFrame called 'spam'. In this new column, it places a 1 if the corresponding value in the 'Category' column is 'spam', and a 0 otherwise.

In [30]:
df['spam'] = df['Category'].apply(lambda x:1 if x == 'spam' else 0)

**Below you can see there is an extra column in the DataFrame called 'spam'. That is placing a 1 if the corresponding value in the 'Category' column is 'spam', and a 0 otherwise.**

In [31]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


**Splitting the data using scikit-learn where x is the data in message column and y is the spam column**

**We split the data into a 80/20 split where 20% is testing data and 80% is training data**

In [32]:
from sklearn.model_selection import train_test_split
X_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [33]:
type(X_train)

pandas.core.series.Series

In [34]:
X_train[:4]

1513    Hey sweet, I was wondering when you had a mome...
2097                                          I'm done...
5456    For the most sparkling shopping breaks from 45...
4344    Its a valentine game. . . send dis msg to all ...
Name: Message, dtype: object

In [35]:
type(y_train)

pandas.core.series.Series

In [36]:
y_train[:5]

1513    0
2097    0
5456    1
4344    0
4377    1
Name: spam, dtype: int64

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
type(X_train.values)

numpy.ndarray

In [66]:
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59050 stored elements and shape (4457, 7742)>

In [67]:
X_train_np = X_train_cv.toarray()

In [68]:
X_train_cv.shape

(4457, 7742)

**The code bellow shows a bunch ogf words that the vocabulary has right now**

In [51]:
cv.get_feature_names_out()[1000:1050]

array(['approaches', 'approaching', 'appropriate', 'approve', 'approx',
       'apps', 'appt', 'appy', 'april', 'aproach', 'apt', 'aquarius',
       'ar', 'arab', 'arabian', 'arcade', 'archive', 'ard', 'are', 'area',
       'aren', 'arent', 'arestaurant', 'areyouunique', 'argentina',
       'argh', 'argue', 'argument', 'arguments', 'aries', 'arise',
       'arithmetic', 'arm', 'armand', 'arms', 'arng', 'arngd', 'arnt',
       'around', 'aroundn', 'arr', 'arrange', 'arranging', 'arrested',
       'arrival', 'arrive', 'arrow', 'arsenal', 'art', 'artists'],
      dtype=object)

**Below you can see we have a total of 7742 words in the vocabulary**

In [52]:
cv.get_feature_names_out().shape

(7742,)

**Here are all the words in the vocabulary:**

In [58]:
cv.vocabulary_

{'hey': 3416,
 'sweet': 6651,
 'was': 7392,
 'wondering': 7583,
 'when': 7483,
 'you': 7703,
 'had': 3286,
 'moment': 4553,
 'if': 3592,
 'might': 4463,
 'come': 1888,
 'to': 6927,
 'me': 4399,
 'want': 7378,
 'send': 6012,
 'file': 2821,
 'someone': 6303,
 'but': 1551,
 'it': 3754,
 'won': 7579,
 'go': 3149,
 'over': 5009,
 'yahoo': 7668,
 'for': 2924,
 'them': 6825,
 'because': 1251,
 'their': 6823,
 'connection': 1947,
 'sucks': 6577,
 'remember': 5687,
 'set': 6038,
 'up': 7181,
 'that': 6813,
 'page': 5034,
 'and': 928,
 'download': 2409,
 'the': 6817,
 'format': 2942,
 'disc': 2321,
 'could': 2010,
 'tell': 6752,
 'how': 3514,
 'do': 2351,
 'or': 4953,
 'know': 3950,
 'some': 6299,
 'other': 4981,
 'way': 7413,
 'big': 1319,
 'files': 2822,
 'they': 6840,
 'can': 1608,
 'stuff': 6544,
 'directly': 2310,
 'from': 3006,
 'internet': 3707,
 'any': 961,
 'help': 3395,
 'would': 7611,
 'be': 1241,
 'great': 3227,
 'my': 4652,
 'prey': 5404,
 'teasing': 6741,
 'kiss': 3939,
 'done': 23

In [64]:
X_train_cv[:4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [74]:
np.where(X_train_np[0] !=0)

(array([ 928,  961, 1241, 1251, 1319, 1551, 1608, 1888, 1947, 2010, 2310,
        2321, 2351, 2409, 2821, 2822, 2924, 2942, 3006, 3149, 3227, 3286,
        3395, 3416, 3514, 3592, 3707, 3754, 3939, 3950, 4399, 4463, 4553,
        4652, 4953, 4981, 5009, 5034, 5404, 5687, 6012, 6038, 6299, 6303,
        6544, 6577, 6651, 6741, 6752, 6813, 6817, 6823, 6825, 6840, 6927,
        7181, 7378, 7392, 7413, 7483, 7579, 7583, 7611, 7668, 7703]),)

In [80]:
X_train_np[0][3416]

1

**Making the Machine Learning Model:**

In [81]:
from sklearn.naive_bayes import MultinomialNB

In [83]:
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [85]:
X_test_cv = cv.transform(x_test)

**Evaluating the model:**

In [87]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       945
           1       0.98      0.92      0.95       170

    accuracy                           0.98      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



**Let's test it out on some emails that I recived, first one is spam obviously and the second email isn't**

In [92]:
emails = ['Congratulations talha! A balance of $5000.00 is available for your AccountThis transaction may only appear on your account after validate your info.',
          'Hey Talha, can we get together to watch footbal game tomorrow?']

In [93]:
emails_count = cv.transform(emails)

In [94]:
model.predict(emails_count)

array([1, 0])

**Instead of doing all the work we did above, we can just go ahead and use sklearn pipeline too**

In [95]:
from sklearn.pipeline import Pipeline

clf = Pipeline([('vectorizer', CountVectorizer()),('classifier', MultinomialNB())])
clf.fit(X_train, y_train)

In [98]:
print(classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3880
           1       0.99      0.97      0.98       577

    accuracy                           0.99      4457
   macro avg       0.99      0.98      0.99      4457
weighted avg       0.99      0.99      0.99      4457

