In [1]:
#import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the labelled message dataset from disk into a DataFrame.
# The file is decoded as ISO-8859-1 (Latin-1) — presumably because it is not
# valid UTF-8; confirm against the data source.
df = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='ISO-8859-1',
)

In [3]:
# Inspect the frame as loaded: besides v1 (label) and v2 (message text) the CSV
# carries "Unnamed: 2" .. "Unnamed: 4" columns, which show no values in the rows displayed.
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
# Keep only the label and text columns and give them descriptive names.
# errors="ignore" on drop (together with rename's default of skipping keys that
# are absent) makes this cell safe to re-run: without it, a second execution
# raises KeyError because the "Unnamed" columns have already been removed.
df = (
    df
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={'v1': 'label', 'v2': 'Messages'})
)

In [5]:
# Confirm the cleanup: only the renamed `label` and `Messages` columns should remain.
df

Unnamed: 0,label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Per-class summary of the message text (count / unique / top / freq); shows how
# unbalanced the two labels are and how many messages are duplicates.
df.groupby(by='label').describe()

Unnamed: 0_level_0,Messages,Messages,Messages,Messages
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [7]:
# Encode the target as an integer column: 1 for 'spam', 0 for any other label.
# A vectorised comparison replaces the per-row Python lambda; the explicit
# 'int64' keeps the column dtype the same as what .apply produced from Python
# ints, regardless of the platform's default integer size.
df['spam'] = (df['label'] == 'spam').astype('int64')

In [8]:
# Check the new binary `spam` column next to the original text label.
df

Unnamed: 0,label,Messages,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [9]:
# Hold out 25% of the messages for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same model); stratify keeps the spam/ham ratio equal
# in both parts, which matters because spam is the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    df.Messages,
    df.spam,
    test_size=0.25,
    random_state=42,
    stratify=df.spam,
)

In [10]:
# Summary of the training messages: how many there are, how many are distinct,
# and the most frequent text.
X_train.describe()

count                       4179
unique                      3928
top       Sorry, I'll call later
freq                          23
Name: Messages, dtype: object

In [11]:
# Build the bag-of-words representation: learn the vocabulary from the training
# messages and turn them into a document-term count matrix in a single step.
# The fitted `cv` is reused later to encode new messages with the same columns.
cv = CountVectorizer()
X_train_count = cv.fit_transform(raw_documents=X_train.values)

In [None]:
# CountVectorizer converts a collection of text documents into a numerical
# representation that machine learning algorithms can work with. It produces a
# document-term matrix in which each row represents a document and each column
# represents a unique word (term) found in the documents it was fitted on. Each
# value in the matrix is the number of times that word occurs in that document.

In [12]:
# Convert the document-term matrix X_train_count into a dense NumPy array,
# here only to look at it.
# NOTE(review): this materialises every cell of the matrix in memory; looking at
# X_train_count.shape or a small slice may be enough — confirm intent.
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# NOTE(review): X_test / y_test are not used by any cell visible here —
# consider scoring the model on the held-out split.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

MultinomialNB()

In [14]:
# Sanity check with an everyday greeting; class 0 means ham.
# The text goes through the already-fitted vectoriser (transform, not
# fit_transform) so its columns line up with the training vocabulary.
email_ham = ["Hello,how are you doing?"]
email_ham_count = cv.transform(raw_documents=email_ham)
model.predict(X=email_ham_count)


array([0], dtype=int64)

In [15]:
# Sanity check with a promotional-sounding message; class 1 means spam.
# As with any new text, it is encoded with the fitted vectoriser's vocabulary
# before being handed to the classifier.
email_spam = ["Login to claim free trip"]
email_spam_count = cv.transform(raw_documents=email_spam)
model.predict(X=email_spam_count)

array([1], dtype=int64)