In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [67]:
data = pd.read_csv('spam.csv',encoding = 'latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [68]:
data = data.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
data.columns = ["label", "text"]
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [69]:
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def clean_text(text):
    # Remove punctuation from the text
    no_punctuation = [ch for ch in text if ch not in string.punctuation]

    # Combine characters back into a string and convert to lowercase
    no_punctuation = ''.join(no_punctuation).lower()
    
    # Remove stopwords and non-alphabetic words
    clean_words = [word for word in no_punctuation.split() if word not in ENGLISH_STOP_WORDS and word.isalpha()]
    
    return clean_words



data["text"] = data["text"].apply(clean_text)

# Conver the SMS into string from list
data["text"] = data["text"].agg(lambda x: ' '.join(map(str, x)))
data.head()

Unnamed: 0,label,text
0,ham,jurong point crazy available bugis n great wor...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts tex...
3,ham,u dun say early hor u c say
4,ham,nah dont think goes usf lives


In [70]:
#Creating bag of words
vectorizer = CountVectorizer()
bow_transformer = vectorizer.fit(data['text'])

#We can now utilize the .transform method on our Bag-of-Words (BoW) transformer to convert the whole DataFrame of messages.
data_bow = bow_transformer.transform(data['text'])
tfidf_transformer = TfidfTransformer().fit(data_bow)

Using TF-IDF

In [71]:

from sklearn.feature_extraction.text import TfidfVectorizer

#Weighting and Normalization can be done with TF-IDF.
data_tfidf = tfidf_transformer.transform(data_bow)

#Convert cleaned text to a machine-readable format using TF-IDF.
vec = TfidfVectorizer(encoding = "latin-1", strip_accents = "unicode", stop_words = "english")
features = vec.fit_transform(data["text"])
print(features.shape)

print(len(vec.vocabulary_))

(5572, 7939)
7939


Train-Test Split

In [72]:
msg_train, msg_test, label_train, label_test = \
train_test_split(data_tfidf, data['label'], test_size=0.2)

 Naive Bayes classifier Model

In [73]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
spam_detect_model = clf.fit(msg_train, label_train)

predict_train = spam_detect_model.predict(msg_train)

In [74]:
print("Confusion Matrix \n",metrics.confusion_matrix(label_train, predict_train))
print("\n")
print("Accuracy of Train dataset : {0:0.3f}".format(metrics.accuracy_score(label_train, predict_train)))

Confusion Matrix 
 [[3857    0]
 [ 109  491]]


Accuracy of Train dataset : 0.976


Model Testing

In [75]:
label_predictions = spam_detect_model.predict(msg_test)

print('Confusion Matrix \n ',metrics.confusion_matrix(label_test, label_predictions))
print('\n')
print("Accuracy of the model : {0:0.3f}".format(metrics.accuracy_score(label_test, label_predictions)))

Confusion Matrix 
  [[967   1]
 [ 33 114]]


Accuracy of the model : 0.970
