In [31]:
# Importing libraries
import re
import nltk
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK stopword list used later for token filtering
nltk.download('stopwords')

# Load the tab-separated SMS dataset; column names are supplied explicitly
messages = pd.read_csv(
    'sms+spam+collection/SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# Initialize Porter Stemmer
ps = PorterStemmer()

# Build the stopword set once, outside the loop, so membership tests are O(1)
# and the stopword list is not reloaded for every token of every message.
english_stopwords = set(stopwords.words('english'))

# Preprocess the messages into a list of cleaned, space-joined token strings.
# Iterating over the column directly avoids label-based lookups (messages['message'][i]),
# which only work when the frame carries a default 0..n-1 index.
corpus = []
for message in messages['message']:
    # Replace every non-alphabetic character with a space
    review = re.sub('[^a-zA-Z]', ' ', message)

    # Convert to lowercase and tokenize on whitespace
    review = review.lower().split()

    # Drop stopwords, then stem what remains
    review = [ps.stem(word) for word in review if word not in english_stopwords]

    # Join the processed words back into a single string
    review = ' '.join(review)
    corpus.append(review)

# Create a Bag of Words, capping the vocabulary at 5000 features
cv = CountVectorizer(max_features=5000)
x = cv.fit_transform(corpus).toarray()

# One-hot encode the labels and keep the second dummy column as the target vector
y = pd.get_dummies(messages['label'])
y = y.iloc[:, 1].values


In [41]:
# Display the current target vector (the second column of the one-hot encoded labels)
y

array([False, False,  True, ..., False, False, False])

In [43]:
# Encode the target as integers: 1 where the label is 'spam', 0 for every other label.
# A vectorized comparison avoids calling a Python function once per row, and int64 is
# requested explicitly so the dtype does not depend on the platform's default int width.
y = (messages['label'] == 'spam').astype('int64').to_numpy()
y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


In [46]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training split
spam_detect_model = MultinomialNB()
spam_detect_model.fit(x_train, y_train)

# Predict labels for the held-out split
y_pred = spam_detect_model.predict(x_test)

In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix on the held-out split: rows are true classes, columns are predicted classes
con = confusion_matrix(y_true=y_test, y_pred=y_pred)
con

array([[946,   9],
       [  8, 152]], dtype=int64)

In [49]:
# Fraction of held-out samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9847533632286996