In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Load the SMS spam collection dataset: a tab-separated file read with explicit
# column names ('label' for the class, 'message' for the raw text).
#
# The path is a raw string so every backslash is taken literally. The previous
# literal depended on unrecognised escapes ("\s", "\D", "\S"), which Python reports
# as invalid escape sequences (SyntaxWarning on 3.12+). The resulting path value is
# exactly the same as before.
#
# NOTE(review): this is an absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
# NOTE(review): messages containing double quotes may be merged by the default CSV
# quoting rules — verify the row count, and consider quoting=csv.QUOTE_NONE if rows
# are missing.
DATA_PATH = r"C:\Users\srika\Downloads\sms+spam+collection\SMSSpamCollection"
data = pd.read_csv(DATA_PATH, sep='\t', names=['label', 'message'])



In [2]:
# Data cleaning and preprocessing

# Resources shared by every preprocess_text call. Building the stop-word set reads
# the NLTK corpus, so doing it once instead of once per message matters when the
# function is applied to a whole column.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, stemmer) pair, creating both on first use."""
    if not _PREPROCESS_CACHE:
        # Build both before storing so a failure never leaves a half-filled cache.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        _PREPROCESS_CACHE.update(stop_words=stop_words, stemmer=stemmer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(message):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text and replace every character that is not an ASCII
         letter or digit with a space;
      2. tokenize with NLTK's ``word_tokenize``;
      3. drop English stop words;
      4. reduce each remaining token with the Porter stemmer;
      5. join the tokens back together with single spaces.

    Args:
        message: The raw message text (must support ``.lower()``, i.e. a ``str``).

    Returns:
        The cleaned text as a single string; empty if no token survives.

    Note:
        Presumably requires the NLTK stop-word corpus and tokenizer data to be
        installed locally — confirm in the target environment.
    """
    stop_words, stemmer = _preprocess_resources()

    # Remove non-alphanumeric characters and convert to lowercase
    message = re.sub('[^a-zA-Z0-9]', ' ', message.lower())

    # Tokenize, filter stop words, then stem what is left
    tokens = word_tokenize(message)
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)


In [3]:
# Clean every raw message and store the result alongside it in a new column
data['preprocessed_text'] = data['message'].map(preprocess_text)



In [4]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable
cleaned_messages = data['preprocessed_text']
message_labels = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_messages,
    message_labels,
    test_size=0.2,
    random_state=42,
)



In [5]:
# Turn the cleaned text into token-count features.
# The vocabulary is learned from the training messages only; the test messages
# are then encoded with that same fitted vocabulary.
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(X_train)
test_counts = vectorizer.transform(X_test)

# From here on X_train / X_test hold the count matrices instead of the text
X_train, X_test = train_counts, test_counts



In [6]:
# Fit a multinomial Naive Bayes model on the training count features
classifier = MultinomialNB()
classifier.fit(X=X_train, y=y_train)


In [7]:
# Label every held-out message with the trained classifier
y_pred = classifier.predict(X=X_test)



In [8]:
# Share of held-out messages whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9874439461883409


In [9]:


# Sanity-check the trained model on a few hand-written messages.
#
# The samples live in their own variables instead of rebinding `data` and `y_pred`:
# overwriting those would discard the training frame and the test-set predictions,
# so re-running the split or accuracy cells afterwards would fail.
sample_messages = pd.DataFrame({
    'message': [
        'Congratulations! You have won a free vacation. Reply to claim your prize.',
        'Hey, what are you up to tonight?',
        'URGENT: Your account has been suspended. Please click the link to reactivate.',
        'Meeting at 3 PM in the conference room.',
        'Get the latest fashion trends at discounted prices. Limited time offer!'
    ]
})

# Apply the same cleaning that was used on the training messages
sample_messages['preprocessed_text'] = sample_messages['message'].apply(preprocess_text)

# Encode with the already-fitted vectorizer (transform only — the vocabulary is not re-learned)
sample_features = vectorizer.transform(sample_messages['preprocessed_text'])

# Predict a label for each sample and attach it to the frame
sample_messages['predicted_label'] = classifier.predict(sample_features)

# Display the messages with their predicted labels
print(sample_messages[['message', 'predicted_label']])








                                             message predicted_label
0  Congratulations! You have won a free vacation....            spam
1                   Hey, what are you up to tonight?             ham
2  URGENT: Your account has been suspended. Pleas...            spam
3            Meeting at 3 PM in the conference room.             ham
4  Get the latest fashion trends at discounted pr...            spam
