In [2]:
# Step 1: Install necessary libraries
!pip install nltk scikit-learn



In [10]:
# Step 2: Import required modules and download nltk data
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import scipy
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Step 3: Craft Spam and Ham messages
spam_messages = [
    "Congratulations!!! You've won a free cruise to the Bahamas! Click here to claim now!",
    "Urgent: Your account has been breached. Send your bank details to secure it.",
    "Get rich quick! Double your income in just one week.",
    "You're a winner! Claim your prize now.",
    "Limited-time offer! Buy one, get one free. Act fast!"
]

ham_messages = [
    "Hi, how are you? Let's catch up soon.",
    "Reminder: Meeting tomorrow at 2 PM.",
    "The weather is great today. Enjoy your day!",
    "Thank you for your email. I'll get back to you soon.",
    "Check out this interesting article I found."
]


In [7]:
# Step 4: Preprocess messages
def preprocess_text(text):
    # Text normalization (convert to lowercase)
    text = text.lower()

    # Tokenization
    words = nltk.word_tokenize(text)

    # Stopword removal
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return ' '.join(words)

# Apply preprocessing to all messages
preprocessed_spam = [preprocess_text(message) for message in spam_messages]
preprocessed_ham = [preprocess_text(message) for message in ham_messages]


In [8]:
# Step 5: Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_spam = vectorizer.fit_transform(preprocessed_spam)
X_ham = vectorizer.transform(preprocessed_ham)

In [11]:
# Step 6: Optional - Train a classifier
# Concatenate spam and ham features
X = scipy.sparse.vstack([X_spam, X_ham])
y = [1] * len(spam_messages) + [0] * len(ham_messages)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [12]:
# Step 7: Test the classifier and evaluate performance
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.00
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



In [14]:
#testing
# Add your own spam and ham messages
your_own_messages = [
    "This is an interesting offer. Click to learn more!",
    "Hello, how are you today? Let's plan to meet up.",
    " Meeting postponed to 3 PM. Please update your schedule."
    # Add more messages as needed
]

# Preprocess your own messages
preprocessed_own_messages = [preprocess_text(message) for message in your_own_messages]

# Extract features using the trained vectorizer
X_own = vectorizer.transform(preprocessed_own_messages)

# Use the trained classifier to predict the labels
predictions_own = classifier.predict(X_own)

# Print the predictions for your own messages
for message, prediction in zip(your_own_messages, predictions_own):
    label = "Spam" if prediction == 1 else "Ham"
    print(f"Message: {message} \nPrediction: {label}\n")


Message: This is an interesting offer. Click to learn more! 
Prediction: Spam

Message: Hello, how are you today? Let's plan to meet up. 
Prediction: Ham

Message:  Meeting postponed to 3 PM. Please update your schedule. 
Prediction: Ham

