In [1]:
import pickle
import os
from flask import Flask, request, jsonify
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Hand-labelled sample corpus. Every entry is a (message, label) pair whose
# label is either "spam" or "ham" (i.e. not spam).
data = [
    ("Get free money now", "spam"),
    ("Click this link to win a prize", "spam"),
    ("Free Course", "spam"),
    ("Your device has been logged in on another device", "ham"),
    ("Buy cheap Vbucks online from my site", "spam"),
    ("Join my Masterclass", "ham"),
    ("Hello, how are you today?", "ham"),
    ("Meeting scheduled for tomorrow at 10 AM", "ham"),
    ("I visited Switzerland. It was fun", "ham"),
    ("Please find the attached report document", "ham"),
    ("What's for dinner tonight?", "ham"),
    ("Your project update is due Friday", "ham"),
    ("Exclusive offer:50% OFF!", "spam"),
    ("Festive season offers. Discount on our website", "spam"),
    ("CHEAP CLOTHES AVAILABLE AT OUR STORE", "spam"),
    ("Reminder about the meeting scheduled for tomorrow at 10 AM", "ham"),
    ("Join my workshop/masterclass", "ham"),
    ("Exclusive offer to buy a bugatti for free", "spam"),
    ("Hot Single Moms in your Area", "spam"),
    ("Free Prize Won Lottery Cas Money Bonus Congratulations $$$", "spam"),
    # NOTE(review): phishing-style wording but labelled ham -- confirm that
    # this label is intentional.
    ("Urgent Act now Limited time Expires Account suspended Warning Security alert", "ham"),
]

# Transpose the pairs into two parallel lists: the messages (features) and
# their labels (targets).
X_train, y_train = (list(column) for column in zip(*data))
print(f"Loaded {len(X_train)} training examples.")
# Vectorizing (TfidfVectorizer) and classification (MultinomialNB) are chained
# into a single Pipeline object, so one fit/predict call drives both stages
# and the same preprocessing is applied when training and when predicting.

print("Creating processing pipeline...")
# Stage 'vectorizer': learns the vocabulary of the training messages and
#   weights each word by its TF-IDF score ("importance").
# Stage 'classifier': multinomial Naive Bayes, which learns how likely each
#   word is to appear in 'spam' versus 'ham' messages.
spam_classifier_pipeline = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer()),
        ('classifier', MultinomialNB()),
    ]
)

# Fitting the pipeline trains both stages on the labelled sample messages.
print("Training the model on sample data...")
spam_classifier_pipeline.fit(X=X_train, y=y_train)
print("Model training complete.")
print("\n--- Testing the Model ---")

# Example messages that are not part of the training data.
test_messages = [
    "Congratulations! You've won a $1,000,000 lottery! CLick here",
    "Internship opportunites for undergraduates",
    "Click here for a free iPhone",
    "Just a reminder about our meetup at the complex",
]

# .predict() runs the raw text through the entire pipeline (vectorize, then
# classify) and returns one label, 'spam' or 'ham', per message.
predictions = spam_classifier_pipeline.predict(test_messages)

# Display the results
for message, prediction in zip(test_messages, predictions):
    print(f"\nMessage: \"{message}\"")
    print(f"Prediction: -> {prediction.upper()} <-")

# Also show the confidence score: the estimated probability of each class.
try:
    # Only the predict_proba lookup/call is guarded, so an AttributeError
    # raised by the reporting code below is never misreported as
    # "classifier does not support predict_proba".
    probabilities = spam_classifier_pipeline.predict_proba(test_messages)
except AttributeError:
    print("\n(Classifier does not support predict_proba)")
else:
    # The columns of predict_proba follow the order of `classes_`; look the
    # column indices up instead of assuming that column 0 is 'ham'.
    class_labels = list(spam_classifier_pipeline.classes_)
    ham_col = class_labels.index('ham')
    spam_col = class_labels.index('spam')

    print("\n--- Prediction Probabilities (ham, spam) ---")
    for msg, prob in zip(test_messages, probabilities):
        print(f"\"{msg[:30]}...\" -> Ham: {prob[ham_col]:.4f}, Spam: {prob[spam_col]:.4f}")

def handle_new_comment(comment_text, model=None):
    """Classify a new comment and decide whether it may be posted.

    This is where a website would plug the spam model into its comment flow.

    Args:
        comment_text: Raw text of the submitted comment.
        model: Optional fitted classifier exposing ``predict(texts)`` that
            returns one 'spam'/'ham' label per text. When omitted, the
            module-level ``spam_classifier_pipeline`` is used.

    Returns:
        ``'moderation'`` if the comment is predicted to be spam, otherwise
        ``'approved'``.
    """
    if model is None:
        model = spam_classifier_pipeline

    # predict() works on a collection of documents, so wrap the single
    # comment in a list and take the only result.
    prediction = model.predict([comment_text])[0]

    if prediction == 'spam':
        print("Action: Flag comment as spam or hold for moderation by the mods.")
        return 'moderation'

    print("Action: Approve comment and get it ready to be posted.")
    return 'approved'

# Run the moderation helper on one ordinary and one obviously spammy comment.
for sample_comment in (
    "This is a great article, thanks!",
    "FREE MONEY CLICK NOW!!! www.scammer.com",
):
    handle_new_comment(sample_comment)

# Serialize the whole fitted pipeline (vectorizer + classifier) into a single
# pickle file.
pipeline_filename = 'text_model_pipeline.pkl'

with open(pipeline_filename, mode='wb') as pickle_out:
    pickle.dump(spam_classifier_pipeline, pickle_out)

print(f"✅ Full Pipeline saved as: '{pipeline_filename}'")



Loaded 21 training examples.
Creating processing pipeline...
Training the model on sample data...
Model training complete.

--- Testing the Model ---

Message: "Congratulations! You've won a $1,000,000 lottery! CLick here"
Prediction: -> SPAM <-

Message: "Internship opportunites for undergraduates"
Prediction: -> HAM <-

Message: "Click here for a free iPhone"
Prediction: -> SPAM <-

Message: "Just a reminder about our meetup at the complex"
Prediction: -> HAM <-

--- Prediction Probabilities (ham, spam) ---
"Congratulations! You've won a ..." -> Ham: 0.4039, Spam: 0.5961
"Internship opportunites for un..." -> Ham: 0.6151, Spam: 0.3849
"Click here for a free iPhone..." -> Ham: 0.3820, Spam: 0.6180
"Just a reminder about our meet..." -> Ham: 0.5972, Spam: 0.4028
Action: Approve comment and get it ready to be posted.
Action: Flag comment as spam or hold for moderation by the mods.
✅ Full Pipeline saved as: 'text_model_pipeline.pkl'
