In [26]:
#Spam Classifier using Multinomial Naive Bayes

In [1]:
import numpy as np
import pandas as pd
import nltk
import string


from nltk.corpus import  stopwords
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer



from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK data packages for this notebook. The preprocessing code below
# relies on the stop-word list and the Punkt tokenizer data; 'wordnet' and the
# POS tagger are not referenced by any code visible in this notebook.
for resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Kept as the cell's final bare expression so its return value is still displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load Data: read the tab-separated SMS file straight from GitHub. Column names
# are supplied here, so the first line of the file is treated as data.

url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [4]:
# Size of the loaded frame as (rows, columns)
df.shape

(5572, 2)

In [5]:
# Show the raw text of the first message (row label 0) as a sanity check
df.loc[0, 'message']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [6]:
# Column labels assigned via `names=` when the file was read
df.columns

Index(['label', 'message'], dtype='object')

In [7]:
# Class balance of the target. The recorded output shows 4825 ham vs 747 spam,
# so the data is heavily skewed towards ham.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [8]:
# Preprocessing Function

def process(text):
    """Normalise one SMS message into a space-separated string of content tokens.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, and drop the
    tokens found in NLTK's English stop-word list.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    text = text.lower()

    # Punctuation is deleted, not replaced by a space, so hyphenated words are
    # fused together (e.g. "egg-potato" becomes "eggpotato").
    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = word_tokenize(text)

    # Build the stop-word lookup once per call. The previous version called
    # stopwords.words('english') again for every single token and scanned the
    # resulting list linearly each time.
    stop_words = set(stopwords.words('english'))

    return " ".join(token for token in tokens if token not in stop_words)

In [9]:
# Run every raw message through process() and store the result in a new
# 'message_clean' column; the original 'message' column is left as it is.

df['message_clean'] = df['message'].map(process)

In [10]:
# Peek at the first row to compare the raw message with its cleaned form
df.head(1)

Unnamed: 0,label,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...


In [11]:
# Turn the cleaned messages into a TF-IDF feature matrix using the default
# TfidfVectorizer settings.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split performed in a later cell, so its vocabulary and IDF weights
# also see the test messages. Fitting on the training rows only would give a
# stricter held-out estimate.
vector =TfidfVectorizer()
X = vector.fit_transform(df['message_clean'])    # Input: one row per message

In [18]:
# Output: encode the target as integers (spam -> 1, ham -> 0)

label_codes = {'spam': 1, 'ham': 0}
Y = df['label'].map(label_codes)

In [19]:
# Split Data: hold out 10% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=123,
)

In [20]:
# Apply Naive Bayes: fit a multinomial Naive Bayes classifier, created with its
# default hyperparameters, on the training split only.

NB = MultinomialNB()
NB.fit(X_train, Y_train)

In [21]:
# Predict Outcomes of Test Data: one label per held-out message, using the
# training encoding (0 = ham, 1 = spam)

pred = NB.predict(X_test)

In [22]:
# Evaluate Algo: fraction of held-out messages whose predicted label matches
# the true label. The next cell prints this same score together with the
# confusion matrix and the per-class report.

accuracy_score(Y_test, pred)

0.9731182795698925

In [23]:
# Compute the three evaluation summaries for the held-out split first ...
test_accuracy = accuracy_score(Y_test, pred)
test_confusion = confusion_matrix(Y_test, pred)
test_report = classification_report(Y_test, pred)

# ... then print them in a fixed order.
print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)


Accuracy: 0.9731182795698925
Confusion Matrix:
 [[480   0]
 [ 15  63]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       480
           1       1.00      0.81      0.89        78

    accuracy                           0.97       558
   macro avg       0.98      0.90      0.94       558
weighted avg       0.97      0.97      0.97       558



In [24]:
import joblib

# Persist both fitted objects. The classifier was trained on columns produced
# by this exact vectorizer, so the two files belong together.
model_path = 'spam_classifier_model.joblib'
vectorizer_path = 'tfidf_vectorizer.joblib'

# Save the trained MultinomialNB model
joblib.dump(NB, model_path)

# Save the TF-IDF vectorizer (left as the final bare expression so its return
# value is still displayed)
joblib.dump(vector, vectorizer_path)


['tfidf_vectorizer.joblib']

In [26]:
import joblib
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources process() relies on: the stop-word list and the
# tokenizer data. 'punkt_tab' is requested as well, matching the setup cell at
# the top of the notebook; newer NLTK releases load it for word_tokenize, so
# without it this cell fails when run on its own in a fresh environment.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Load model and vectorizer from the files written by the training cell above.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only load artifacts that come from a trusted source.
model = joblib.load('spam_classifier_model.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')

# Preprocessing function (same as training)
def process(text):
    """Normalise one SMS message exactly as was done for the training data.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, and drop the
    tokens found in NLTK's English stop-word list.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    text = text.lower()

    # Punctuation is deleted, not replaced by a space, so hyphenated words are
    # fused together (e.g. "egg-potato" becomes "eggpotato").
    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = word_tokenize(text)

    # Build the stop-word lookup once per call. The previous version called
    # stopwords.words('english') again for every single token and scanned the
    # resulting list linearly each time.
    stop_words = set(stopwords.words('english'))

    return " ".join(token for token in tokens if token not in stop_words)

# CLI loop: keep reading messages until the user types 'exit' (surrounding
# whitespace and letter case are ignored), stdin is closed, or the kernel is
# interrupted. Each non-blank message is classified with the loaded model.
while True:
    try:
        user_input = input("Enter an SMS to classify ('exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # No more input is coming; leave the loop cleanly instead of surfacing
        # a traceback.
        print()
        print("Exiting Spam Classifier.")
        break

    command = user_input.strip()
    if command.lower() == 'exit':
        print("Exiting Spam Classifier.")
        break
    if not command:
        # A blank line carries no text to score; prompt again rather than
        # reporting a label for nothing.
        continue

    # Same pipeline as training: clean the text, then map it onto the fitted
    # TF-IDF vocabulary before asking the model for a label.
    clean_text = process(user_input)
    vectorized_text = vectorizer.transform([clean_text])
    prediction = model.predict(vectorized_text)

    # Label encoding used at training time: 1 = spam, 0 = ham.
    label = 'SPAM' if prediction[0] == 1 else 'HAM (Not Spam)'
    print(f"Prediction: {label}\n")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter an SMS to classify ('exit' to quit): GENT! We are trying to contact you. Last weekends draw shows that you won a £1000 prize GUARANTEED. Call 09064012160. Claim Code K52. Valid 12hrs only. 150ppm
Prediction: SPAM

Enter an SMS to classify ('exit' to quit): Goodo! Yes we must speak friday - egg-potato ratio for tortilla needed! 
Prediction: HAM (Not Spam)

Enter an SMS to classify ('exit' to quit): exit
Exiting Spam Classifier.
