In [None]:
import csv
import os
import pickle 
import string
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from wordcloud import WordCloud

nltk.download('punkt_tab')
wc=WordCloud(width=500,height=500,min_font_size=10,background_color='white')
ps=PorterStemmer()
vectorizer=TfidfVectorizer()

: 

In [None]:
# Load the tab-separated corpus: one message per line, "<label>\t<message>".
# quoting=csv.QUOTE_NONE makes pandas treat '"' as an ordinary character;
# with the default quoting a message containing an unbalanced double quote
# swallows the following line(s) into a single field and rows are silently lost.
# The column names are already set through `names`, so no re-assignment is needed.
df = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)
df.info()

In [None]:
# Encode the text label as an integer target: ham -> 0, spam -> 1.
df['label_num'] = df['label'].map(dict(ham=0, spam=1))
# Show five random rows to eyeball the new column.
df.sample(n=5)

In [None]:
# Class balance: number of messages per numeric label (0 = ham, 1 = spam).
df['label_num'].value_counts()

In [None]:
# Share of ham vs. spam messages.
# The wedge labels are taken from the counts' own index instead of a hard-coded
# ['ham', 'spam'] list: value_counts() orders by frequency, so a fixed list is
# only correct while ham happens to be the more frequent class.
label_counts = df['label'].value_counts()
plt.pie(label_counts, labels=label_counts.index, autopct="%0.2f")
plt.show()

In [None]:
# Simple size features per message: character count, token count, sentence count.
df['length'] = df['message'].map(len)
df['num_words'] = df['message'].map(lambda sms: len(nltk.word_tokenize(sms)))
df['num_sent'] = df['message'].map(lambda sms: len(nltk.sent_tokenize(sms)))

In [None]:
# Preview the first rows, now including the length / num_words / num_sent columns.
df.head()

In [None]:
# Summary statistics of the three size features over all messages.
df[['length','num_words','num_sent']].describe()

In [None]:
# Summary statistics of the size features for ham messages only (label_num == 0).
df.loc[df['label_num'] == 0, ['length', 'num_words', 'num_sent']].describe()

In [None]:
# Summary statistics of the size features for spam messages only (label_num == 1).
df.loc[df['label_num'] == 1, ['length', 'num_words', 'num_sent']].describe()

In [None]:
# Pairwise plots of the columns seaborn can plot, colored by class (label_num).
sns.pairplot(df,hue='label_num')

In [None]:
# Correlation matrix restricted to numeric columns (text columns are excluded).
df.select_dtypes(include=['number']).corr()

In [None]:
# Load the English stop-word list once. Looking it up inside the per-token loop
# re-reads the corpus for every token, which dominates the runtime of the
# transformation. A frozenset also gives constant-time membership tests.
try:
    STOP_WORDS = frozenset(stopwords.words('english'))
except LookupError:
    # The stopwords corpus has not been downloaded on this machine yet.
    nltk.download('stopwords')
    STOP_WORDS = frozenset(stopwords.words('english'))


def transform_sms(text):
    """Normalize one SMS message into a space-separated string of stems.

    Steps: lowercase the text, tokenize it with NLTK, keep only purely
    alphanumeric tokens, drop English stop words, and Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" if nothing survives).
    """
    tokens = nltk.word_tokenize(text.lower())
    # Alphanumeric tokens can never be punctuation, so no separate
    # string.punctuation check is required after the isalnum() filter.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in STOP_WORDS
    )

In [None]:
# Store the normalized form of every message (see transform_sms) in a new column.
df['transformed']=df['message'].apply(transform_sms)

In [None]:
# Preview the first rows, now including the 'transformed' column.
df.head()

In [None]:
# Word cloud of the normalized spam messages.
# WordCloud.generate() returns the generator object itself, so reusing the shared
# `wc` instance would make every cloud variable point at the same object and a
# later generate() call would overwrite this cloud. Build a dedicated instance
# (same settings as `wc`) so spam_wc keeps holding the spam cloud.
spam_text = df.loc[df['label_num'] == 1, 'transformed'].str.cat(sep=" ")
spam_wc = WordCloud(
    width=500, height=500, min_font_size=10, background_color='white'
).generate(spam_text)
plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

In [None]:
# Word cloud of the normalized ham messages.
# WordCloud.generate() returns the generator object itself, so calling it on the
# shared `wc` instance would overwrite any cloud previously generated from `wc`
# (and alias it to ham_wc). Build a dedicated instance with the same settings.
ham_text = df.loc[df['label_num'] == 0, 'transformed'].str.cat(sep=" ")
ham_wc = WordCloud(
    width=500, height=500, min_font_size=10, background_color='white'
).generate(ham_text)
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

In [None]:
# Features are the raw message texts; the target is the numeric label.
X, y = df['message'], df['label_num']

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Create a fresh TF-IDF vectorizer and fit it on the training messages only,
# so nothing from the held-out test messages influences the fitted vectorizer.
vectorizer=TfidfVectorizer()
X_train_vectorized=vectorizer.fit_transform(X_train)

In [None]:
# Bernoulli Naive Bayes: fit on the training matrix, evaluate on the held-out messages.
X_test_vectorized = vectorizer.transform(X_test)
model = BernoulliNB().fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)

print(confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Multinomial Naive Bayes: fit on the training matrix, evaluate on the held-out messages.
# Note: this rebinds `model`, so later cells see the multinomial classifier.
X_test_vectorized = vectorizer.transform(X_test)
model = MultinomialNB().fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)

print(confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Persist the fitted vectorizer and the current `model` next to the notebook.
for artifact_path, artifact in (('vectorizer.pkl', vectorizer), ('model.pkl', model)):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [None]:
@lru_cache(maxsize=8)
def _load_artifact(path, mtime_ns, size):
    """Unpickle the file at ``path``.

    ``mtime_ns`` and ``size`` are not used in the body; they are part of the
    cache key so that a file re-saved after retraining is loaded again instead
    of a stale cached object being returned.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only load
    # artifacts that were written by this notebook or another trusted source.
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


def _cached_artifact(path):
    """Return the unpickled object stored at ``path``, reusing a cached copy when unchanged."""
    absolute_path = os.path.abspath(path)
    file_stat = os.stat(absolute_path)
    return _load_artifact(absolute_path, file_stat.st_mtime_ns, file_stat.st_size)


def predict_spam_ham(text_message, vectorizer_path='vectorizer.pkl', model_path='model.pkl'):
    """Classify a single SMS message with the saved vectorizer and model.

    Parameters
    ----------
    text_message : str
        Raw message text to classify.
    vectorizer_path : str, optional
        Pickle file holding the fitted vectorizer (default ``'vectorizer.pkl'``).
    model_path : str, optional
        Pickle file holding the fitted classifier (default ``'model.pkl'``).

    Returns
    -------
    str
        ``"SPAM"`` if the model predicts label 1, otherwise ``"HAM"``.

    Raises
    ------
    FileNotFoundError
        If either artifact file does not exist.
    """
    # The artifacts are cached between calls, so repeated predictions (e.g. in
    # an interactive loop) do not re-read and unpickle both files every time.
    loaded_vectorizer = _cached_artifact(vectorizer_path)
    loaded_model = _cached_artifact(model_path)
    # The vectorizer expects an iterable of documents, hence the one-element list.
    input_vector = loaded_vectorizer.transform([text_message])
    prediction = loaded_model.predict(input_vector)[0]
    return "SPAM" if prediction == 1 else "HAM"

In [None]:
# Interactive demo: classify messages typed by the user until they enter 'exit'.
while True:
    try:
        user_input=input("Enter an SMS message (or type 'exit' to quit):\n")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: leave the loop
        # cleanly instead of ending the cell with a traceback.
        break
    command = user_input.strip()
    # Accept 'exit' regardless of case or surrounding whitespace.
    if command.lower() == 'exit':
        break
    # Nothing to classify in a blank line; prompt again.
    if not command:
        continue
    prediction = predict_spam_ham(user_input)
    print("Prediction:", prediction)