In [22]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [3]:
df = pd.read_csv("SMSSpamCollection", sep='\t', names=["label", "message"])

In [5]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
stemmer = PorterStemmer()
stop_word = set(stopwords.words('english'))

def clean_text(msg):
    msg = msg.lower()
    msg = ''.join([char for char in msg if char not in string.punctuation])
    tokens = msg.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_word]
    return " ".join(tokens)

In [16]:
df['cleaned_message'] = df['message'].apply(clean_text)

print(df[['message', 'cleaned_message']].head())

                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned_message  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri 2 wkli comp win fa cup final tkt 21...  
3                u dun say earli hor u c alreadi say  
4          nah dont think goe usf live around though  


In [18]:
vectorizer = TfidfVectorizer(max_features=3000)

X = vectorizer.fit_transform(df['cleaned_message']).toarray()

y = df['label'].map({'ham':0, 'spam':1}).values

print(f"Feature matrix shape: {X.shape}")

Feature matrix shape: (5572, 3000)


In [23]:
X_train, X_test,y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
model = MultinomialNB()

model.fit(X_train,y_train)
y_pred = model.predict(X_test)


print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))


✅ Accuracy: 0.9811659192825112

📊 Confusion Matrix:
 [[966   0]
 [ 21 128]]

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.86      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

