In [20]:
import pandas as pd

# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Select relevant columns and rename them for clarity
df = df[['v1', 'v2']]
df.columns = ['label', 'message']
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
import string
from sklearn.preprocessing import LabelEncoder

# Convert labels to binary
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Clean the text
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    return text

df['message'] = df['message'].apply(preprocess_text)
df.head()


Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert messages into TF-IDF features
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['message']).toarray()
y = df['label']
print(X.shape)
print(y.shape)


(5572, 3000)
(5572,)


In [23]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.972488038277512
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1453
           1       1.00      0.79      0.88       219

    accuracy                           0.97      1672
   macro avg       0.98      0.89      0.93      1672
weighted avg       0.97      0.97      0.97      1672

[[1453    0]
 [  46  173]]


In [25]:
# Sample input
new_message = "Congratulations! You've won a free vacation to the Bahamas!"

# Preprocess the new message
new_message_processed = preprocess_text(new_message)

# Transform the new message into TF-IDF features
new_message_tfidf = tfidf.transform([new_message_processed]).toarray()

# Predict using the trained model
prediction = model.predict(new_message_tfidf)

# Interpret the prediction
if prediction[0] == 0:
    print("The message is predicted as HAM (not spam).")
else:
    print("The message is predicted as SPAM.")


The message is predicted as SPAM.
