In [1]:
#Step 1: Import Required libraries

# Standard library
import os

# Data handling
import pandas as pd

# Train-test split
from sklearn.model_selection import train_test_split

# Text to number conversion
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning model
from sklearn.naive_bayes import MultinomialNB

# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
#TF-IDF -> Converts SMS text to numbers
#Naive Bayes -> very good for spam detection

In [7]:
#Step 2: Load Dataset

# Location of the spam CSV. Set the SPAM_CSV_PATH environment variable to
# point at your own copy of the file; when it is not set, the path this
# notebook was originally run with is used as the fallback.
SPAM_CSV_PATH = os.environ.get(
    'SPAM_CSV_PATH',
    r'C:\Users\DeLL\Downloads\archive (2)\spam.csv',
)

# Read the file (latin-1 encoded), keep only the two required columns and
# give them descriptive names. Doing this in a single expression means the
# cell can be re-run at any time and always rebuilds sms_data from the file.
sms_data = (
    pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')
    [['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

sms_data.head()



Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#Dataset meaning:
#label ||	message
#spam->	Win a free prize
#ham->	Hi, how are you

In [9]:
#Step 3:Clean the Data

# Convert labels to numeric form: spam -> 1, ham -> 0
LABEL_TO_ID = {'spam': 1, 'ham': 0}

# Only map while the column still holds the text labels. Without this guard,
# re-running the cell would look up the already-converted 1/0 values in
# LABEL_TO_ID, find no match, and replace every label with NaN.
if not pd.api.types.is_numeric_dtype(sms_data['label']):
    sms_data['label'] = sms_data['label'].map(LABEL_TO_ID)

print(sms_data['label'].value_counts())


label
0    4825
1     747
Name: count, dtype: int64


In [10]:
# Machines understand numbers, not words
# 1 = spam, 0 = not spam (ham)

In [11]:
#Step 4: Split input & output

# Model input is the message text; model output is the label column
X, y = sms_data['message'], sms_data['label']


In [12]:
#Step 5 : Train-Test Split

# Hold out 25% of the messages for testing; the fixed random_state keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Report how many messages ended up on each side of the split
for split_name, split_messages in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} messages:", len(split_messages))



Training messages: 4179
Testing messages: 1393


In [13]:
#Explanation:

#75% data → learning

#25% data → testing

In [14]:
#Step 6: Convert Text to numbers (TF-IDF)

# Build the vectorizer: drop English stop words and ignore terms that appear
# in more than 90% of the documents
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)

# Fit on the training messages only, then transform them
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test messages with the already-fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)


In [15]:
#Removes useless words like is, the, and

#Keeps important words like win, free, prize

In [16]:
#Step 7: Train the model

# Build the Naive Bayes classifier and fit it on the TF-IDF training features
# in one step (fit returns the fitted estimator itself)
spam_model = MultinomialNB().fit(X_train_tfidf, y_train)

print("Model training completed âœ…")


Model training completed âœ…


In [17]:
#Step 8: evaluate the model

# Classify the held-out test messages with the trained model
predictions = spam_model.predict(X_test_tfidf)

# Compute every metric first, then print them together below
test_accuracy = accuracy_score(y_test, predictions)
per_class_report = classification_report(y_test, predictions)
error_matrix = confusion_matrix(y_test, predictions)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(per_class_report)
print("Confusion Matrix:\n")
print(error_matrix)


Accuracy: 0.9662598707824839

Classification Report:

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1202
           1       1.00      0.75      0.86       191

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.96      1393

Confusion Matrix:

[[1202    0]
 [  47  144]]


In [18]:
#Accuracy usually 95%+
#Confusion matrix shows:
#Correct spam
#Correct ham
#Wrong predictions

In [19]:
#Step 9:Test with new SMS

# Messages to classify; add more strings to this list to check several at once
new_sms = [
    "Congratulations! You have won a free mobile recharge"
]

# Reuse the vectorizer fitted on the training data so the new messages are
# encoded with the same vocabulary the model was trained on
new_sms_tfidf = vectorizer.transform(new_sms)
result = spam_model.predict(new_sms_tfidf)

# Print one verdict per message, in the same order as new_sms, so that no
# prediction is silently dropped when the list holds more than one message
for predicted_label in result:
    if predicted_label == 1:
        print("ðŸš« SPAM message")
    else:
        print("âœ… NOT SPAM")


ðŸš« SPAM message
