In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score

# 1. Data Loading and Cleaning

In [2]:

# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
df.head() # Check the columns

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Drop unnecessary columns
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True) 

In [6]:
# Rename columns
df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)
df.sample(5)

Unnamed: 0,target,text
2623,ham,I'm coming home 4 dinner.
3187,spam,This is the 2nd time we have tried 2 contact u...
286,ham,Ok..
3359,ham,Please attend the phone:)
3940,spam,"Free Msg: get Gnarls Barkleys \Crazy\"" rington..."


In [7]:
# Encode target variable
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

In [8]:
df.head() 

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# missing values
df.isnull().sum()

target    0
text      0
dtype: int64

In [10]:
# check for duplicate values
df.duplicated().sum()

np.int64(403)

In [11]:
# Remove duplicates
df = df.drop_duplicates(keep='first')

In [12]:
df.duplicated().sum()

np.int64(0)

In [13]:
df.shape

(5169, 2)

# 2. CountVectorizer fot Text

In [14]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Convert text data into a bag-of-words model  
vectorizer = CountVectorizer()  

# Convert text data into a TF-IDF representation with a max of 3000 features  
tfidf = TfidfVectorizer(max_features=3000)  

In [15]:
X = tfidf.fit_transform(df['text']).toarray()
y = df['target'].values

In [16]:
X.shape

(5169, 3000)

# 3. Train-Test Split


In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)  # Divide data into 2 parts 80% for training and 20% for test

# 4. Model Training

In [18]:
model = MultinomialNB()
model.fit(X_train, y_train)

# 5. Model Evaluation

In [19]:
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.4f}  [Spam detection reliability]")

Model Accuracy: 0.9729
Precision: 1.0000  [Spam detection reliability]


# 6. Prediction Function

In [20]:
def predict_message(message):
    message_vector = tfidf.transform([message])
    prediction = model.predict(message_vector)[0]
    return {
        'prediction': 'Spam' if prediction == 1 else 'Ham',
        'probability': model.predict_proba(message_vector)[0].max()  # Confidence score
    }

# Example of spam and Ham 

In [21]:

# Input: 
user_message = "Free lottery! Claim your prize now!"
prediction = predict_message(user_message)
print(prediction)  # Output: "Spam"

# Input: 
user_message = "Hi, how are you doing?"
prediction = predict_message(user_message)
print(prediction)  # Output: "Ham"

{'prediction': 'Spam', 'probability': np.float64(0.9433219659925338)}
{'prediction': 'Ham', 'probability': np.float64(0.9950478804576586)}


# 7. Interactive Prediction

In [22]:
if __name__ == "__main__":
    print("\nTry the spam detector! (Type 'quit' to exit)")
    while True:
        user_message = input("\nEnter a message: ")
        if user_message.lower() == 'quit':
            break
        result = predict_message(user_message)
        print(
            f"Prediction: {result['prediction']} | "
            f"Confidence: {result['probability']:.2%}"
        )


Try the spam detector! (Type 'quit' to exit)



Enter a message:  "Thanks for your email, I'll reply soon."


Prediction: Ham | Confidence: 94.72%



Enter a message:   "You've won $1,000,000! Claim now!"


Prediction: Spam | Confidence: 83.80%



Enter a message:  quit
