In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
spam_df = pd.read_csv("spam.csv")
spam_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [20]:
spam_df.groupby( "Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [21]:
#Creating spam column
spam_df["spam"] = spam_df["Category"].apply(lambda x: 1 if x.lower() == "spam" else 0)
spam_df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [22]:
#split train,test data
#x = message,y = spam(0,1)
#we can define test , train data size
x_train,x_test,y_train,y_test = train_test_split(spam_df.Message,spam_df.spam,test_size=0.25,random_state=42)


In [23]:
x_train.describe()

count                       4179
unique                      3915
top       Sorry, I'll call later
freq                          19
Name: Message, dtype: object

In [24]:
#counting words
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4179, 7489))

In [25]:
#Model applying naive bayes of splited data(training model)
model = MultinomialNB()
model.fit(x_train_count,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [26]:
#validate
ham_email = [" hi good morning"]
ham_email_count = cv.transform(ham_email)
model.predict(ham_email_count)

array([0])

In [27]:
#validate
spam_email = ["free gift click here"]
spam_email_count = cv.transform(spam_email)
model.predict(spam_email_count)

array([1])

In [28]:
x_test_count = cv.transform(x_test)
model.score(x_test_count,y_test)

0.9885139985642498

In [29]:
x=input("Enter the message:")
x_count = cv.transform([x])
if model.predict(x_count)[0]== 0:
    print("Not Spam Mail")
else:
    print("Spam Mail")
prob = model.predict_proba(x_count)
print(prob)

Spam Mail
[[0.39787854 0.60212146]]


In [30]:
x

'free'

In [31]:
spam_df.columns

Index(['Category', 'Message', 'spam'], dtype='object')

##Pipeline Implementing##

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Load dataset
# Assuming your dataset has columns: "label" (spam/ham) and "message"
data = pd.read_csv("spam.csv", encoding='latin-1')

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'], data['Category'], test_size=0.2, random_state=42
)

# Build pipeline
spam_clf = Pipeline([
    ('vect', CountVectorizer()),          # Step 1: Convert text to word counts
    ('tfidf', TfidfTransformer()),        # Step 2: Transform counts to TF-IDF
    ('clf', MultinomialNB()),             # Step 3: Apply Naive Bayes classifier
])

# Train model
spam_clf.fit(X_train, y_train)

# Predictions
y_pred = spam_clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9632286995515695

Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

