In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.metrics import confusion_matrix,classification_report
import matplotlib.pyplot as plt

# Load the data
data = pd.read_csv('/content/spam_ham_dataset.csv')

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# Vectorize the text data using CountVectorizer,CountVectorizer takes a corpus of text and converts it into a
#numerical representation that can be used as input to machine learning models. It works by first tokenizing
# the text into words or n-grams, and then counting the frequency of each word or n-gram in each document.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
#print(type(X_train_vectorized))
# Train the MultinomialNB Naive Bayes classifier,This classifier is suitable for discrete data, such as word counts,
#it calculates the probability of each class given the word frequency.
clf = MultinomialNB()
clf.fit(X_train_vectorized, y_train)

# Evaluate the performance of the classifier
X_test_vectorized = vectorizer.transform(X_test)
y_pred=clf.predict(X_test_vectorized)
#print(type(y_pred),type(y_test.values))
#y_pred=y_pred.reshape(-1,1)

In [7]:
#Printing the confusion matrix
cm=confusion_matrix(y_pred,y_test)
print(cm)

[[729  16]
 [ 13 277]]


In [8]:
#Accuracy
a1=cm.shape
c1=0
w1=0
for i in range (a1[0]):
    for j in range(a1[1]):
        if i==j:
            c1+=cm[i,j]
        else:
            w1+=cm[i,j]
print("Correct classification ",c1,"Incorrect classification ",w1)
print("Accuracy of the classification is:",(c1/(c1+w1))*100)

Correct classification  1006 Incorrect classification  29
Accuracy of the classification is: 97.19806763285024


In [9]:
#Evaluating the model using F1 score, Precision, recall
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       745
        spam       0.95      0.96      0.95       290

    accuracy                           0.97      1035
   macro avg       0.96      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035

