Importing the Dependencies

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

Data Collection & Pre-Processing

In [3]:
# Load the dataset
data = pd.read_csv('mail_data.csv')

# Split the data into features and labels
X = data['Message']
y = data['Category']

Splitting the data into training data & test data

In [4]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: Message, Length: 4457, dtype: object

Feature Extraction

In [4]:
# Convert the text data into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

Naive Bayes algorithm

In [5]:
class NaiveBayes:
    def __init__(self):
        self.prior = {}
        self.conditional = {}

    def fit(self, X, y, alpha=1.0):
        n_samples = X.shape[0]
        n_features = X.shape[1]
        
        # Calculate prior probabilities
        unique_classes, class_counts = np.unique(y, return_counts=True)
        for i, class_label in enumerate(unique_classes):
            self.prior[class_label] = (class_counts[i] + alpha) / (n_samples + alpha * len(unique_classes))

        # Calculate conditional probabilities
        for feature_index in range(n_features):
            feature_name = vectorizer.get_feature_names_out()[feature_index]
            self.conditional[feature_name] = {}
            for i, class_label in enumerate(unique_classes):
                class_samples = X[y == class_label]
                self.conditional[feature_name][class_label] = (
                    class_samples[:, feature_index].sum() + alpha
                ) / (class_counts[i] + alpha * n_features)

    def predict(self, X):
        y_pred = []
        for sample in X:
            probabilities = {}
            for class_label in self.prior:
                probabilities[class_label] = np.log(self.prior[class_label])
                for feature_index, feature_value in enumerate(sample):
                    if feature_index in self.conditional and feature_value > 0:
                        probabilities[class_label] += np.log(self.conditional[feature_index][class_label])
            y_pred.append(max(probabilities, key=probabilities.get))
        return y_pred


Training model

In [6]:
# Train a Naive Bayes classifier
model = NaiveBayes()
model.fit(X_train_vectorized.toarray(), y_train)

In [7]:
# Predict the labels for the test set
y_pred = model.predict(X_test_vectorized.toarray())
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8663677130044843


In [8]:
with open('test.txt', 'r') as file:
    input_mail = [file.read()]

# Convert text to feature vectors
input_data_features = vectorizer.transform(input_mail)

# Making prediction
prediction = model.predict(input_data_features)
print(prediction)

['ham']
