Python Implementation of the Naive Bayes Algorithm

In [24]:
import zipfile

# Unpack the archive at `zip_file_path` into the directory `extraction_path`.
zip_file_path = '/content/archive.zip'
extraction_path = '/content'

# The context manager closes the archive once extraction has finished.
with zipfile.ZipFile(zip_file_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extraction_path)

print(f'Files extracted to: {extraction_path}')


Files extracted to: /content


In [46]:
# `nltk` is imported here because this cell comes before the notebook's main
# import cell; without it a fresh kernel fails with NameError on this line.
import nltk

# Fetch the stop-word corpus read by `stopwords.words('english')` further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [84]:
#preprocessing the data
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import classification_report as cr
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

class TextPreprocessor:
    """Load, clean, encode and split a labelled text dataset.

    The stages are meant to be called in order: ``load_data`` ->
    ``clean_text`` -> ``encode_labels`` -> ``split_data``. Each stage stores
    its result on the instance (``texts``, ``labels``, ``X_train`` ...).
    """

    def __init__(self, filepath):
        """Store the CSV location and build the text-cleaning tools.

        Args:
            filepath: Path of the CSV file that ``load_data`` reads.
        """
        self.filepath = filepath
        self.data = None
        self.labels = None
        self.texts = None
        # English stop words to drop and the stemmer applied to kept words.
        self.stop_words = set(stopwords.words('english'))
        self.ps = PorterStemmer()
        self.clf = MultinomialNB()

    def basic_tokenize(self, text):
        """Split ``text`` on runs of whitespace and return the pieces."""
        return text.split()

    def load_data(self):
        """Read the CSV and populate ``self.data``, ``self.labels``, ``self.texts``.

        Labels come from column ``v1`` and messages from column ``v2``.

        Raises:
            KeyError: If the file has no ``v1`` or ``v2`` column.
        """
        data = pd.read_csv(self.filepath, encoding='latin-1')

        # Drop the unneeded extra columns when present; errors='ignore' lets
        # a file that does not contain them load as well.
        data_cleaned = data.drop(
            columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
            errors='ignore',
        )

        self.data = data_cleaned
        self.labels = data_cleaned['v1'].values
        self.texts = data_cleaned['v2'].values

    def preprocess_text(self, text):
        """Clean one message the way the training texts are cleaned.

        Steps: lower-case, delete every character that is neither a word
        character nor whitespace, split on whitespace, drop stop words,
        stem the remaining words and join them with single spaces.

        Args:
            text: The raw message. ``None``/NaN is treated as an empty
                message; any other non-string value is converted with ``str``.

        Returns:
            The cleaned message as a single string.
        """
        if not isinstance(text, str):
            # Missing cells arrive as None/NaN; they must not crash .lower().
            text = '' if pd.isna(text) else str(text)

        text = re.sub(r'[^\w\s]', '', text.lower())
        tokens = self.basic_tokenize(text)
        return ' '.join(
            self.ps.stem(word) for word in tokens if word not in self.stop_words
        )

    def clean_text(self):
        """Replace ``self.texts`` with a list of cleaned messages."""
        self.texts = [self.preprocess_text(text) for text in self.texts]

    def encode_labels(self):
        """Replace ``self.labels`` with integers: ``'ham'`` -> 0, ``'spam'`` -> 1.

        Raises:
            KeyError: For any other label value. This includes calling the
                method a second time, because the labels are then integers.
        """
        label_mapping = {'ham': 0, 'spam': 1}
        self.labels = [label_mapping[label] for label in self.labels]

    def tokenize_bow(self):
        """Return a dense bag-of-words matrix of ``self.texts``.

        A new ``CountVectorizer`` is fitted on every call.
        """
        vectorizer_bow = CountVectorizer()
        bow_representation = vectorizer_bow.fit_transform(self.texts)
        return bow_representation.toarray()

    def split_data(self, test_size=0.2):
        """Split the data into training and test sets.

        The split is stratified on the labels and uses a fixed random state
        so repeated runs give the same partition.

        Args:
            test_size: Passed through to ``train_test_split``.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)``, which is also
            stored on the instance under those names.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.texts,
            self.labels,
            test_size=test_size,
            stratify=self.labels,
            random_state=42  # Setting a random state for reproducibility
        )
        return self.X_train, self.X_test, self.y_train, self.y_test


class TextClassifier(TextPreprocessor):
    """Multinomial Naive Bayes classifier on bag-of-words features.

    Adds training, prediction and evaluation on top of the data preparation
    inherited from ``TextPreprocessor``.
    """

    def __init__(self, filepath):
        """Set up the preprocessor plus a fresh model and vectorizer.

        Args:
            filepath: Path of the CSV file that ``load_data`` reads.
        """
        super().__init__(filepath)
        self.clf = MultinomialNB()
        self.vectorizer = CountVectorizer()

    def train_classifier(self):
        """Train the Multinomial Naive Bayes classifier.

        Fits the vectorizer on ``self.X_train`` and the model on the
        resulting counts, so ``split_data`` must have been called first.
        """
        X_train_bow = self.vectorizer.fit_transform(self.X_train)
        self.clf.fit(X_train_bow, self.y_train)

    def predict(self, texts=None, preprocess=False):
        """Predict the labels for the provided texts.

        Args:
            texts: Messages to classify. When omitted, the stored test split
                ``self.X_test`` is used unchanged.
            preprocess: Set to True when ``texts`` are raw messages, so they
                receive the same cleaning as the training data before being
                vectorized. Leave False for texts that are already cleaned
                (such as slices of ``self.X_test``). Ignored when ``texts``
                is omitted.

        Returns:
            Whatever the underlying model's ``predict`` returns for the
            vectorized texts (the encoded labels).
        """
        if texts is None:
            texts = self.X_test
        elif preprocess:
            texts = [self.preprocess_text(text) for text in texts]

        texts_bow = self.vectorizer.transform(texts)
        return self.clf.predict(texts_bow)

    def evaluate(self):
        """Evaluate the classifier's performance on the stored test split.

        Returns:
            The tuple ``(accuracy, classification_report, confusion_matrix)``
            computed from ``self.y_test`` and the predictions for
            ``self.X_test``.
        """
        y_pred = self.predict()

        accuracy = accuracy_score(self.y_test, y_pred)
        classification_rep = cr(self.y_test, y_pred)
        conf_matrix = confusion_matrix(self.y_test, y_pred)

        return accuracy, classification_rep, conf_matrix

    def decode_labels(self, encoded_labels):
        """Map encoded labels back to text: 0 -> ``'ham'``, 1 -> ``'spam'``.

        Args:
            encoded_labels: Iterable of encoded labels.

        Returns:
            A list with the decoded label for every input element.

        Raises:
            KeyError: If a value is neither 0 nor 1.
        """
        reverse_mapping = {0: 'ham', 1: 'spam'}
        return [reverse_mapping[label] for label in encoded_labels]


In [88]:
# Build the classifier for the CSV dataset.
classifier = TextClassifier('/content/spam.csv')

# Run the pipeline stages in order: load, clean, encode labels, split, train.
pipeline_stages = (
    classifier.load_data,
    classifier.clean_text,
    classifier.encode_labels,
    classifier.split_data,
    classifier.train_classifier,
)
for stage in pipeline_stages:
    stage()

# Score the trained model on the test split and report the metrics.
accuracy, classification_rep, conf_matrix = classifier.evaluate()

print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 98.03%

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.97      0.88      0.92       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
 [[962   4]
 [ 18 131]]


In [91]:
# Classify a handful of hand-written messages with the trained model.
# To score held-out data instead, pass `classifier.X_test[:10]` straight to
# `classifier.predict`: those messages went through `clean_text` before the
# split, so they must not be cleaned a second time.
random_test_sentences = [
    "You've won a million dollars! Click the link to claim your prize now!",
    "Hi there, this is a friendly reminder about our meeting tomorrow at 3 PM.",
    "Your Amazon order #123456 is on its way. Track your shipment here: amazon.com/track",
    "URGENT: Your bank account has been locked. Please call our customer service to resolve this issue.",
    "Congratulations! You are the 100,000th visitor to our website. Claim your free gift!",
    "Your Apple ID has been compromised. Click this link to verify your account.",
    "Dear customer, your subscription will expire in 3 days. Renew now to continue using our service.",
    "You've been selected for a job interview. Please confirm your availability.",
    "Get a free iPhone by participating in our survey!"
]
sample_texts = random_test_sentences

# The vectorizer was fitted on cleaned text (lower-cased, punctuation removed,
# stop words dropped, words stemmed). Raw sentences need the same cleaning
# before prediction, otherwise words such as "congratulations" never match
# their stemmed form in the learned vocabulary.
# A throwaway TextPreprocessor reuses `clean_text` without reading any file.
sample_cleaner = TextPreprocessor(filepath=None)
sample_cleaner.texts = sample_texts
sample_cleaner.clean_text()
predicted_labels = classifier.predict(sample_cleaner.texts)

# Map the numeric predictions back to 'ham' / 'spam'.
decoded_sample = classifier.decode_labels(predicted_labels)

# Show each original (uncleaned) message next to its predicted label.
for sample_text, predicted_label in zip(sample_texts, decoded_sample):
    print("\nSample Text:", sample_text)
    print("Predicted Label (Decoded):", predicted_label)



Sample Text: You've won a million dollars! Click the link to claim your prize now!
Predicted Label (Decoded): spam

Sample Text: Hi there, this is a friendly reminder about our meeting tomorrow at 3 PM.
Predicted Label (Decoded): ham

Sample Text: Your Amazon order #123456 is on its way. Track your shipment here: amazon.com/track
Predicted Label (Decoded): ham

Sample Text: URGENT: Your bank account has been locked. Please call our customer service to resolve this issue.
Predicted Label (Decoded): spam

Sample Text: Congratulations! You are the 100,000th visitor to our website. Claim your free gift!
Predicted Label (Decoded): spam

Sample Text: Your Apple ID has been compromised. Click this link to verify your account.
Predicted Label (Decoded): ham

Sample Text: Dear customer, your subscription will expire in 3 days. Renew now to continue using our service.
Predicted Label (Decoded): ham

Sample Text: You've been selected for a job interview. Please confirm your availability.
Predict