In [12]:
# Install libraries (if not already installed)
!pip install pandas numpy

# Import libraries
import math

import pandas as pd
from nltk.corpus import stopwords  # Import stopwords for text preprocessing
from nltk.probability import LaplaceProbDist  # Import Laplace distribution for smoothing
from nltk.tokenize import word_tokenize  # Import word tokenizer




In [13]:
import nltk
# Fetch the NLTK data used by preprocess_text: the English stop-word list and
# the Punkt sentence models behind word_tokenize. Recent NLTK releases (3.9+)
# load 'punkt_tab' instead of the pickled 'punkt' models, so both are requested
# to keep the notebook working across NLTK versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(resource)

# Stop-word sets keyed by language, filled on first use so the corpus file is
# read once instead of once per email.
_STOP_WORDS_CACHE = {}


def _stop_word_set(language='english'):
  """Returns NLTK's stop words for `language` as a frozenset, loading them once."""
  if language not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
  return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
  """
  Preprocesses email text for better feature extraction.

  The text is lowercased, every non-alphanumeric character is turned into a
  space, the result is tokenized with NLTK's word_tokenize, and English stop
  words are dropped.

  Args:
      text (str): The email text to preprocess.

  Returns:
      list: A list of preprocessed tokens (words).
  """
  # Lowercase text
  text = text.lower()
  # Replace (rather than delete) punctuation and non-space whitespace with a
  # space. Deleting them glues neighbouring words together, e.g.
  # "win!!!click\nnow" would become the single token "winclicknow".
  text = ''.join(char if char.isalnum() else ' ' for char in text)
  # Tokenize once and drop stop words; set membership is O(1) per token.
  stop_words = _stop_word_set()
  return [word for word in word_tokenize(text) if word not in stop_words]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
from nltk import FreqDist

class NaiveBayes:
  """
  Multinomial Naive Bayes text classifier with Laplace (add-one) smoothing.

  All three attributes are None until fit() is called:
    classes: The distinct training labels, in first-seen order.
    class_priors: Maps each label to its share of the training labels.
    feature_probs: Maps each label to an NLTK LaplaceProbDist built from the
      word counts of that label's emails.
  """

  def __init__(self):
    self.classes = None
    self.class_priors = None
    self.feature_probs = None

  def fit(self, X, y):
    """
    Trains the Naive Bayes model on the given data.

    Args:
      X (iterable): Preprocessed email texts, each a list of tokens.
      y (iterable): Labels corresponding to each email text in X (spam/ham).
        Lists, tuples and pandas Series are all accepted.

    Raises:
      ValueError: If X and y have different lengths or are empty.
    """
    documents = list(X)
    labels = list(y)
    if len(documents) != len(labels):
      raise ValueError(
          "X and y must have the same length, got %d and %d."
          % (len(documents), len(labels)))
    if not labels:
      raise ValueError("Cannot fit NaiveBayes on empty training data.")

    # dict.fromkeys keeps first-seen order; set() ordering of strings changes
    # between interpreter runs, which made tie-breaking in predict() unstable.
    self.classes = list(dict.fromkeys(labels))
    # Count on the materialised list so plain lists work as well as Series.
    self.class_priors = {
        cls: labels.count(cls) / len(labels) for cls in self.classes}

    # Every class is smoothed over the same outcome space: the full training
    # vocabulary plus one bin standing in for words never seen in training.
    # This also keeps LaplaceProbDist valid for a class with no tokens at all.
    vocabulary = {word for document in documents for word in document}
    bins = len(vocabulary) + 1

    # Count occurrences of each word in each class
    self.feature_probs = {}
    for cls in self.classes:
      word_counts = FreqDist(
          word
          for document, label in zip(documents, labels) if label == cls
          for word in document)
      # Calculate probabilities with Laplace smoothing
      self.feature_probs[cls] = LaplaceProbDist(word_counts, bins=bins)

  def predict(self, X):
    """
    Predicts the class labels (spam/ham) for a list of new emails.

    Scores are accumulated as base-2 log-probabilities, so long emails do not
    underflow to zero. Ties go to the class seen first during fit().

    Args:
      X (list): A list of preprocessed email texts (list of tokens).

    Returns:
      list: A list of predicted class labels for each email in X.
    """
    predictions = []
    for data in X:
      class_posteriors = {}
      for cls in self.classes:
        word_dist = self.feature_probs[cls]
        # The prior is > 0 because every class occurs in the training labels;
        # log2 matches the base used by NLTK's ProbDistI.logprob().
        log_posterior = math.log2(self.class_priors[cls])
        # Unseen words get the smoothed (non-zero) Laplace probability.
        log_posterior += sum(word_dist.logprob(word) for word in data)
        class_posteriors[cls] = log_posterior
      predictions.append(max(class_posteriors, key=class_posteriors.get))
    return predictions


In [40]:
# Load data (point DATA_PATH at your copy of the spam dataset)
DATA_PATH = '/content/spam.csv'

# Rows with a missing message or label are dropped: preprocess_text() calls
# str methods on the message, and a missing label would become a class with no
# training emails. The messages are then replaced by their token lists.
data = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=['Message', 'Category'])
    .assign(Message=lambda frame: frame['Message'].apply(preprocess_text))
)

# Separate features (text) and target labels (spam/ham)
X = data['Message']
y = data['Category']

# Train the Naive Bayes model
nb_model = NaiveBayes()
nb_model.fit(X, y)

# Classify a single raw email with the trained model and report the result
def predict_spam(email_text):
  """
  Prints whether the given email looks like spam or ham.

  The raw text is run through preprocess_text() and the resulting token list
  is classified by the module-level nb_model. Nothing is returned; the verdict
  is written to standard output.

  Args:
      email_text (str): The raw email text to classify.
  """
  tokens = preprocess_text(email_text)
  # predict() takes a batch, so wrap the single email and unwrap its label.
  label = nb_model.predict([tokens])[0]
  verdict = (
      "This email is likely spam."
      if label == 'spam'
      else "This email is likely ham."
  )
  print(verdict)

# Example usage with a new email.
# predict_spam() prints its verdict instead of returning it, so the result
# appears as this cell's printed output.
new_email = "This is a FREE offer for you! Click here to win $1 million!!! urgency"
predict_spam(new_email)


This email is likely spam.
