In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import logging



**Logging Setup**

In [None]:
# Send DEBUG-and-above records from the root logger to a local log file.
# basicConfig() does nothing when the root logger already has handlers, which
# can be the case inside a hosted notebook kernel; force=True (Python 3.8+)
# removes any existing root handlers first, so the file handler is always
# installed and re-running this cell stays idempotent.
logging.basicConfig(
    filename="spam_detection_debug.log",
    level=logging.DEBUG,
    format="%(asctime)s - %(message)s",
    force=True,
)


Defining Spammy Words and Characters

In [None]:
# Phrases whose relative frequency becomes one feature each.
# The ORDER of this list fixes the position of every word feature in the
# vector built by extract_features_from_email, so do not reorder it.
spammy_words = [
    # promotions
    "free", "win", "offer", "discount", "bonus", "trial", "prize",
    # urgency / calls to action
    "exclusive", "limited time", "best deal", "hurry", "urgent",
    "act now", "last chance", "click here", "apply now",
    "don't miss out", "get it now",
    # money and guarantees
    "money", "cash", "investment", "credit", "loan", "insurance",
    "save", "guaranteed", "income", "100%", "risk-free", "no fees",
    "winner", "no obligation", "instant",
    # account / credential bait
    "verify", "secret", "unsubscribe", "password", "account",
    "update", "secure", "reset",
]

# Single characters whose relative frequency becomes one feature each
# (same ordering caveat as above).
spammy_chars = [
    ';', '!', '$', '%', '#',
    '£', '€', '₹',
    '@',
]


Loading the Dataset

In [None]:
# Location of the raw data file (absolute path on the notebook host).
file_path = "/content/spambase.data"

# 56 generic feature names followed by the label column: 57 names in total.
# NOTE(review): if the file has more columns than names (the UCI Spambase
# file is documented as 57 attributes + 1 label), pandas would use the
# leading column as the index instead of a feature. Confirm the column count.
column_names = [f"feature_{i}" for i in range(1, 57)] + ["spam_label"]

# The file has no header row, so the names above are applied directly.
df = pd.read_csv(file_path, names=column_names, header=None)
logging.info("Dataset loaded successfully.")


Splitting the Data

In [None]:
# Separate the target column from the predictor columns.
y = df["spam_label"]
X = df.drop("spam_label", axis=1)

# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


Training the Random Forest Model

In [None]:
# Random forest: the model check_email_from_text uses when use_rf=True.
# Seeded so repeated runs build the same forest.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Multinomial Naive Bayes bound to `model`, the name check_email_from_text
# falls back to when use_rf=False. Without this binding the default call path
# raises NameError on a fresh kernel.
# NOTE(review): MultinomialNB rejects negative feature values; this assumes
# every column of X_train is non-negative. Confirm against the dataset.
model = MultinomialNB()
model.fit(X_train, y_train)


Feature Extraction from Raw Email

In [None]:
def extract_features_from_email(email, words=None, chars=None, n_features=56):
    """
    Extract numerical features from a raw email for prediction.

    The vector is laid out as: one frequency per phrase in ``words``, one
    frequency per character in ``chars``, then five metadata counts (word
    count, character count, all-caps word count, '!' count, '?' count).
    It is zero-padded or truncated to exactly ``n_features`` entries.

    Parameters
    ----------
    email : str
        Raw email text.
    words : sequence of str, optional
        Phrases to count in the lower-cased text. Defaults to the
        module-level ``spammy_words``.
    chars : sequence of str, optional
        Characters to count in the lower-cased text. Defaults to the
        module-level ``spammy_chars``.
    n_features : int, optional
        Length of the returned vector (default 56).

    Returns
    -------
    list
        ``n_features`` numbers: floats for the frequencies, ints for the
        metadata counts and for any zero padding.
    """
    if words is None:
        words = spammy_words
    if chars is None:
        chars = spammy_chars

    # Count all-caps tokens on the ORIGINAL text. After lower-casing, no token
    # can satisfy str.isupper(), so counting afterwards always yields 0.
    uppercase_word_count = sum(1 for token in email.split() if token.isupper())

    text = email.lower()
    word_count = len(text.split())
    char_count = len(text)

    # "or 1" avoids division by zero for empty / whitespace-only input.
    word_denominator = word_count or 1
    char_denominator = char_count or 1

    # str.count is a substring count, so e.g. "win" is also counted inside
    # "winner"; multi-word phrases are matched the same way.
    word_freq = [text.count(phrase) / word_denominator for phrase in words]
    char_freq = [text.count(symbol) / char_denominator for symbol in chars]

    metadata = [
        word_count,            # Word count
        char_count,            # Character count
        uppercase_word_count,  # Count of all-caps words
        text.count("!"),       # Count of '!'
        text.count("?"),       # Count of '?'
    ]

    feature_vector = word_freq + char_freq + metadata
    # Pad with zeros when short (a negative repeat count yields an empty list),
    # then truncate when long, so the width is always n_features.
    feature_vector += [0] * (n_features - len(feature_vector))
    return feature_vector[:n_features]

Spam Prediction Function

In [None]:
# Classify a raw email string with one of the notebook's trained models
def check_email_from_text(email, use_rf=False):
    """
    Classify raw email text as "Spam" or "Not Spam".

    The text is converted to a feature vector by extract_features_from_email,
    wrapped in a one-row DataFrame labelled with the first 56 column names of
    the global ``X``, and scored by ``rf_model`` (when ``use_rf`` is true) or
    by the global ``model`` otherwise. Both names must already exist in the
    notebook namespace. The feature vector and class probabilities are logged
    at INFO level.

    Returns "Spam" when the predicted label equals 1, else "Not Spam".
    """
    feature_values = extract_features_from_email(email)
    feature_frame = pd.DataFrame([feature_values], columns=X.columns[:56])

    # Resolve the classifier only after feature extraction, as before.
    if use_rf:
        classifier = rf_model
    else:
        classifier = model

    class_probabilities = classifier.predict_proba(feature_frame)
    predicted_labels = classifier.predict(feature_frame)

    logging.info(f"Extracted features: {feature_values}")
    logging.info(f"Prediction probabilities: {class_probabilities}")

    if predicted_labels[0] == 1:
        return "Spam"
    return "Not Spam"

Testing the System with User Input

In [25]:
# Test the system with a user input email
# Interactive check: input() blocks until a line is typed, so a full
# "Run All" pauses at this cell and the result depends on what is entered.
print("Enter the raw text of your email:")
email = input()
# use_rf=True makes check_email_from_text score the text with rf_model
# instead of its default model.
result = check_email_from_text(email, use_rf=True)
print(f"The email is predicted to be: {result}")

Enter the raw text of your email:
It is emergency. I need your help
The email is predicted to be: Spam
