In [1]:
import pandas as pd

In [3]:
# Load the pre-computed feature table; besides the feature columns it
# carries a `label` column that is used as the classification target below.
df = pd.read_csv('Extracted_Features.csv')

# Sanity check: show the column index first, then the first five rows.
print(df.columns, df.head(), sep='\n')

Index(['email_len', 'num_uppercase_words', 'num_exclamations', 'num_links',
       'has_html', 'label'],
      dtype='object')
   email_len  num_uppercase_words  num_exclamations  num_links  has_html  \
0        113                    0                 0          0         0   
1         31                    0                 0          0         0   
2        157                    2                 0          0         0   
3         51                    0                 0          0         0   
4         63                    0                 0          0         0   

   label  
0      0  
1      0  
2      1  
3      0  
4      0  


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Target vector and feature matrix (every column except `label`).
y = df['label']
X = df.drop(columns='label')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gaussian Naive Bayes baseline; `fit` returns the fitted estimator itself.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# NOTE(review): the saved run below reports recall of 0.04 for class 1 on a
# 954/161 hold-out split, so overall accuracy mostly reflects the majority
# class. Consider a stratified split and/or a different model before relying
# on these predictions.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8609865470852018
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       954
           1       0.88      0.04      0.08       161

    accuracy                           0.86      1115
   macro avg       0.87      0.52      0.50      1115
weighted avg       0.86      0.86      0.80      1115



In [7]:
import re
def extract_features(email):
    """Compute the handcrafted features for a single message.

    Args:
        email: The raw message text as a ``str``.

    Returns:
        dict: Feature name -> integer value, with keys inserted in this order:
            ``email_len``            number of characters in the text,
            ``num_uppercase_words``  count of standalone runs of two or more
                                     ASCII capital letters,
            ``num_exclamations``     count of ``!`` characters,
            ``num_links``            count of ``http://`` / ``https://``
                                     occurrences,
            ``has_html``             1 if something shaped like ``<...>``
                                     appears anywhere in the text, else 0.
    """
    char_count = len(email)

    # Run the three pattern scans up front so the result can be assembled
    # as a single literal below.
    shouted_words = re.findall(r'\b[A-Z]{2,}\b', email)
    url_schemes = re.findall(r'http[s]?://', email)
    tag_match = re.search(r'<[^>]+>', email)

    # Key order is significant: callers turn this dict into a one-row
    # DataFrame, so it determines the column order.
    return {
        'email_len': char_count,
        'num_uppercase_words': len(shouted_words),
        'num_exclamations': email.count('!'),
        'num_links': len(url_schemes),
        'has_html': 0 if tag_match is None else 1,
    }

In [9]:
def predict_message(text):
    """Classify one message with the fitted notebook-level ``model``.

    The text is converted to features by ``extract_features``, wrapped in a
    one-row DataFrame, and passed to ``model.predict``.

    Args:
        text: The raw message text.

    Returns:
        str: ``"Spam"`` when the predicted label equals 1, otherwise ``"Ham"``.
    """
    single_row = pd.DataFrame([extract_features(text)])
    predicted_label = model.predict(single_row)[0]

    if predicted_label == 1:
        return "Spam"
    return "Ham"

# Example usage: classify two hand-written messages with the trained model.
sample_msg1 = "!Congratulations! You've won a $1000 gift card. Click here to claim now rahul!!!"
sample_msg2 = "!!save big #alert!!"

for tag, message in (("Message 1:", sample_msg1), ("Message 2:", sample_msg2)):
    print(tag, predict_message(message))

Message 1: Spam
Message 2: Ham
