<a href="https://colab.research.google.com/github/VarunShivaram56/colab_aiml_lab/blob/main/aiml_lab_2_051124.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Step 1: Data collection (simulated).
# A production system would pull messages from an email API; for this
# demonstration a small hand-written, labelled sample is used instead.
# Each record pairs the raw email text with its label (1 = Spam, 0 = Not Spam).
labelled_emails = [
    ("Congratulations! You've won a $1000 Walmart gift card. Click to claim!", 1),
    ("Please confirm your account information to avoid suspension.", 1),
    ("Are you free for a meeting at 10 am tomorrow?", 0),
    ("Important update regarding your bank account.", 1),
    ("Exclusive offer just for you. Claim your prize now!", 1),
    ("Here is the report from last month's analysis.", 0),
    ("Friendly reminder: Meeting rescheduled to next Monday.", 0),
    ("Urgent: Your account has been compromised. Click here to secure it.", 1),
]

# Unzip the records into the column-oriented dict the rest of the notebook uses.
email_texts, spam_flags = zip(*labelled_emails)
data = {
    'Email_Content': list(email_texts),
    'Label': list(spam_flags),
}

# Tabular view of the sample: one row per email, columns in the order above.
df = pd.DataFrame(data)

In [3]:
# Step 2: Text Preprocessing
# Cleaning and converting email content to lowercase
def preprocess_text(text):
    """Normalise one email body for vectorisation.

    Every character that is not an ASCII letter or whitespace is deleted
    (not replaced by a space, so "You've" becomes "Youve"), and the
    remainder is lowercased.

    Args:
        text: Raw email text as a ``str``.

    Returns:
        The cleaned, lowercased string.
    """
    non_letter = re.compile(r'[^a-zA-Z\s]')
    letters_only = non_letter.sub('', text)
    return letters_only.lower()

# Clean every email body, then overwrite the column with the cleaned text.
cleaned_content = df['Email_Content'].apply(preprocess_text)
df['Email_Content'] = cleaned_content


In [4]:
# Step 3: Feature Engineering with TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
X = vectorizer.fit_transform(df['Email_Content'])
y = df['Label']

In [5]:
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [6]:
# Step 5: Fit a multinomial Naive Bayes classifier on the training features.
model = MultinomialNB()
# Left as the last expression so the cell still displays the fitted estimator.
model.fit(X=X_train, y=y_train)

In [11]:
# Step 6: Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# labels=[0, 1] pins the report to both classes; without it, a test fold (or
# set of predictions) containing a single class no longer matches the two
# target_names and classification_report raises.
# zero_division=0 reports 0.0 for a class that was never predicted, without
# emitting the "undefined metric" warning on every affected row.
report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["Not Spam", "Spam"],
    zero_division=0,
)
print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# Optionally display the words with the highest log P(word | spam).
# NOTE: this ranks by likelihood under the spam class only; it does not
# compare against the not-spam class, so frequent-everywhere words can appear.
feature_names = vectorizer.get_feature_names_out()
# Rows of feature_log_prob_ follow model.classes_, so look up the row for the
# spam label (1) instead of assuming it is at position 1.
spam_class_index = list(model.classes_).index(1)
spam_influence = model.feature_log_prob_[spam_class_index]
spam_words = sorted(zip(spam_influence, feature_names), reverse=True)[:10]

print("\nTop words associated with spam:")
for influence, word in spam_words:
    print(f"{word}: {influence:.2f}")

Model Accuracy: 0.6666666666666666

Classification Report:
               precision    recall  f1-score   support

    Not Spam       0.00      0.00      0.00         1
        Spam       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3


Top words associated with spam:
account: -3.18
urgent: -3.31
secure: -3.31
compromised: -3.31
update: -3.32
regarding: -3.32
important: -3.32
bank: -3.32
prize: -3.33
offer: -3.33


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
