In [8]:
# Required libraries
import numpy as np
import re
from sklearn.linear_model import LogisticRegression

# Load the training data from 'spam-data.csv'; its first row is skipped as a header
email_data = np.genfromtxt('spam-data.csv', skip_header=1, delimiter=',')
# Every column but the last is an input feature; the last column is the
# output label (0 for regular, 1 for spam)
features, labels = email_data[:, :-1], email_data[:, -1]

# Build a logistic regression model with default settings and fit it in one
# step (fit() returns the fitted estimator itself)
spam_classifier = LogisticRegression().fit(features, labels)

# Function to derive features from an email
def analyze_email(email_text):
    """Turn the raw text of one email into a list of four numeric features.

    Args:
        email_text: Full text of the email as a string.

    Returns:
        A list ``[word_count, link_count, cap_word_count, spam_keyword_count]``:

        * word_count -- number of whitespace-separated tokens.
        * link_count -- number of ``http://`` / ``https://`` URLs found.
        * cap_word_count -- number of words of two or more letters written
          entirely in ASCII capitals.
        * spam_keyword_count -- how many of the suspicious keywords occur at
          least once (each keyword counts at most once, so the value is 0-8).
    """
    word_count = len(email_text.split())
    link_count = len(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', email_text))
    cap_word_count = len(re.findall(r'\b[A-Z][A-Z]+\b', email_text))
    suspicious_keywords = ('win', 'prize', 'lottery', 'offer', 'discount', 'free', 'promotion', 'opportunity')
    # Lowercase the text once instead of once per keyword; the keywords are
    # already lowercase literals.
    lowered_text = email_text.lower()
    # NOTE: this is a case-insensitive *substring* test, so "window" counts
    # as containing "win". Kept as-is because the features must stay
    # consistent with whatever produced the training data.
    spam_keyword_count = sum(keyword in lowered_text for keyword in suspicious_keywords)
    return [word_count, link_count, cap_word_count, spam_keyword_count]

# Evaluate emails for spam detection
# 'emails.txt' holds several emails separated by blank lines. Decode it
# explicitly as UTF-8 so the result does not depend on the platform's default
# encoding; undecodable bytes are replaced instead of aborting the run.
with open('emails.txt', 'r', encoding='utf-8', errors='replace') as file:
    emails = file.read().split('\n\n')

for content in emails:
    # Trim surrounding whitespace so whitespace-only chunks are skipped and
    # the 40-character preview does not start with a stray newline. The
    # extracted features do not depend on leading/trailing whitespace.
    content = content.strip()
    if not content:
        continue
    features_extracted = analyze_email(content)
    # predict() expects a 2-D input, hence the single-row list.
    prediction_result = spam_classifier.predict([features_extracted])
    verdict = "Regular (non-spam)" if prediction_result[0] == 0 else "Spam"
    print(f"Content: {content[:40]}...\nResult: {verdict}\n")

# Data inspection
# Human-readable names for the model inputs, in the same order as the list
# returned by analyze_email
feature_labels = ['Word Count', 'Link Count', 'Capitalized Word Count', 'Spam Keyword Count']
# Learned coefficient of each feature (first row of the coefficient matrix)
feature_weights = spam_classifier.coef_[0]

# Print every feature name next to its weight, rounded to two decimals
for name, coefficient in zip(feature_labels, feature_weights):
    print(f"{name}: {coefficient:.2f}")


Content: 
Subject: Claim Your Free Trial Now!
Don...
Result: Spam

Content: ----------------
Subject: Instant Weight...
Result: Regular (non-spam)

Content: ----------------
Subject: Exclusive Acce...
Result: Regular (non-spam)

Word Count: -0.07
Link Count: 0.99
Capitalized Word Count: -0.53
Spam Keyword Count: 1.19
