## Spam Email Classfier
**Feature Extraction**
- Used TfidfVectorizer to convert the text data into TF-IDF feature vectors for model training.
  
**Split the Dataset**
- Split the data into training and testing sets using train_test_split() from sklearn, specifying a test size of 20%.

**Train the Classifier**
- Initialized and fit a DecisionTreeClassifier to the training data.


In [4]:
import os
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Preprocess the text
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Update column names
df['processed_content'] = df['v2'].apply(preprocess_text)
y = df['v1'].map({'spam': 1, 'ham': 0})

# Feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_content'])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)
y_pred_prob = classifier.predict_proba(X_test)[:, 1]  # Probability estimates for the positive class

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)

# Calculate ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f'AUC: {roc_auc:.2f}')

output_dir = 'task1_results'
os.makedirs(output_dir, exist_ok=True)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
plt.close()  # Close the plot to avoid displaying it inline

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # Diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.grid()
plt.savefig(os.path.join(output_dir, 'roc_curve.png'))
plt.close()  # Close the plot to avoid displaying it inline

with open(os.path.join(output_dir, 'classification_report.txt'), 'w') as f:
    f.write(f'Accuracy: {accuracy:.2f}\n')
    f.write(classification_report(y_test, y_pred))

print("Results exported to the 'task1_results' folder.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sachc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sachc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       965
           1       0.89      0.87      0.88       150

    accuracy                           0.97      1115
   macro avg       0.93      0.93      0.93      1115
weighted avg       0.97      0.97      0.97      1115

AUC: 0.93
Results exported to the 'task1_results' folder.


## TEST and PREDICT

In [5]:
def predict_spam(text, model, vectorizer):
    # Preprocess the input text
    processed_text = preprocess_text(text)
    # Transform the text into the same feature space as the training data
    text_vector = vectorizer.transform([processed_text])
    # Make prediction
    prediction = model.predict(text_vector)
    return 'spam' if prediction[0] == 1 else 'ham'


In [8]:
ham_text = "We hope this message finds you well! We are excited to share this month’s newsletter filled with eco-friendly tips and updates. This edition features: Sustainable Living Tips: Simple ways to reduce your carbon footprint. Upcoming Events: Join us for our community cleanup on October 30th. Featured Product: Discover our new line of biodegradable cleaning supplies! Thank you for being a valued member of the EcoLiving community."
spam_text = "Congratulations! You have been randomly selected to receive a $1,000 Gift Card for your favorite store! This is an exclusive offer just for you. To claim your prize, simply click the link below and provide your details: Claim Your Prize Now! Hurry, this offer is valid for a limited time only! Don’t miss out on your chance to enjoy free shopping."
result = predict_spam(spam_text, classifier, vectorizer)
print(f'The message is classified as: {result}')


The message is classified as: spam
