In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [11]:
def DecisionTreeEmailSpamDetection(data_path='spam_ham_dataset.csv', test_size=0.2, random_state=42):
    """Train and evaluate a decision-tree spam classifier on TF-IDF features.

    Reads a CSV of emails, holds out a test split, fits a TfidfVectorizer
    (English stop words removed) on the training text only, trains a
    DecisionTreeClassifier, and prints the classification report and the
    confusion matrix for the held-out emails.

    Parameters
    ----------
    data_path : str or path-like, default 'spam_ham_dataset.csv'
        CSV file with a 'text' column (email content) and a 'label_num'
        column (binary label; spam=1, ham=0 for the default dataset).
    test_size : float, default 0.2
        Fraction of rows held out for evaluation (passed to train_test_split).
    random_state : int, default 42
        Seed used for both the split and the tree, so runs are repeatable.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    KeyError
        If the CSV does not contain the 'text' or 'label_num' column.
    """
    # Load the dataset
    data = pd.read_csv(data_path)

    # Fail early, naming the missing column(s), instead of a bare KeyError later.
    missing = [col for col in ('text', 'label_num') if col not in data.columns]
    if missing:
        raise KeyError(f"{data_path!r} is missing required column(s): {missing}")

    # Missing or non-string cells would make the vectorizer raise, so normalise
    # them to (possibly empty) strings; rows that already hold text are unchanged.
    X = data['text'].fillna('').astype(str)
    y = data['label_num']

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the vocabulary/IDF weights on the training text only, then reuse
    # them for the test text so no test information leaks into the features.
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Initialize and train the Decision Tree Classifier
    classifier = DecisionTreeClassifier(random_state=random_state)
    classifier.fit(X_train_vec, y_train)

    # Predict on the test set
    y_pred = classifier.predict(X_test_vec)

    # Output evaluation results
    print("Decision Tree Classifier Results:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [12]:
# Entry-point guard: run the decision-tree pipeline only when this code is the
# top-level module being executed, not when it is imported from elsewhere.
if __name__ == "__main__":
    DecisionTreeEmailSpamDetection()

Decision Tree Classifier Results:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       742
           1       0.90      0.92      0.91       293

    accuracy                           0.95      1035
   macro avg       0.93      0.94      0.94      1035
weighted avg       0.95      0.95      0.95      1035

Confusion Matrix:
[[711  31]
 [ 23 270]]


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [14]:
def RandomForestEmailSpamDetection(data_path='spam_ham_dataset.csv', test_size=0.2,
                                   random_state=42, n_estimators=100):
    """Train and evaluate a random-forest spam classifier on TF-IDF features.

    Reads a CSV of emails, holds out a test split, fits a TfidfVectorizer
    (English stop words removed) on the training text only, trains a
    RandomForestClassifier, and prints the classification report and the
    confusion matrix for the held-out emails.

    Parameters
    ----------
    data_path : str or path-like, default 'spam_ham_dataset.csv'
        CSV file with a 'text' column (email content) and a 'label_num'
        column (binary label; spam=1, ham=0 for the default dataset).
    test_size : float, default 0.2
        Fraction of rows held out for evaluation (passed to train_test_split).
    random_state : int, default 42
        Seed used for both the split and the forest, so runs are repeatable.
    n_estimators : int, default 100
        Number of trees in the forest.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    KeyError
        If the CSV does not contain the 'text' or 'label_num' column.
    """
    # Load the dataset
    data = pd.read_csv(data_path)

    # Fail early, naming the missing column(s), instead of a bare KeyError later.
    missing = [col for col in ('text', 'label_num') if col not in data.columns]
    if missing:
        raise KeyError(f"{data_path!r} is missing required column(s): {missing}")

    # Missing or non-string cells would make the vectorizer raise, so normalise
    # them to (possibly empty) strings; rows that already hold text are unchanged.
    X = data['text'].fillna('').astype(str)
    y = data['label_num']

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the vocabulary/IDF weights on the training text only, then reuse
    # them for the test text so no test information leaks into the features.
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Initialize and train the Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    classifier.fit(X_train_vec, y_train)

    # Predict on the test set
    y_pred = classifier.predict(X_test_vec)

    # Output evaluation results
    print("Random Forest Classifier Results:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [15]:
# Entry-point guard: run the random-forest pipeline only when this code is the
# top-level module being executed, not when it is imported from elsewhere.
if __name__ == "__main__":
    RandomForestEmailSpamDetection()

Random Forest Classifier Results:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       742
           1       0.95      0.98      0.97       293

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035

Confusion Matrix:
[[728  14]
 [  5 288]]


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [17]:
def LogisticRegressionEmailSpamDetection(data_path='spam_ham_dataset.csv', test_size=0.2,
                                         random_state=42, max_iter=1000):
    """Train and evaluate a logistic-regression spam classifier on TF-IDF features.

    Reads a CSV of emails, holds out a test split, fits a TfidfVectorizer
    (English stop words removed) on the training text only, trains a
    LogisticRegression model, and prints the classification report and the
    confusion matrix for the held-out emails.

    Parameters
    ----------
    data_path : str or path-like, default 'spam_ham_dataset.csv'
        CSV file with a 'text' column (email content) and a 'label_num'
        column (binary label; spam=1, ham=0 for the default dataset).
    test_size : float, default 0.2
        Fraction of rows held out for evaluation (passed to train_test_split).
    random_state : int, default 42
        Seed used for both the split and the model, so runs are repeatable.
    max_iter : int, default 1000
        Iteration cap handed to the LogisticRegression solver.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    KeyError
        If the CSV does not contain the 'text' or 'label_num' column.
    """
    # Load the dataset
    data = pd.read_csv(data_path)

    # Fail early, naming the missing column(s), instead of a bare KeyError later.
    missing = [col for col in ('text', 'label_num') if col not in data.columns]
    if missing:
        raise KeyError(f"{data_path!r} is missing required column(s): {missing}")

    # Missing or non-string cells would make the vectorizer raise, so normalise
    # them to (possibly empty) strings; rows that already hold text are unchanged.
    X = data['text'].fillna('').astype(str)
    y = data['label_num']

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the vocabulary/IDF weights on the training text only, then reuse
    # them for the test text so no test information leaks into the features.
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Initialize and train the Logistic Regression Classifier
    classifier = LogisticRegression(max_iter=max_iter, random_state=random_state)
    classifier.fit(X_train_vec, y_train)

    # Predict on the test set
    y_pred = classifier.predict(X_test_vec)

    # Output evaluation results
    print("Logistic Regression Classifier Results:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


In [18]:

# Entry-point guard: run the logistic-regression pipeline only when this code is
# the top-level module being executed, not when it is imported from elsewhere.
if __name__ == "__main__":
    LogisticRegressionEmailSpamDetection()


Logistic Regression Classifier Results:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       742
           1       0.98      0.99      0.98       293

    accuracy                           0.99      1035
   macro avg       0.99      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035

Confusion Matrix:
[[735   7]
 [  4 289]]
