In [4]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# Load the dataset
data = pd.read_csv("Spam_Email_Data.csv")

# Data Preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and numbers using regex
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text

# Apply preprocessing to the 'text' column
data['clean_text'] = data['text'].apply(preprocess_text)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['target'], test_size=0.2, random_state=42)

data.head(10)


Unnamed: 0,text,target,clean_text
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0,ilugadminlinuxie mon jul returnpath ilugadminl...
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1,gortexcitecom mon jun returnpath gortexcitecom...
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1,forkadminxentcom mon jul returnpath forkadminx...
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1,dcmbtamailnetcn mon jun returnpath dcmbtamailn...
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0,ilugadminlinuxie mon aug returnpath ilugadminl...
5,From tobaccodemon@terra.es Sat Sep 7 22:05:58 ...,1,tobaccodemonterraes sat sep returnpath tobacco...
6,From larlar78@MailOps.Com Sat Jun 30 00:19:08 ...,1,larlarmailopscom sat jun returnpath larlarmail...
7,From rpm-list-admin@freshrpms.net Thu Jul 25 1...,0,rpmlistadminfreshrpmsnet thu jul returnpath rp...
8,From exmh-users-admin@redhat.com Wed Aug 7 06:...,0,exmhusersadminredhatcom wed aug returnpath exm...
9,From contractor@goldenbay.com.cy Tue Jul 23 23...,1,contractorgoldenbaycomcy tue jul returnpath co...


In [5]:
# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier()
}

# Define embedding techniques
embedding_techniques1 = {
    'Bag of Words': CountVectorizer(),
    'TF-IDF': TfidfVectorizer()
}

results = []
# Loop through classifiers and embedding techniques
for clf_name, clf in classifiers.items():
    for vectorizer_name, vectorizer in embedding_techniques1.items():
        # Vectorize the data
        X_train_vectorized = vectorizer.fit_transform(X_train)
        X_test_vectorized = vectorizer.transform(X_test)
        
        # Training
        clf.fit(X_train_vectorized, y_train)
        # Predictions on test set
        predictions = clf.predict(X_test_vectorized)
        
        # Model evaluation
        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average='weighted')
        
        # Store results
        results.append({
            'Model': clf_name,
            'Embedding Technique': vectorizer_name,
            'Accuracy': accuracy,
            'F1-Score': f1
        })

# Create dataframe from results
results_df = pd.DataFrame(results)

# Print summary
print(results_df)

                 Model Embedding Technique  Accuracy  F1-Score
0  Logistic Regression        Bag of Words  0.998276  0.998275
1  Logistic Regression              TF-IDF  0.988793  0.988742
2        Decision Tree        Bag of Words  0.991379  0.991373
3        Decision Tree              TF-IDF  0.986207  0.986207
