In [1]:
## do pip install tensorflow and pandas
from tensorflow import keras
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [2]:
def load_data(directory, label):
    email = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory,filename)
        try:
            with open (filepath, "r", encoding="latin-1") as file:
                email.append((file.read(), label))
        except PermissionError:
            print(f"Permission denied: {filepath}")
    return email

In [3]:
ham = load_data(r"C:\Users\asus\OneDrive\Documents\SIT Notes 2025\PRG Fundamentals\Python Project\Inbox-Gaurdian\ML Data\easy_ham\easy_ham", "ham")
spam = load_data(r"C:\Users\asus\OneDrive\Documents\SIT Notes 2025\PRG Fundamentals\Python Project\Inbox-Gaurdian\ML Data\spam_2\spam_2", "spam")
overall = ham + spam

print("Total legitmate emails: ", len(ham))
print("Total spam emails: ", len(spam))
print("Total emails loaded: ", len(overall))

Total legitmate emails:  2551
Total spam emails:  1397
Total emails loaded:  3948


In [4]:
df = pd.DataFrame(overall, columns=["email", "label"])
print(df.head())

                                               email label
0  From exmh-workers-admin@redhat.com  Thu Aug 22...   ham
1  From Steve_Burt@cursor-system.com  Thu Aug 22 ...   ham
2  From timc@2ubh.com  Thu Aug 22 13:52:59 2002\n...   ham
3  From irregulars-admin@tb.tf  Thu Aug 22 14:23:...   ham
4  From exmh-users-admin@redhat.com  Thu Aug 22 1...   ham


In [5]:
X = df["email"]
Y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words="english", max_features=2500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [6]:
n_estimators_list = [50, 100, 150, 200, 250, 300]
for estimator in n_estimators_list:
    model_test = RandomForestClassifier(n_estimators=estimator, random_state=42)
    model_test.fit(X_train_tfidf, y_train)
    y_predictions = model_test.predict(X_test_tfidf)

    print("Accuracy:", estimator, accuracy_score(y_test, y_predictions))
    print(classification_report(y_test,y_predictions))
    print("Train Accuracy:", model_test.score(X_train_tfidf, y_train))
    print("Test Accuracy:", model_test.score(X_test_tfidf, y_test))

Accuracy: 50 0.9987341772151899
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       524
        spam       1.00      1.00      1.00       266

    accuracy                           1.00       790
   macro avg       1.00      1.00      1.00       790
weighted avg       1.00      1.00      1.00       790

Train Accuracy: 1.0
Test Accuracy: 0.9987341772151899
Accuracy: 100 0.9987341772151899
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       524
        spam       1.00      1.00      1.00       266

    accuracy                           1.00       790
   macro avg       1.00      1.00      1.00       790
weighted avg       1.00      1.00      1.00       790

Train Accuracy: 1.0
Test Accuracy: 0.9987341772151899
Accuracy: 150 0.9987341772151899
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       524
        spam       1.00      1.

In [8]:
model1_test = RandomForestClassifier(n_estimators=100, random_state=42)
model1_test.fit(X_train_tfidf, y_train)
y_predictions = model1_test.predict(X_test_tfidf)

print("Accuracy:", estimator, accuracy_score(y_test, y_predictions))
print(classification_report(y_test,y_predictions))
print("Train Accuracy:", model_test.score(X_train_tfidf, y_train))
print("Test Accuracy:", model_test.score(X_test_tfidf, y_test))

Accuracy: 300 0.9987341772151899
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       524
        spam       1.00      1.00      1.00       266

    accuracy                           1.00       790
   macro avg       1.00      1.00      1.00       790
weighted avg       1.00      1.00      1.00       790

Train Accuracy: 1.0
Test Accuracy: 0.9987341772151899
