In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os
import csv
import glob

In [2]:
# Collect the path of every CSV file in the spam folder.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# keep the order in which the files are concatenated (and therefore the row
# order that the seeded train/test split sees) the same on every run.
folder_path = 'Spam/'
csv_files = sorted(
    os.path.join(folder_path, file_name)
    for file_name in os.listdir(folder_path)
    if file_name.endswith('.csv')
)

csv_files

['Spam/1998.csv',
 'Spam/1999.csv',
 'Spam/2000.csv',
 'Spam/2001.csv',
 'Spam/2002.csv',
 'Spam/2003.csv',
 'Spam/2008.csv']

In [3]:
# Load the ham corpus and every spam CSV, then stack both into one frame that
# keeps only the 'email' and 'label' columns.
spam_data = []
# Candidate encodings, tried in this order for each spam file; the first one
# pandas can read the file with is used.
encodings = ["utf-8", 'unicode_escape', "utf-8-sig", "latin1", "cp1252", "iso-8859-1"]
encoding_dict = {}  # file path -> encoding that worked
failed_files = {}   # file path -> last error, for files no encoding could read
ham = pd.read_csv('./enron.csv')
for f in csv_files:
    last_error = None
    for encoding in encodings:
        try:
            # on_bad_lines='skip' drops malformed rows instead of aborting.
            data = pd.read_csv(f, encoding=encoding, on_bad_lines='skip')
        except Exception as e:
            # Best effort: remember why this encoding failed, try the next one.
            last_error = e
            continue
        spam_data.append(data)
        encoding_dict[f] = encoding
        break
    else:
        # No encoding worked. Report it instead of letting the file vanish
        # from the dataset without a trace.
        failed_files[f] = last_error
        print("WARNING: skipped", f, "- no candidate encoding worked:", repr(last_error))

spam = pd.concat(spam_data)
combined_df = pd.concat(
    [ham[['email', 'label']], spam[['email', 'label']]],
    axis=0,
    ignore_index=True,
)


In [4]:
# Remove every row whose label is missing; rows without a target value
# cannot be used for training or evaluation.
combined_df = combined_df.dropna(subset=['label'])


In [5]:
# Preview the first five rows of the combined dataframe.
print(combined_df.head(5))

                                               email  label
0                         b'Here is our forecast   '    1.0
1  b'Traveling to have a business meeting takes t...    1.0
2                  b'test successful.  way to go!!!'    1.0
3  b'Randy,   Can you send me a schedule of the s...    1.0
4             b'Let's shoot for Tuesday at 11:45.  '    1.0


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the label column.
summary_stats = combined_df.describe()
print(summary_stats)

               label
count  764057.000000
mean        0.677176
std         0.467556
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000


In [7]:
# Number of rows for each distinct label value.
label_counts = combined_df.loc[:, 'label'].value_counts()
print(label_counts)

1.0    517401
0.0    246656
Name: label, dtype: int64


In [8]:
# Separate the model input (raw email text) from the target (label).
X, y = combined_df['email'], combined_df['label']

In [12]:
# Keep only the rows whose index label is present in both X and y.
# The previous version collected labels that exist in X but not in y and then
# called y.drop() on them, which raises KeyError as soon as there is one; it
# also never handled labels that exist only in y. Boolean masks work in both
# directions and are a no-op when the two indexes already agree.
in_both_X = X.index.isin(y.index)
in_both_y = y.index.isin(X.index)
X = X[in_both_X]
y = y[in_both_y]

In [13]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# stratify=y keeps the label proportions of the full data in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [14]:
# Model training: TF-IDF features + random forest.
vectorizer = TfidfVectorizer()  # Initialize the TF-IDF vectorizer

# Learn the vocabulary / IDF weights on the training emails only and
# vectorize them in the same step.
X_train_tfidf = vectorizer.fit_transform(X_train)

# 100 decision trees with a maximum depth of 10.
# Alternatively the best hyperparameters could be determined with a grid search.
# random_state is fixed (same seed as the train/test split) so the forest and
# every metric derived from it are reproducible across runs.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Train the Random Forest model
rf_model.fit(X_train_tfidf, y_train)


In [15]:
# K-nearest-neighbours classifier that votes over the 5 closest training samples.
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit it on the TF-IDF matrix of the training emails.
knn_model.fit(X=X_train_tfidf, y=y_train)

In [17]:
# Model evaluation, step 1: vectorize the held-out emails with the already
# fitted vectorizer (transform only, nothing is refitted on the test data).
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [None]:
# Predict the held-out set with both fitted models.
knn_predictions = knn_model.predict(X_test_tfidf)
rf_predictions = rf_model.predict(X_test_tfidf)

# Accuracy, precision, recall and F1, computed in that order for each model.
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

knn_accuracy, knn_precision, knn_recall, knn_f1 = [
    metric(y_test, knn_predictions) for metric in metric_functions
]
rf_accuracy, rf_precision, rf_recall, rf_f1 = [
    metric(y_test, rf_predictions) for metric in metric_functions
]


In [None]:
# Print every metric, KNN first and Random Forest second for each one.
metric_lines = [
    ("KNN Accuracy:", knn_accuracy),
    ("Random Forest Accuracy:", rf_accuracy),
    ("KNN Precision:", knn_precision),
    ("Random Forest Precision:", rf_precision),
    ("KNN Recall:", knn_recall),
    ("Random Forest Recall:", rf_recall),
    ("KNN F1-score:", knn_f1),
    ("Random Forest F1-score:", rf_f1),
]
for caption, value in metric_lines:
    print(caption, value)

In [None]:
# Word cloud of the spam messages: a visual summary of the words that occur
# most frequently in them.
#
# The label column is numeric (0.0 / 1.0), so the former comparison with the
# string 'spam' never matched and the cloud was never drawn.
# NOTE(review): the preview of combined_df shows the rows loaded from
# enron.csv (the ham frame) carrying label 1.0, so spam is taken to be 0.0
# here -- confirm against the source CSVs.
SPAM_LABEL = 0.0
spam_messages = combined_df.loc[combined_df['label'] == SPAM_LABEL, 'email']
# Drop missing bodies and force text, so ' '.join() below cannot fail on a
# non-string value.
spam_messages = spam_messages.dropna().astype(str)
# Discard messages that are empty or whitespace-only.
spam_messages = spam_messages[spam_messages.str.strip().astype(bool)]

if len(spam_messages) > 0:
    spam_wordcloud = WordCloud(width=800, height=400).generate(' '.join(spam_messages))
    plt.figure(figsize=(10, 5))
    plt.imshow(spam_wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud - Spam Messages')
    plt.show()
else:
    print("No spam messages available for word cloud generation.")


In [None]:
# Bar plot of the class distribution: one bar per label value.
label_counts = combined_df['label'].value_counts()

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)
ax.set_title('Distribution of Spam and Non-Spam Messages')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()


In [None]:
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Step 4: Model Evaluation
# Predict the held-out set with both fitted models and report their accuracy.
knn_predictions = knn_model.predict(X_test_tfidf)
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_predictions)

rf_predictions = rf_model.predict(X_test_tfidf)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)

print("KNN Accuracy:", knn_accuracy)
print("Random Forest Accuracy:", rf_accuracy)



In [None]:
# Model improvement: inspect the class balance of the training labels.
label_counts = y_train.value_counts()
print("Class Distribution:")
print(label_counts)

# Check for data quality and review the dataset and labels for any inconsistencies or errors

# Resamplers for balancing the classes. Only the oversampler is applied in
# this cell; `rus` is created but not used here.
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)

# Random oversampling of the training data only; the test set stays untouched.
X_train_balanced, y_train_balanced = ros.fit_resample(X_train_tfidf, y_train)

# Retrain both models on the balanced training data.
knn_model_balanced = KNeighborsClassifier(n_neighbors=5)
knn_model_balanced.fit(X_train_balanced, y_train_balanced)

# random_state is fixed (same seed as the split and the samplers) so the
# retrained forest is reproducible across runs.
rf_model_balanced = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model_balanced.fit(X_train_balanced, y_train_balanced)



In [None]:
# Re-evaluate the models that were trained on the balanced data, using the
# original (unbalanced) test set.
knn_predictions_balanced = knn_model_balanced.predict(X_test_tfidf)
rf_predictions_balanced = rf_model_balanced.predict(X_test_tfidf)

knn_accuracy_balanced = accuracy_score(y_test, knn_predictions_balanced)
rf_accuracy_balanced = accuracy_score(y_test, rf_predictions_balanced)

# Per-class metrics are read by key instead of unpacking dict.values()
# positionally, so they do not depend on the key order of the report and the
# throwaway `_` no longer overwrites IPython's last-output variable.
# NOTE(review): target_names are paired with the label values in order, so
# "spam" refers to the second label (1.0 in this data). The preview of
# combined_df shows enron/ham rows carrying label 1.0 -- confirm that the
# display names match the data.
knn_spam_report = classification_report(
    y_test, knn_predictions_balanced, target_names=["non-spam", "spam"], output_dict=True
)["spam"]
knn_precision_balanced = knn_spam_report["precision"]
knn_recall_balanced = knn_spam_report["recall"]
knn_f1_balanced = knn_spam_report["f1-score"]

rf_spam_report = classification_report(
    y_test, rf_predictions_balanced, target_names=["non-spam", "spam"], output_dict=True
)["spam"]
rf_precision_balanced = rf_spam_report["precision"]
rf_recall_balanced = rf_spam_report["recall"]
rf_f1_balanced = rf_spam_report["f1-score"]



In [None]:
# Print the metrics of the models trained on balanced data, KNN first and
# Random Forest second for each metric.
balanced_metric_lines = [
    ("KNN Accuracy (Balanced):", knn_accuracy_balanced),
    ("Random Forest Accuracy (Balanced):", rf_accuracy_balanced),
    ("KNN Precision (Balanced):", knn_precision_balanced),
    ("Random Forest Precision (Balanced):", rf_precision_balanced),
    ("KNN Recall (Balanced):", knn_recall_balanced),
    ("Random Forest Recall (Balanced):", rf_recall_balanced),
    ("KNN F1-score (Balanced):", knn_f1_balanced),
    ("Random Forest F1-score (Balanced):", rf_f1_balanced),
]
for caption, value in balanced_metric_lines:
    print(caption, value)


In [None]:
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, auc


In [None]:
# ROC curve and area under it for KNN.
# Column 1 of predict_proba is the probability of the second class in the
# model's classes_, which is used as the positive-class score.
knn_probs = knn_model.predict_proba(X_test_tfidf)[:, 1]
knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_probs)
knn_roc_auc = auc(knn_fpr, knn_tpr)

# Same for the Random Forest.
rf_probs = rf_model.predict_proba(X_test_tfidf)[:, 1]
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_probs)
rf_roc_auc = auc(rf_fpr, rf_tpr)

# Draw both curves on one set of axes together with the chance diagonal.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(knn_fpr, knn_tpr, label='KNN (AUC = %0.2f)' % knn_roc_auc)
ax.plot(rf_fpr, rf_tpr, label='Random Forest (AUC = %0.2f)' % rf_roc_auc)
ax.plot([0, 1], [0, 1], 'k--')  # Random guessing line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Precision-recall curve and area under it for KNN.
# The curve arrays get their own names (*_pr_precision / *_pr_recall) so they
# do not overwrite the scalar knn_precision / knn_recall / rf_precision /
# rf_recall scores computed and printed earlier in the notebook.
knn_pr_precision, knn_pr_recall, _ = precision_recall_curve(y_test, knn_probs)
knn_pr_auc = auc(knn_pr_recall, knn_pr_precision)

# Precision-recall curve and area under it for the Random Forest.
rf_pr_precision, rf_pr_recall, _ = precision_recall_curve(y_test, rf_probs)
rf_pr_auc = auc(rf_pr_recall, rf_pr_precision)

# Plot both precision-recall curves in one figure.
plt.figure(figsize=(8, 6))
plt.plot(knn_pr_recall, knn_pr_precision, label='KNN (AUC = %0.2f)' % knn_pr_auc)
plt.plot(rf_pr_recall, rf_pr_precision, label='Random Forest (AUC = %0.2f)' % rf_pr_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower right')
plt.show()
