# Email Spam Classification

In this notebook, we will:
1. Load and preprocess the dataset.
2. Transform the text data using TF-IDF vectorization.
3. Train and evaluate different machine learning models (Naive Bayes, SVM, and Neural Network).
4. Visualize the results.

In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


## Load Dataset

In [97]:
# Read the combined e-mail corpus from CSV, then substitute an empty string
# for every missing cell so that no NaN values remain in the frame.
data = pd.read_csv('combined_data.csv')
data = data.where(data.notna(), other='')
# Show the first rows as the cell's output.
data.head()



Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [98]:
# Count the rows per class label and report the totals for labels 0 and 1.
label_counts = data['label'].value_counts()
for label_value in (0, 1):
    print(f"Number of rows with label {label_value}: {label_counts[label_value]}")

Number of rows with label 0: 39538
Number of rows with label 1: 43910


In [99]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and approximate memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


## Preprocess Dataset

* 1 - Spam
* 0 - Ham

In [100]:

# Normalise the labels to integers (1 = spam, 0 = ham). The earlier
# `data['label'] == '1'` / `== '0'` string comparisons match nothing when the
# column is already numeric, so the whole column is cast instead; the cast
# accepts both '1'/'0' strings and 1/0 integers.
data['label'] = data['label'].astype(int)

# Replace every line break with a single space. The earlier code replaced
# only the literal sequence '\n\r', which misses the common '\r\n' as well as
# a lone '\n' or a lone '\r'.
data['text'] = (
    data['text']
    .astype(str)
    .str.replace(r'\r\n|\n\r|\r|\n', ' ', regex=True)
)

# Features (raw message text) and targets (integer class labels).
X = data['text']
Y = data['label']

# Disabled experiment (kept for reference): lower-case each message, strip
# punctuation, drop English stop words and Porter-stem the remaining words,
# collecting the results in `corp`. The stop-word membership test uses
# `stopwords_list`; the earlier draft tested against the imported `stopwords`
# corpus object instead.
# st = PorterStemmer()
# corp = []
# stopwords_list = set(stopwords.words('english'))
# print(stopwords_list)
# for i in range(len(data)):
#     text = data['text'].iloc[i].lower()
#     text = text.translate(str.maketrans('', '', string.punctuation)).split()
#     text = [st.stem(word) for word in text if word not in stopwords_list]
#     text = ' '.join(text)
#     corp.append(text)
# print(corp)
    

* We split X and Y into data for training the model and data for testing the model.
* We chose a smaller test-set percentage because the initial dataset is large.
* For now we do not set the random state, so that we can test different results.

In [101]:
# Hold out 10% of the rows for testing. No random_state is passed, so every
# run of this cell yields a different train/test partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
)

## Vectorize


In [102]:
# Build the TF-IDF vectorizer: lower-case the text and drop English stop words.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training texts only, then
# apply the same fitted transformation to the test texts.
# NOTE: X_train / X_test are overwritten with the vectorized matrices, so
# re-running this cell without first re-running the split would feed those
# matrices back into the vectorizer.
X_train = feature_extraction.fit_transform(X_train)
X_test = feature_extraction.transform(X_test)

# Make sure both label vectors are integer-typed for the classifiers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

## Split the Data

In [103]:
# Split the dataset into training and test sets
# NOTE: disabled draft (20% test set, fixed seed of 42). The split that is
# actually executed is the earlier train_test_split(X, Y, test_size=0.1) cell.
# X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

## Transform the Text Data

In [104]:
# Transform the text data to TF-IDF features
# NOTE: disabled draft. The vectorization that is actually executed is the
# `feature_extraction` TfidfVectorizer cell under the "Vectorize" heading.
# vectorizer = TfidfVectorizer()
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)

## Train and Evaluate Models
-------------------
### Naive Bayes

* Training the model on the training dataset


In [105]:
# Create a multinomial Naive Bayes classifier with default settings and fit
# it on the vectorized training texts and their integer labels.
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=Y_train)


# Function to train and evaluate a model
# NOTE: disabled helper. As drafted it fits `model` on the training split,
# predicts on the test split and returns the tuple
# (accuracy, precision, recall, f1).
# def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
#     return accuracy, precision, recall, f1

In [107]:
# Score the fitted model on the data it was trained on; comparing this with
# the test accuracy gives a rough indication of over-fitting.
prediction_train = nb_model.predict(X_train)
accuracy_train = accuracy_score(Y_train, prediction_train)
print(f"Accuracy on training data: {accuracy_train}")

Accuracy on training data: 0.9824773977071488


In [108]:
# Score the fitted model on the held-out test split.
prediction_test = nb_model.predict(X_test)
accuracy_test = accuracy_score(Y_test, prediction_test)
print(f"Accuracy on test data: {accuracy_test}")

# NOTE: disabled draft of the three-model comparison. It relies on
# `train_and_evaluate_model`, `X_train_tfidf`, `X_test_tfidf`, `y_train` and
# `y_test`, all of which appear only in commented-out code in this notebook,
# so those pieces must be re-enabled (or the names adapted) before this runs.
# Initialize models
# nb_model = MultinomialNB()
# svm_model = SVC()
# nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)

# Train and evaluate Naive Bayes model
# nb_metrics = train_and_evaluate_model(nb_model, X_train_tfidf, X_test_tfidf, y_train, y_test)
# print(f"Naive Bayes - Accuracy: {nb_metrics[0]}, Precision: {nb_metrics[1]}, Recall: {nb_metrics[2]}, F1: {nb_metrics[3]}")
# 
# # Train and evaluate SVM model
# svm_metrics = train_and_evaluate_model(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)
# print(f"SVM - Accuracy: {svm_metrics[0]}, Precision: {svm_metrics[1]}, Recall: {svm_metrics[2]}, F1: {svm_metrics[3]}")
# 
# # Train and evaluate Neural Network model
# nn_metrics = train_and_evaluate_model(nn_model, X_train_tfidf, X_test_tfidf, y_train, y_test)
# print(f"Neural Network - Accuracy: {nn_metrics[0]}, Precision: {nn_metrics[1]}, Recall: {nn_metrics[2]}, F1: {nn_metrics[3]}")

Accuracy on test data: 0.9750748951467945


## Visualize the Results

In [None]:
# NOTE: disabled draft of the results visualisation. It needs `nb_metrics`,
# `svm_metrics` and `nn_metrics`, which are only produced by commented-out
# code in this notebook. `metrics_df` is built twice below (Naive Bayes only,
# then all three models); if both were enabled the second would overwrite the
# first.
# # Create a DataFrame for the metrics
# metrics_df = pd.DataFrame({
#     'Model': ['Naive Bayes'],
#     'Accuracy': [nb_metrics[0]],
#     'Precision': [nb_metrics[1]],
#     'Recall': [nb_metrics[2]],
#     'F1 Score': [nb_metrics[3]]
# })
# metrics_df = pd.DataFrame({
#     'Model': ['Naive Bayes', 'SVM', 'Neural Network'],
#     'Accuracy': [nb_metrics[0], svm_metrics[0], nn_metrics[0]],
#     'Precision': [nb_metrics[1], svm_metrics[1], nn_metrics[1]],
#     'Recall': [nb_metrics[2], svm_metrics[2], nn_metrics[2]],
#     'F1 Score': [nb_metrics[3], svm_metrics[3], nn_metrics[3]]
# })
# 
# Plot the metrics (one bar chart per metric, one bar per model)
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Model', y='Accuracy', data=metrics_df)
# plt.title('Accuracy of Models')
# plt.show()
# 
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Model', y='Precision', data=metrics_df)
# plt.title('Precision of Models')
# plt.show()
# 
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Model', y='Recall', data=metrics_df)
# plt.title('Recall of Models')
# plt.show()
# 
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Model', y='F1 Score', data=metrics_df)
# plt.title('F1 Score of Models')
# plt.show()