<a href="https://colab.research.google.com/github/SonNguyen0804/Email_Spam_Classifier/blob/main/Email_Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Path of the emails CSV on the mounted Drive; edit this if the file lives elsewhere.
data_path = '/content/drive/MyDrive/Datasets/emails.csv'
# Load the whole CSV into a DataFrame (the preview below shows 'file' and 'message' columns).
emails = pd.read_csv(data_path)

# Display the first few rows of the dataset
emails.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


**Data preprocessing**

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

# Download the NLTK stop-word corpus; stopwords.words() below reads from it.
nltk.download('stopwords')

# Initialize stop words and stemmer.
# stop_words is a set so the per-word membership test in preprocess_text is a hash lookup.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

#Preprocessing function
def preprocess_text(text):
    """Normalize a raw email string into space-separated, stemmed tokens.

    Steps: strip HTML-style tags, replace every non-letter with a space,
    lowercase, split on whitespace, drop words found in the module-level
    ``stop_words`` set, and stem what is left with the module-level
    ``stemmer``.

    Args:
        text: Raw message text. Non-string values (e.g. the NaN pandas
            stores for a missing cell) are treated as an empty message.

    Returns:
        str: The cleaned tokens joined by single spaces, or '' when no
        token survives.
    """
    # A missing cell reaches .apply() as float NaN, which re.sub() rejects
    # with a TypeError; treat anything that is not a string as empty.
    if not isinstance(text, str):
        return ''
    # Remove HTML tags. Substitute a space rather than '' so the words on
    # either side of a tag are not fused ("hello<br>world" -> "hello world").
    text = re.sub(r'<.*?>', ' ', text)
    # Replace non-alphabetic characters with spaces
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize on whitespace, drop stop words, then stem the survivors
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    # Join words back into a single string
    return ' '.join(words)

#Apply preprocessing to email content
# Stores the result in a new 'text' column; the raw 'message' column is left unchanged.
emails['text'] = emails['message'].apply(preprocess_text)

#Display preprocessed text
emails.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,file,message,text
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,messag id date mon may pdt phillip allen enron...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,messag id date fri may pdt phillip allen enron...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,messag id date wed oct pdt phillip allen enron...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,messag id date mon oct pdt phillip allen enron...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,messag id date thu aug pdt phillip allen enron...


**Create labels (if they do not exist)**

In [None]:
# Define a list of spam keywords
spam_keywords = ["offer", "winner", "free", "win", "prize", "money", "cash", "urgent"]

# Function to label an email as spam if it contains any spam keywords
def is_spam(text):
    """Return 1 if ``text`` contains any entry of ``spam_keywords``, else 0.

    Matching is a case-insensitive *substring* test, so a keyword also hits
    inside a longer word (e.g. "win" matches "window").

    Args:
        text: Message text. Non-string values (e.g. the NaN pandas stores
            for a missing cell) contain no keywords and are labelled 0.

    Returns:
        int: 1 for spam, 0 for not spam.
    """
    # A missing cell arrives as float NaN, which has no .lower().
    if not isinstance(text, str):
        return 0
    # Lowercase once up front instead of once per keyword.
    lowered = text.lower()
    return int(any(keyword in lowered for keyword in spam_keywords))

# Apply the function to create labels
# NOTE(review): these labels are a keyword heuristic computed from the raw
# 'message' text (headers included), not ground-truth annotations. Because the
# model features are derived from the same message, the reported metrics show
# how well a classifier re-learns this keyword rule rather than how well it
# detects real spam.
emails['spam'] = emails['message'].apply(is_spam)

# Display the first few rows with the new 'spam' column
emails.head()

Unnamed: 0,file,message,text,spam
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,messag id date mon may pdt phillip allen enron...,0
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,messag id date fri may pdt phillip allen enron...,0
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,messag id date wed oct pdt phillip allen enron...,0
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,messag id date mon oct pdt phillip allen enron...,0
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,messag id date thu aug pdt phillip allen enron...,0


**Feature Extraction**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

#Extract labels (assuming 'spam' column exists)
y = emails['spam']

#Split the preprocessed text *before* vectorizing. Fitting TF-IDF on every row
#and splitting afterwards lets the test emails influence the vocabulary and
#IDF weights, which leaks test-set information into training.
#The same test_size/random_state are kept, so the rows that land in each split
#(and therefore y_train / y_test) are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    emails['text'], y, test_size=0.2, random_state=42
)

#Learn vocabulary and IDF weights from the training text only, then reuse
#that fitted transform for the test text
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

#Feature matrix for every email, kept so that code relying on `X` still works.
#It is produced by the training-fitted vectorizer; do not refit on it.
X = vectorizer.transform(emails['text'])

**Model Selection (Naive Bayes) and Training**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Build and train the multinomial Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so construction and training chain).
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Score the predictions against the true test labels, one metric per name
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric on its own line, rounded to two decimals
print(
    f'Accuracy: {accuracy:.2f}',
    f'Precision: {precision:.2f}',
    f'Recall: {recall:.2f}',
    f'F1 Score: {f1:.2f}',
    sep='\n',
)

Accuracy: 0.80
Precision: 0.83
Recall: 0.57
F1 Score: 0.67


**Model Selection (Logistic Regression) and Training**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Build and train logistic regression on the training split; max_iter is
# raised to 1000 iterations for the solver.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Score the predictions against the true test labels, one metric per name
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric on its own line, rounded to two decimals
print(
    f'Accuracy: {accuracy:.2f}',
    f'Precision: {precision:.2f}',
    f'Recall: {recall:.2f}',
    f'F1 Score: {f1:.2f}',
    sep='\n',
)

Accuracy: 0.95
Precision: 0.95
Recall: 0.89
F1 Score: 0.92
