<a href="https://colab.research.google.com/github/SaidurIUT/MachineLearning/blob/main/Spam_Mail_Detection_with_Machine_Learning_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string # Import the string module, which contains useful string manipulation functions like punctuation.
import numpy as np # Import NumPy for numerical operations and array handling.
import pandas as pd # Import pandas for data manipulation and analysis.

import nltk  # Import NLTK (Natural Language Toolkit) for text processing.
from nltk.corpus import stopwords # Import stopwords (common words like 'the', 'is', etc.) to remove them from the text.
from nltk.stem.porter import PorterStemmer  # Import PorterStemmer for stemming (reducing words to their root form).

from sklearn.feature_extraction.text import CountVectorizer # Import CountVectorizer for converting text into numerical feature vectors.

from sklearn.model_selection import train_test_split # Import train_test_split to split the dataset into training and testing sets.
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Fetch NLTK's 'stopwords' corpus so that stopwords.words('english') can be read in a later cell.

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [39]:
# Load the spam/ham email dataset from a CSV file in the working directory.
# Later cells rely on two of its columns: 'text' (the email content) and 'label_num' (the numeric label).

df = pd.read_csv('spam_ham_dataset.csv')
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [7]:
# Collapse Windows-style line breaks (\r\n) inside every email into a single space so each
# message becomes one line of text.
# The vectorised .str accessor replaces the per-row lambda; regex=False makes pandas treat
# '\r\n' as a literal substring rather than a pattern. Missing values (if any) pass through
# as NaN instead of raising AttributeError.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

In [9]:
# Summarise the dataframe: row count, column names, non-null counts, dtypes and memory usage.
# Useful here to confirm that 'text' and 'label_num' contain no missing values.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [19]:
stemmer = PorterStemmer()  # Porter stemmer: reduces words to a root form (e.g. 'running' -> 'run')
stopwords_set = set(stopwords.words('english'))  # a set gives fast membership tests for the stop-word filter
punctuation_table = str.maketrans('', '', string.punctuation)  # built once; deletes every character in string.punctuation

corpus = []  # one cleaned, space-joined string per email, in dataframe row order

# Normalise each email: lowercase -> strip punctuation -> split on whitespace
# -> drop stop words -> stem what remains -> re-join into a single string.
for message in df['text']:
    words = message.lower().translate(punctuation_table).split()
    text = ' '.join(stemmer.stem(word) for word in words if word not in stopwords_set)
    corpus.append(text)



In [21]:
df.text.iloc[0] # Show the first email as stored in the dataframe: line breaks already replaced by spaces, but not yet lowercased, stripped of punctuation/stop words, or stemmed

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [20]:
corpus[0] # Show the fully processed (lowercased, punctuation-free, stop-word-filtered, stemmed) version of the same first email

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [24]:

# Bag-of-words features: one column per vocabulary term, one row of term counts per email.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()  # learn the vocabulary from the corpus, then densify to a NumPy array

Y = df['label_num']  # numeric class label per email (0 = ham, 1 = spam)

# 80/20 train/test split.
# random_state pins the shuffle so that re-running the notebook reproduces the same split;
# stratify=Y keeps the spam/ham proportion the same in the training and test partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)


In [25]:
X[0]  # Bag-of-words count vector for the first email: one entry per vocabulary term learned by the vectorizer

array([1, 0, 0, ..., 0, 0, 0])

In [26]:

# Random forest: an ensemble of decision trees whose predictions are combined for classification.

clf = RandomForestClassifier(
    n_jobs=-1,        # use all available CPU cores while fitting
    random_state=42,  # fix the forest's internal randomness so refitting on the same data gives the same model
)
clf.fit(X_train, Y_train)  # Train the classifier on the training partition

In [28]:
clf.score(X_test, Y_test) # Mean accuracy of the trained model on the held-out test data; the value is shown as the cell output

0.9816425120772947

In [29]:
email_to_classify = df.text.values[10] # Pick an example email to classify: the row at positional index 10 (i.e. the 11th email) of the dataframe
email_to_classify # Display the email content (before lowercasing, punctuation removal, stop-word filtering and stemming)


"Subject: vocable % rnd - word asceticism vcsc - brand new stock for your attention vocalscape inc - the stock symbol is : vcsc vcsc will be our top stock pick for the month of april - stock expected to bounce to 12 cents level the stock hit its all time low and will bounce back stock is going to explode in next 5 days - watch it soar watch the stock go crazy this and next week . breaking news - vocalscape inc . announces agreement to resell mix network services current price : $ 0 . 025 we expect projected speculative price in next 5 days : $ 0 . 12 we expect projected speculative price in next 15 days : $ 0 . 15 vocalscape networks inc . is building a company that ' s revolutionizing the telecommunications industry with the most affordable phone systems , hardware , online software , and rates in canada and the us . vocalscape , a company with global reach , is receiving international attention for the development of voice over ip ( voip ) application solutions , including the award 

In [34]:
# Apply exactly the same preprocessing used to build the training corpus to the single email:
# lowercase -> strip punctuation -> split into words -> drop stop words -> stem -> re-join.
email_tokens = email_to_classify.lower().translate(str.maketrans('', '', string.punctuation)).split()

# Iterate over THIS email's tokens. The previous version looped over `text`, the string left
# behind by the corpus-building loop, so it stemmed single characters of the last corpus entry
# and the prediction never depended on the email selected above.
email_text = ' '.join(stemmer.stem(word) for word in email_tokens if word not in stopwords_set)

email_corpus = [email_text] # The vectorizer expects an iterable of documents, so wrap the single email in a list

# Use transform (not fit_transform) so the email is encoded with the vocabulary learned from the training corpus.
X_email = vectorizer.transform(email_corpus)




In [35]:
clf.predict(X_email) # Predict the class of the vectorized email with the trained classifier: 1 = spam, 0 = ham

array([1])

In [37]:
df.label_num.iloc[10] # Ground-truth label of the same row (positional index 10) for comparison with the prediction: 1 = spam, 0 = ham

1