In [None]:
                                                    Natural Language Processing (NLP)

In [None]:
#Natural Language Processing (NLP) is a field of Artificial Intelligence (AI) that enables computers to understand, interpret, and generate human language. 
#It bridges the gap between human communication and machine intelligence.

#Why is NLP Important?
#NLP allows machines to process and analyze large amounts of text data, helping with:
1) Spam Detection – Identifying spam emails (the task tackled in this notebook)
2) Chatbots & Virtual Assistants – Siri, Alexa, Google Assistant
3) Sentiment Analysis – Understanding emotions in text (e.g., positive or negative reviews)
4) Machine Translation – Google Translate
5) Speech Recognition – Converting speech to text (e.g., voice commands)
6) Text Summarization – Automatically summarizing articles

In [None]:
# The dataset consists of 5,728 emails with two columns:
1) text (string) – This contains the email content, including both the subject line and email body.
Example: "Congratulations! You have won a free iPhone. Click the link to claim now!"

2)spam (integer: 0 or 1) – This is the label indicating whether the email is spam or not.
1 → Spam email (unwanted/junk email)
0 → Ham (not spam) (genuine email)

# NLP techniques used on this dataset:
a) Tokenization – Breaking text into words or sentences
b) Stopword Removal – Removing common words like "the", "is", "and"
c) Lemmatization – Converting words to their dictionary base form (e.g., "homes" → "home"; verbs such as "running" → "run" additionally need a part-of-speech tag)
d) Bag of Words (BoW) – Converting text into numerical features for the model
e) Machine Learning Models – Classifying emails as spam or not

# Step-by-step NLP analysis:
1) Data Cleaning & Preprocessing
2) Exploratory Data Analysis (EDA)
3) Feature Extraction (Bag of Words in this notebook; TF-IDF is a common alternative)
4) Spam Classification Model (if needed)


In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

In [31]:
# Location of the raw e-mail corpus (CSV).
# NOTE(review): absolute, machine-specific path — point this at your local copy of emails.csv.
EMAILS_CSV_PATH = r"C:\Users\vaibh\Downloads\emails.csv"
df = pd.read_csv(EMAILS_CSV_PATH)

In [33]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [35]:
df.tail()

Unnamed: 0,text,spam
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [37]:
df.shape

(5728, 2)

In [13]:
df["spam"].value_counts()

spam
0    4360
1    1368
Name: count, dtype: int64

In [15]:
# Download the NLTK resources this notebook relies on:
#   stopwords -> stopwords.words("english")
#   punkt     -> tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab -> tokenizer tables that word_tokenize loads on current NLTK releases
#   wordnet   -> dictionary behind WordNetLemmatizer
# nltk.download() reports packages that are already up to date instead of
# fetching them again, so re-running this cell is cheap.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
# Initialize lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()


In [21]:
stop_words = set(stopwords.words("english"))

In [23]:
stop_words 

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [25]:
def preprocess_text(text):
    """Normalise one e-mail into a space-separated string of lemmatised tokens.

    Pipeline:
      1. lowercase the text;
      2. replace every run of non-word characters (punctuation, symbols,
         whitespace) with a single space -- digits and underscores are word
         characters for the regex, so numbers are kept;
      3. tokenize with ``word_tokenize``;
      4. drop tokens found in ``stop_words``;
      5. lemmatize each remaining token (``lemmatize`` is called without a
         part-of-speech argument);
      6. join the tokens back together with single spaces.

    Parameters
    ----------
    text : str
        Raw e-mail text. Any non-string value (for example the NaN pandas
        produces for an empty CSV cell) is treated as an empty document
        instead of raising on ``.lower()``.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Guard: missing / non-text cells become an empty document.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    cleaned = re.sub(r'\W+', ' ', lowered)  # non-word runs -> one space (digits survive)
    tokens = word_tokenize(cleaned)
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(kept)

In [None]:
#preprocess_text is the function name: It takes text as input and applies text processing steps.

# Convert to Lowercase: This ensures that words like "Spam" and "spam" are treated the same.

# Replace Non-Word Characters (Punctuation and Special Characters)-
Uses Regular Expressions (re.sub) to replace each run of non-word characters (\W+) with a single space.
Removes punctuation (!, ?, ,, .) and special characters (@, #, $). Digits and underscores count as word characters, so numbers (123) are kept.

#Tokenization (Splitting into Words)-Breaks the cleaned text into individual words (tokens).

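#Stopword Removal & Lemmatization:
#Lemmatization: Reduces words to their dictionary base form (lemma).
Example: homes → home, charges → charge. WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is passed, so running → run needs pos="v" and better → good needs pos="a".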

#Stopword Removal: Removes common words that don’t add much meaning.
Example: "This is a test" → "test" (removing "this", "is", "a")

#return ' '.join(tokens)-Converts the processed word list back into a sentence.

In [27]:
# Build the cleaned-text column by running every raw e-mail through preprocess_text
df['clean_text'] = df['text'].map(preprocess_text)

In [29]:
df['clean_text']

0       subject naturally irresistible corporate ident...
1       subject stock trading gunslinger fanny merrill...
2       subject unbelievable new home made easy im wan...
3       subject 4 color printing special request addit...
4       subject money get software cd software compati...
                              ...                        
5723    subject research development charge gpg forwar...
5724    subject receipt visit jim thanks invitation vi...
5725    subject enron case study update wow day super ...
5726    subject interest david please call shirley cre...
5727    subject news aurora 5 2 update aurora version ...
Name: clean_text, Length: 5728, dtype: object

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
# CountVectorizer (scikit-learn) implements the Bag of Words (BoW) feature-extraction technique.
# BoW represents each document as a collection of word counts, ignoring grammar and word order.
# The text is therefore converted into a document-by-token matrix of frequencies.
# Created here with default settings; it is fitted on the cleaned text in a later cell.
vectorizer = CountVectorizer()

In [87]:
vectorizer

In [63]:
# Target vector: the 'spam' column (1 = spam, 0 = not spam).
y = df["spam"]
# Bag-of-Words features: learn the vocabulary from the cleaned text and count tokens per e-mail.
# NOTE(review): the vocabulary is learned from the full corpus before the train/test split.
X = vectorizer.fit_transform(df["clean_text"])

In [89]:
X

<5728x34548 sparse matrix of type '<class 'numpy.int64'>'
	with 534648 stored elements in Compressed Sparse Row format>

In [93]:
# Densify the Bag-of-Words matrix that was already built above, rather than
# re-fitting the vectorizer and re-tokenizing the whole corpus a second time.
# The hasattr guard keeps the cell safe to re-run once X is already a NumPy array.
# NOTE(review): a dense 5728 x 34548 int64 array needs roughly 1.6 GB of RAM;
# the scikit-learn estimators used below also accept the sparse matrix directly.
if hasattr(X, "toarray"):
    X = X.toarray()

In [95]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [91]:
y

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5728, dtype: int64

In [67]:
from sklearn.model_selection import train_test_split

In [69]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [97]:
from sklearn.naive_bayes import MultinomialNB

In [75]:
# MultinomialNB is a Naïve Bayes classifier designed for discrete features,
# such as the word counts produced by the Bag of Words step above.
# It models the features (token counts) with a multinomial distribution and is
# commonly paired with Bag of Words (BoW) or TF-IDF feature extraction.


# Initialize the Multinomial Naïve Bayes model and train it on the training split
model = MultinomialNB()
model.fit(X_train, y_train)

In [77]:
# Predict on test set
y_pred = model.predict(X_test)

In [79]:
y_pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [83]:
from sklearn.metrics import accuracy_score, classification_report

In [85]:
# Evaluate the Naïve Bayes predictions against the held-out test labels:
# first the overall accuracy, then the per-class precision / recall / F1 / support table.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9912739965095986
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       856
           1       0.97      0.99      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



In [None]:
#Interpretation
For class 0 (Non-Spam):

Precision = 1.00 → Virtually all emails predicted as non-spam were actually non-spam (the value is rounded to two decimals; a few spam emails were still missed).
Recall = 0.99 → 99% of actual non-spam emails were correctly identified.
F1-score = 0.99 → Model performs excellently.
For class 1 (Spam):

Precision = 0.97 → 97% of predicted spam emails were actually spam.
Recall = 0.99 → 99% of actual spam emails were detected.
F1-score = 0.98 → High performance for spam classification.

Class 0 (Non-Spam) → Support = 856
→ There were 856 actual non-spam emails in the test set.
Class 1 (Spam) → Support = 290
→ There were 290 actual spam emails in the test set.
The total number of test samples = 856 + 290 = 1146 (20% of the 5,728 emails, matching test_size=0.2).

# Overall Performance
Metric	Value
Accuracy	0.99 (99%)
Macro Average	0.99 (avg across both classes, equal weight)
Weighted Average	0.99 (accounts for class imbalance)
                  
Key Insights
High accuracy (99%): The model classifies most emails correctly.
Very high precision & recall: Few false positives & false negatives.
Class imbalance handled well: More non-spam emails than spam, but still performs well.

In [101]:
from sklearn.ensemble import RandomForestClassifier

In [103]:

# Train a Random Forest classifier on the same training split, for comparison
# with the Naïve Bayes model. n_estimators=100 builds 100 trees; random_state
# fixes the seed so repeated runs build the same forest.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [105]:
# Predict using Random Forest
y_pred_rf = rf_model.predict(X_test)

In [107]:
y_pred_rf 

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [109]:
# Evaluate the Random Forest predictions against the same held-out test labels:
# first the overall accuracy, then the per-class precision / recall / F1 / support table.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf}')
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9764397905759162
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       856
           1       1.00      0.91      0.95       290

    accuracy                           0.98      1146
   macro avg       0.98      0.95      0.97      1146
weighted avg       0.98      0.98      0.98      1146



In [None]:
# RandomForestClassifier After MultinomialNB
Even if Naïve Bayes (MultinomialNB) worked well, we try Random Forest for:
1)Improved Accuracy – If MultinomialNB is not accurate enough, RandomForestClassifier might capture more complex patterns.
2)Non-Linearity Handling – Naïve Bayes assumes independence between words, while Random Forest does not.
3) Feature Importance – Random Forest can tell us which words are most important in classification.
4) Handling Non-Text Data – If we want to combine text with other numerical features (like email metadata), Random Forest is a better choice.

In [None]:
#Conclusion
We started with MultinomialNB because it is simple and effective for text-based tasks, and then tried RandomForestClassifier as a comparison.
Since Multinomial Naïve Bayes (99.1% accuracy) outperforms Random Forest (97.6% accuracy), it is the better model for this spam classification task.