# Spam Detection System

In [2]:
# Objective: Build a spam detection system using Natural Language Processing (NLP)
# and Multinomial Naive Bayes for classification. The system will 
#classify messages as spam or ham (not spam).

In [3]:
# Import necessary libraries
import pandas as pd  # pandas is used for data manipulation and creating DataFrames
from sklearn.model_selection import train_test_split  # Function to split data into training and testing sets
from sklearn.feature_extraction.text import CountVectorizer  # Used for converting text data into numerical features
from sklearn.naive_bayes import MultinomialNB  # Multinomial Naive Bayes classifier for text classification
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Metrics for evaluating model performance
from sklearn.preprocessing import LabelEncoder  # Used to convert categorical labels into numerical format


In [7]:
# Step 1: Load the dataset
# Load the dataset
df = pd.read_csv('E:\\Honars(DS)\\Data Science\\Spam Ham Detection System\\spam.csv')  # Adjust path as necessary

# Check the column names to ensure you know what they are
print(df.columns)

# Display the first few rows of the dataset to inspect the data
print(df.head())


Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')
   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  


In [21]:
f = df.drop(columns=['Unnamed: 0'])  # Drop the unnecessary column

# Rename columns to 'label' and 'text'


In [17]:
# Step 2: Data exploration
# View basic information about the dataset
print("Data Info:")
print(df.info())  # Shows data types and non-null counts
  


Data Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   int32 
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int32(1), int64(2), object(1)
memory usage: 181.8+ KB
None


In [10]:
# Check for missing values in the dataset
print("\nMissing Values:")
print(df.isnull().sum())  # Check if there are any missing values in the dataset




Missing Values:
Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64


In [11]:
# Check for duplicates in the dataset
print("\nDuplicate Entries:")
print(df.duplicated().sum())  # Check how many rows are duplicates


#Hence now duplicate


Duplicate Entries:
0


In [12]:
# Step 3: Data cleaning
# Remove duplicate rows to avoid bias in training
df.drop_duplicates(inplace=True)

In [13]:
# Fill missing values (if any) by replacing them with empty strings
df.fillna('', inplace=True)


In [14]:
# Step 4: Encode the labels ('ham' -> 0, 'spam' -> 1)
# Label encoding converts categorical labels into numeric values. This is necessary for machine learning models.
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])




In [22]:
# Step 5: Feature extraction using CountVectorizer
# CountVectorizer converts the text messages into a feature matrix (Bag-of-Words model)
vectorizer = CountVectorizer(stop_words='english')  # Exclude common stop words
X = vectorizer.fit_transform(df['text'])  # Transform the 'text' column into feature vectors

In [23]:
# Step 6: Split the data into training and testing sets
# We'll use 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)


In [24]:
# Step 7: Build the Multinomial Naive Bayes model
# Initialize and train the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)


In [25]:
# Step 8: Make predictions on the test data
y_pred = model.predict(X_test)

In [26]:
# Step 9: Evaluate the model's performance
# Accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 97.58%


In [27]:
# Confusion Matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[728  14]
 [ 11 282]]


In [28]:
# Classification Report
print('Classification Report:')
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       742
           1       0.95      0.96      0.96       293

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [29]:
# Benefits of this approach:
# - **High Accuracy**: The model often achieves over 98% accuracy, meaning it can classify most messages correctly.
# - **Simplicity**: Using CountVectorizer for feature extraction and Naive Bayes for classification is a simple yet powerful approach.
# - **Scalability**: This approach can be easily scaled to work with a large number of messages.
# - **Effective for Text Data**: Naive Bayes is effective for text classification tasks like spam detection, where word frequency is important.