In [40]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



# Step 2: Load the Dataset

In [29]:
# Location of the raw SMS spam CSV. This is a machine-specific absolute path;
# point it at your own copy of spam.csv before running.
SPAM_CSV_PATH = "C:\\Users\\LENOVO\\Desktop\\project\\spam\\spam.csv"

# The file is not UTF-8, so it is decoded as ISO-8859-1.
df1 = pd.read_csv(SPAM_CSV_PATH, encoding='iso-8859-1')

# Step 3: Data Cleaning & Column Renaming

In [30]:
# Peek at the first rows. The raw file names its columns 'v1' (the ham/spam
# label) and 'v2' (the message text), followed by 'Unnamed' columns that are
# NaN in these rows.
print(df1.head(n=5))

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [34]:
# Dimensions of the raw frame as (rows, columns).
df1.shape

(5572, 5)

In [33]:
# Per-column non-null counts and dtypes, to see which columns actually carry data.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [36]:
# Discard the three almost-empty 'Unnamed' columns (only 50, 12 and 6 non-null
# values respectively). Whatever those few cells contain is thrown away here.
# NOTE(review): they presumably hold overflow text from messages that were
# split on commas -- confirm that losing it is acceptable.
#
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
df1 = df1.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')


In [44]:
# Give the raw columns descriptive names used by the rest of the notebook:
# 'v1' holds the ham/spam label and 'v2' the message text.
# rename() ignores keys that are absent, so re-running this cell is harmless;
# rebinding is used instead of inplace=True, which pandas discourages.
df1 = df1.rename(columns={'v1': 'label', 'v2': 'text'})

# Step 4: Data Preprocessing

In [45]:
# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize a raw message before vectorization.

    Removes ASCII punctuation (``string.punctuation``), lower-cases the text
    and deletes every run of digits. Whitespace is not touched, so removed
    tokens can leave doubled or trailing spaces behind.

    Args:
        text: The raw message. ``None`` and float NaN (the missing-value
            marker pandas puts in object columns) are treated as an empty
            message; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned, lower-cased string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    return re.sub(r'\d+', '', text)

# Clean every message in place of the raw text column.
df1['text'] = df1['text'].map(preprocess_text)


# Step 5: Feature Extraction

In [49]:
import nltk

In [50]:
!pip install nltk



In [51]:
# Tokenizer models needed by nltk.word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the older pickled 'punkt' models, so fetch
# both; on an older NLTK the unknown 'punkt_tab' id is only reported, not raised.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [53]:
# Length of each cleaned message, in characters.
df1['num_characters'] = df1['text'].map(len)

In [55]:
# Preview the first rows, now including the num_characters column.
df1.head()

Unnamed: 0,label,text,num_characters
0,0,go until jurong point crazy available only in ...,102
1,0,ok lar joking wif u oni,23
2,1,free entry in a wkly comp to win fa cup final...,124
3,0,u dun say so early hor u c already then say,43
4,0,nah i dont think he goes to usf he lives aroun...,59


In [57]:
def _count_tokens(message):
    """Return how many tokens NLTK's word tokenizer finds in ``message``."""
    return len(nltk.word_tokenize(message))


# Number of tokens in each cleaned message.
df1['num_words'] = df1['text'].map(_count_tokens)

In [59]:
# Preview the first rows, now including the num_words column.
df1.head()

Unnamed: 0,label,text,num_characters,num_words
0,0,go until jurong point crazy available only in ...,102,20
1,0,ok lar joking wif u oni,23,6
2,1,free entry in a wkly comp to win fa cup final...,124,25
3,0,u dun say so early hor u c already then say,43,11
4,0,nah i dont think he goes to usf he lives aroun...,59,13


In [46]:
# Convert labels to binary values: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them a second run maps the already-numeric labels to NaN.
label_to_int = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
binary_labels = df1['label'].map(label_to_int)
if binary_labels.isna().any():
    # Fail loudly instead of passing NaN targets on to the classifier.
    unexpected = df1.loc[binary_labels.isna(), 'label'].unique().tolist()
    raise ValueError(f'Unexpected label values: {unexpected}')
df1['label'] = binary_labels

# Split the data into training and testing sets (80/20, fixed seed so the
# split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(df1['text'], df1['label'], test_size=0.2, random_state=42)

# Transform the text data to TF-IDF features. The vocabulary and IDF weights
# are learned from the training split only and then applied to the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



# Step 6: Model Training

In [47]:
# Multinomial Naive Bayes classifier with scikit-learn's default settings.
model = MultinomialNB()

# Fit it on the TF-IDF features of the training messages.
model.fit(X=X_train_tfidf, y=y_train)


# Step 7: Model Evaluation

In [48]:
# Score the held-out messages with the trained model.
y_pred = model.predict(X_test_tfidf)

# Compute all three evaluation artefacts up front...
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# ...then report them: overall accuracy first, followed by the titled
# confusion matrix and per-class precision/recall report.
print(f'Accuracy: {accuracy:.2f}')
for heading, body in (
    ('Confusion Matrix:', conf_matrix),
    ('Classification Report:', class_report),
):
    print(heading)
    print(body)


Accuracy: 0.95
Confusion Matrix:
[[965   0]
 [ 54  96]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       1.00      0.64      0.78       150

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.88      1115
weighted avg       0.95      0.95      0.95      1115



# Step 8: Example Predictions

In [60]:
def classify_email(email):
    """Label a single raw message as ``'spam'`` or ``'ham'``.

    The message is cleaned with ``preprocess_text``, turned into TF-IDF
    features by the already-fitted ``vectorizer`` and scored by the trained
    ``model``.

    Args:
        email: The raw message text.

    Returns:
        ``'spam'`` when the model predicts class 1, otherwise ``'ham'``.
    """
    cleaned = preprocess_text(email)
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'spam'
    return 'ham'

# Two sample messages, each run through the classifier below.
emails = [
    "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123456 to claim now.",
    "Hi team, Please find the project report attached. Let me know if you have any questions.",
]
for email in emails:
    verdict = classify_email(email)
    print(f'Email: "{email}" is classified as: {verdict}')


Email: "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123456 to claim now." is classified as: spam
Email: "Hi team, Please find the project report attached. Let me know if you have any questions." is classified as: ham
