In [5]:
import pandas as pd

In [7]:
#Loading the Dataset
data = pd.read_csv('spam.csv', encoding='latin-1')

In [9]:
# Checking the first few rows
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [11]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
# Erase the other extra columns
data = data[['v1', 'v2']]

In [15]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
import re       # regular expressions
import nltk     # Natural Languages toolkit
from nltk.corpus import stopwords #  "the", "is", "and"
from nltk.tokenize import word_tokenize 

In [18]:
# Remove unnecessary columns
#data = data[['v1', 'v2']]
data.columns = ['label', 'message']

In [19]:
# Define a function to clean the text
stop_words = set(stopwords.words('english'))

In [20]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [21]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

In [22]:
# Apply the function to the message column
data['cleaned_text'] = data['message'].apply(clean_text)

In [23]:
# Label encoding: conver the spam/labels into binary values 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

In [24]:
# Vectorize Text Data
# TF-IDF Vectorization: Covert the cleaned text into numerical features using TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000)  # Limit to 3000 features
X = vectorizer.fit_transform(data['cleaned_text']).toarray()
y = data['label']

In [25]:
# Split the Data 
# Training and Testing Split: Split the data into training and testing sets. 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Train a Model
# Train a Naive Bayes Model: Naive Bayes is simple and effective for text classification.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [27]:
# Model Evaluation
# Make Predictions: Predict on Test Data 
y_pred = model.predict(X_test)

In [28]:
# Evaluate the Performance: Check accuracy, confusion, matrix, and classification report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

Accuracy: 0.97847533632287
Confusion Matrix:
[[964   1]
 [ 23 127]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.85      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [29]:
# Save the Model(Optional)
import joblib 

joblib.dump(model, 'spam_classifier.pkl')

['spam_classifier.pkl']

In [30]:
# Test the Model with New Data(Optional)
# Test with New Emails: Load new emails and predict whether they are spam or not 
new_email = ["Congratulations! You've won a free ticket to Bahamas!"]
new_email_cleaned = vectorizer.transform([clean_text(new_email[0])])
print(model.predict(new_email_cleaned))

[1]


In [31]:
# Another Test
new_email = ["Congratulations! You've won a billion dollar😯!"]
new_email_cleaned = vectorizer.transform([clean_text(new_email[0])])
print(model.predict(new_email_cleaned))

[1]


In [32]:
# Another Test
new_email = ["Hey, are we still on for lunch tomorrow at 1 PM? Let me know!"]
new_email_cleaned = vectorizer.transform([clean_text(new_email[0])])
print(model.predict(new_email_cleaned))

[0]
