In [19]:
import csv
import random

def create_spam_dataset(filename="spam_dataset_new.csv"):
    spam_messages = [
        "Win a free iPhone now! Click the link below.",
        "Congratulations! You've won a lottery. Claim your prize!",
        "Limited-time offer! Buy one, get one free.",
        "Your account needs verification. Click here to resolve the issue.",
        "Earn money from home with this simple trick!",
        "Exclusive deal! Get 50% off on all purchases today.",
        "You have an unclaimed reward waiting for you!",
        "Act now! This special offer expires soon.",
        "Urgent: Your bank account has been compromised!",
        "Get rich quick with this investment opportunity."
    ]
    
    ham_messages = [
        "Hey, are we still on for lunch tomorrow?",
        "Don't forget to submit your assignment by Monday.",
        "Can you send me the notes from the last class?",
        "Reminder: Your dentist appointment is tomorrow at 3 PM.",
        "Let's meet up this weekend for a coffee.",
        "Are you coming to the gym later?",
        "Mom wants to know if you're visiting for dinner.",
        "Check out this cool article I found!",
        "See you at the meeting at 10 AM.",
        "Your Amazon order has been shipped."
    ]
    
    data = [[random.choice(spam_messages), "spam"] if random.random() < 0.5 else [random.choice(ham_messages), "ham"] for _ in range(1000)]
    
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["message", "label"])
        writer.writerows(data)
    
    print(f"Dataset saved as {filename}")

# Call the function to create the dataset
create_spam_dataset()


Dataset saved as spam_dataset_new.csv


In [20]:
import pandas as pd
messages = pd.read_csv('spam_dataset_new.csv')
print(messages)

                                               message label
0    Don't forget to submit your assignment by Monday.   ham
1         Win a free iPhone now! Click the link below.  spam
2       Can you send me the notes from the last class?   ham
3                 Check out this cool article I found!   ham
4             Hey, are we still on for lunch tomorrow?   ham
..                                                 ...   ...
995         Limited-time offer! Buy one, get one free.  spam
996       Earn money from home with this simple trick!  spam
997   Get rich quick with this investment opportunity.  spam
998  Congratulations! You've won a lottery. Claim y...  spam
999   Get rich quick with this investment opportunity.  spam

[1000 rows x 2 columns]


In [21]:
## Data cleaning and Preprocessing

import re 
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AIO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
corpus =[]

for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [58]:
corpus

['forget submit assignment monday',
 'win free iphone click link',
 'send note last class',
 'check cool article found',
 'hey still lunch tomorrow',
 'congratulation lottery claim prize',
 'amazon order shipped',
 'congratulation lottery claim prize',
 'exclusive deal get purchase today',
 'forget submit assignment monday',
 'check cool article found',
 'send note last class',
 'urgent bank account compromised',
 'congratulation lottery claim prize',
 'coming gym later',
 'limited time offer buy one get one free',
 'send note last class',
 'see meeting',
 'amazon order shipped',
 'send note last class',
 'earn money home simple trick',
 'reminder dentist appointment tomorrow pm',
 'check cool article found',
 'forget submit assignment monday',
 'check cool article found',
 'send note last class',
 'reminder dentist appointment tomorrow pm',
 'coming gym later',
 'exclusive deal get purchase today',
 'act special offer expires soon',
 'limited time offer buy one get one free',
 'remind

In [59]:
## CReate the Bag of Words model
# (from GFG)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

vectorizer.fit(corpus)

print("Vocabulary : " , vectorizer.vocabulary_)

# Encode the corpus
vector = vectorizer.transform(corpus)
print(vector.toarray())

Vocabulary :  {'forget': 23, 'submit': 67, 'assignment': 5, 'monday': 44, 'win': 79, 'free': 25, 'iphone': 31, 'click': 11, 'link': 38, 'send': 61, 'note': 47, 'last': 34, 'class': 10, 'check': 8, 'cool': 16, 'article': 4, 'found': 24, 'hey': 28, 'still': 66, 'lunch': 40, 'tomorrow': 70, 'congratulation': 15, 'lottery': 39, 'claim': 9, 'prize': 53, 'amazon': 2, 'order': 51, 'shipped': 62, 'exclusive': 21, 'deal': 17, 'get': 26, 'purchase': 54, 'today': 69, 'urgent': 73, 'bank': 6, 'account': 0, 'compromised': 14, 'coming': 13, 'gym': 27, 'later': 35, 'limited': 37, 'time': 68, 'offer': 48, 'buy': 7, 'one': 49, 'see': 60, 'meeting': 42, 'earn': 20, 'money': 45, 'home': 29, 'simple': 63, 'trick': 71, 'reminder': 56, 'dentist': 18, 'appointment': 3, 'pm': 52, 'act': 1, 'special': 65, 'expires': 22, 'soon': 64, 'rich': 59, 'quick': 55, 'investment': 30, 'opportunity': 50, 'let': 36, 'meet': 41, 'weekend': 78, 'coffee': 12, 'unclaimed': 72, 'reward': 58, 'waiting': 76, 'mom': 43, 'want': 77

In [60]:
# from course

cv = CountVectorizer(max_features=1000) # out of all these words from the vocab, take the 1000 words which are having the max frequency
X = cv.fit_transform(corpus).toarray()
X.shape

(1000, 80)

In [61]:
# for BOW enable binary = True
cv = CountVectorizer(max_features=70, binary =True) # out of all these words from the vocab, take the 70 words which are having the max frequency
X_Binary = cv.fit_transform(corpus).toarray()
X_Binary
X_Binary.shape

print(cv.vocabulary_)
print(len(cv.vocabulary_))

{'forget': 20, 'submit': 59, 'assignment': 4, 'monday': 39, 'win': 69, 'free': 22, 'iphone': 27, 'click': 10, 'link': 33, 'send': 55, 'note': 42, 'last': 30, 'class': 9, 'check': 7, 'cool': 14, 'article': 3, 'found': 21, 'hey': 24, 'still': 58, 'lunch': 35, 'tomorrow': 62, 'congratulation': 13, 'lottery': 34, 'claim': 8, 'prize': 48, 'amazon': 1, 'order': 46, 'shipped': 56, 'exclusive': 19, 'deal': 15, 'get': 23, 'purchase': 49, 'today': 61, 'urgent': 64, 'bank': 5, 'account': 0, 'compromised': 12, 'limited': 32, 'time': 60, 'offer': 43, 'buy': 6, 'one': 44, 'see': 54, 'meeting': 37, 'earn': 18, 'money': 40, 'home': 25, 'simple': 57, 'trick': 63, 'reminder': 51, 'dentist': 16, 'appointment': 2, 'pm': 47, 'rich': 53, 'quick': 50, 'investment': 26, 'opportunity': 45, 'let': 31, 'meet': 36, 'weekend': 68, 'coffee': 11, 'mom': 38, 'want': 67, 'know': 29, 'visiting': 66, 'dinner': 17, 'need': 41, 'verification': 65, 'resolve': 52, 'issue': 28}
70


### N-Grams


In [62]:
# with ngram
cv_ngrams = CountVectorizer(max_features=70, binary =True, ngram_range=(1,1))
X_ngram = cv_ngrams.fit_transform(corpus).toarray()
cv_ngrams.vocabulary_

{'forget': 20,
 'submit': 59,
 'assignment': 4,
 'monday': 39,
 'win': 69,
 'free': 22,
 'iphone': 27,
 'click': 10,
 'link': 33,
 'send': 55,
 'note': 42,
 'last': 30,
 'class': 9,
 'check': 7,
 'cool': 14,
 'article': 3,
 'found': 21,
 'hey': 24,
 'still': 58,
 'lunch': 35,
 'tomorrow': 62,
 'congratulation': 13,
 'lottery': 34,
 'claim': 8,
 'prize': 48,
 'amazon': 1,
 'order': 46,
 'shipped': 56,
 'exclusive': 19,
 'deal': 15,
 'get': 23,
 'purchase': 49,
 'today': 61,
 'urgent': 64,
 'bank': 5,
 'account': 0,
 'compromised': 12,
 'limited': 32,
 'time': 60,
 'offer': 43,
 'buy': 6,
 'one': 44,
 'see': 54,
 'meeting': 37,
 'earn': 18,
 'money': 40,
 'home': 25,
 'simple': 57,
 'trick': 63,
 'reminder': 51,
 'dentist': 16,
 'appointment': 2,
 'pm': 47,
 'rich': 53,
 'quick': 50,
 'investment': 26,
 'opportunity': 45,
 'let': 31,
 'meet': 36,
 'weekend': 68,
 'coffee': 11,
 'mom': 38,
 'want': 67,
 'know': 29,
 'visiting': 66,
 'dinner': 17,
 'need': 41,
 'verification': 65,
 'resolv

In [68]:
cv_ngrams = CountVectorizer(max_features=70, binary =True, ngram_range=(2,3))
X_ngram = cv_ngrams.fit_transform(corpus).toarray()
print(cv_ngrams.vocabulary_)

X_ngram

{'forget submit': 18, 'assignment monday': 4, 'forget submit assignment': 19, 'win free': 68, 'free iphone': 20, 'iphone click': 28, 'click link': 11, 'win free iphone': 69, 'free iphone click': 21, 'iphone click link': 29, 'send note': 56, 'note last': 43, 'last class': 32, 'send note last': 57, 'note last class': 44, 'check cool': 9, 'cool article': 12, 'article found': 3, 'check cool article': 10, 'cool article found': 13, 'urgent bank': 62, 'bank account': 5, 'account compromised': 0, 'urgent bank account': 63, 'bank account compromised': 6, 'limited time': 35, 'time offer': 59, 'offer buy': 45, 'buy one': 7, 'one get': 48, 'get one': 22, 'one free': 47, 'limited time offer': 36, 'time offer buy': 60, 'offer buy one': 46, 'buy one get': 8, 'one get one': 49, 'get one free': 23, 'earn money': 16, 'money home': 41, 'home simple': 25, 'simple trick': 58, 'earn money home': 17, 'money home simple': 42, 'home simple trick': 26, 'reminder dentist': 52, 'dentist appointment': 14, 'appoint

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0