In [50]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [51]:
# Load the raw CSV, decoding it as Latin-1 instead of pandas' default UTF-8.
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)

In [52]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [53]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(5572, 5)

In [54]:
# Give the two useful columns descriptive names, then discard the unnamed
# columns that came along from the CSV.
# `errors='ignore'` keeps this cell re-runnable: `df` is rebound here, so on a
# second execution the unnamed columns are already gone and a plain drop would
# raise KeyError. `rename` already skips labels that are not present.
df = (
    df
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
)

In [55]:
# Preview again to confirm the rename and column removal took effect.
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [56]:
# Dimensions after cleaning; only the label and text columns should remain.
df.shape

(5572, 2)

In [57]:
# Summarize column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [58]:
# Hold out 20% of the messages for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

### Bag of Words

In [59]:
# Bag of Words (BoW): the vocabulary is learned from the training messages only,
# and the test messages are then encoded with that same vocabulary.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(raw_documents=X_train)
X_test_bow = bow_vectorizer.transform(raw_documents=X_test)

In [60]:
# Fit a multinomial Naive Bayes model on the Bag of Words counts.
# `fit` returns the estimator itself, so it can be bound in one step.
classifier_bow = MultinomialNB().fit(X_train_bow, y_train)
classifier_bow

MultinomialNB()

In [61]:
# Label the held-out messages with the Bag of Words model.
y_pred_bow = classifier_bow.predict(X=X_test_bow)

In [62]:
# Share of held-out messages the Bag of Words model labelled correctly.
accuracy_bow = accuracy_score(y_true=y_test, y_pred=y_pred_bow)
print("Accuracy (Bag of Words):", accuracy_bow)

Accuracy (Bag of Words): 0.9838565022421525


### Tf-Idf

In [63]:
# Tf-Idf: the vocabulary and IDF weights are learned from the training messages
# only, and the test messages are then encoded with the fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [64]:
# Fit a multinomial Naive Bayes model on the Tf-Idf weights.
# `fit` returns the estimator itself, so it can be bound in one step.
classifier_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
classifier_tfidf

MultinomialNB()

In [65]:
# Label the held-out messages with the Tf-Idf model.
y_pred_tfidf = classifier_tfidf.predict(X=X_test_tfidf)

In [66]:
# Share of held-out messages the Tf-Idf model labelled correctly.
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)
print("Accuracy (Tf-Idf):", accuracy_tfidf)

Accuracy (Tf-Idf): 0.9623318385650225


In [2]:
# Sample Spam-Ham dataset, written as (message, label) pairs: 1 for spam, 0 for ham.
# zip() turns the pairs into one column of messages and one column of labels.
emails, labels = (
    list(column)
    for column in zip(
        ("Buy cheap watches!", 1),
        ("Hello, how are you?", 0),
        ("Get a free gift now!", 1),
        ("Meeting at 2 PM", 0),
        ("Limited time offer, don't miss out!", 1),
        ("Reminder: Appointment tomorrow", 0),
    )
)

In [3]:
# Step 1: hold out 20% of the toy emails for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    emails,
    labels,
    test_size=0.2,
    random_state=42,
)

### Bag of Words approach

In [4]:
# Bag of Words: build the vocabulary from the training emails, then encode
# both splits as token counts with that vocabulary.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(raw_documents=X_train)
X_test_bow = bow_vectorizer.transform(raw_documents=X_test)

In [5]:
# Fit multinomial Naive Bayes on the toy Bag of Words counts.
# `fit` returns the estimator itself, so it can be bound in one step.
classifier_bow = MultinomialNB().fit(X_train_bow, y_train)
classifier_bow

MultinomialNB()

In [6]:
# Label the held-out toy emails with the Bag of Words model.
y_pred_bow = classifier_bow.predict(X=X_test_bow)

In [7]:
# Share of held-out toy emails the Bag of Words model labelled correctly.
accuracy_bow = accuracy_score(y_true=y_test, y_pred=y_pred_bow)
print("Accuracy (Bag of Words):", accuracy_bow)

Accuracy (Bag of Words): 0.5


### Tf-Idf approach

In [8]:
# Tf-Idf: learn the vocabulary and IDF weights from the training emails, then
# encode both splits with the fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [9]:
# Fit multinomial Naive Bayes on the toy Tf-Idf weights.
# `fit` returns the estimator itself, so it can be bound in one step.
classifier_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
classifier_tfidf

MultinomialNB()

In [10]:
# Label the held-out toy emails with the Tf-Idf model.
y_pred_tfidf = classifier_tfidf.predict(X=X_test_tfidf)

In [11]:
# Share of held-out toy emails the Tf-Idf model labelled correctly.
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)
print("Accuracy (Tf-Idf):", accuracy_tfidf)

Accuracy (Tf-Idf): 0.5
