In [1]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vinod.arokiya.raj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vinod.arokiya.raj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vinod.arokiya.raj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from nltk import pos_tag

In [3]:
df = pd.read_csv('spam.csv', encoding='latin-1')
dataset = list(df['v2'])
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Tokenization
tokenized_dataset = [word_tokenize(text) for text in dataset]

In [5]:
# Lowercasing
lowercased_dataset = [[word.lower() for word in text] for text in tokenized_dataset]

In [6]:
# Removing Punctuation
translator = str.maketrans('', '', string.punctuation)
no_punctuation_dataset = [[word.translate(translator) for word in text] for text in lowercased_dataset]


In [7]:
# Removing Stopwords
stop_words = set(stopwords.words('english'))
processed_dataset = [[word for word in text if word not in stop_words] for text in no_punctuation_dataset]


In [8]:
# Print processed dataset
text_data = []
for i, text in enumerate(processed_dataset):
    print(f"Processed Text {i+1}: {' '.join(text)}")
    text_data.append(' '.join(text))
    


Processed Text 1: go jurong point  crazy  available bugis n great world la e buffet  cine got amore wat 
Processed Text 2: ok lar  joking wif u oni 
Processed Text 3: free entry 2 wkly comp win fa cup final tkts 21st may 2005  text fa 87121 receive entry question  std txt rate   c apply 08452810075over18
Processed Text 4: u dun say early hor  u c already say 
Processed Text 5: nah nt think goes usf  lives around though
Processed Text 6: freemsg hey darling 3 week word back  like fun still  tb ok  xxx std chgs send  å£150 rcv
Processed Text 7: even brother like speak  treat like aids patent 
Processed Text 8: per request melle melle  oru minnaminunginte nurungu vettam   set callertune callers  press  9 copy friends callertune
Processed Text 9: winner   valued network customer selected receivea å£900 prize reward  claim call 09061701461  claim code kl341  valid 12 hours 
Processed Text 10: mobile 11 months  u r entitled update latest colour mobiles camera free  call mobile update co free

In [9]:
# Initialize CountVectorizer with desired options
vectorizer = CountVectorizer()
# Fit the vectorizer to the text data and transform the text data into BOW vectors
bow_features = vectorizer.fit_transform(text_data)


In [10]:
# Convert the BOW vectors to an array for inspection
bow_features_array = bow_features.toarray()

# Get the feature names (words) from the vectorizer
feature_names = vectorizer.get_feature_names_out()

In [11]:
# Print the BOW features and their corresponding feature names
print("BOW Features:")
print(bow_features_array)
print("\nFeature Names:")
print(feature_names)

BOW Features:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Feature Names:
['008704050406' '0089' '0121' ... 'ûïharry' 'ûò' 'ûówell']


In [12]:
labels = df['v1']

# Initialize CountVectorizer with desired options
vectorizer = CountVectorizer()

# Fit and transform the vectorizer to convert text data into BOW features
X = vectorizer.fit_transform(text_data)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict on the testing data
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9739910313901345


In [13]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')

# Calculate F1-score
f1 = f1_score(y_test, y_pred, average='weighted')

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Confusion Matrix:")
print(cm)

Accuracy: 0.9739910313901345
Precision: 0.9744268045710979
Recall: 0.9739910313901345
F1-score: 0.9741704417761169
Confusion Matrix:
[[948  17]
 [ 12 138]]


In [14]:
corpus = text_data[:5]
corpus

['go jurong point  crazy  available bugis n great world la e buffet  cine got amore wat ',
 'ok lar  joking wif u oni ',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005  text fa 87121 receive entry question  std txt rate   c apply 08452810075over18',
 'u dun say early hor  u c already say ',
 'nah nt think goes usf  lives around though']

In [15]:
tagged_corpus = []
for sentence in corpus:
    # Tokenize the sentence
    tokens = word_tokenize(sentence)
    # Perform POS tagging
    tagged_tokens = pos_tag(tokens)
    # Append tagged tokens to tagged_corpus
    tagged_corpus.append(tagged_tokens)

# Print the tagged corpus
for i, tagged_tokens in enumerate(tagged_corpus):
    print(f"Sentence {i+1}:")
    print(tagged_tokens)
    print()

Sentence 1:
[('go', 'VB'), ('jurong', 'JJ'), ('point', 'NN'), ('crazy', 'NN'), ('available', 'JJ'), ('bugis', 'NN'), ('n', 'RB'), ('great', 'JJ'), ('world', 'NN'), ('la', 'NN'), ('e', 'VBP'), ('buffet', 'JJ'), ('cine', 'NN'), ('got', 'VBD'), ('amore', 'RB'), ('wat', 'JJ')]

Sentence 2:
[('ok', 'JJ'), ('lar', 'JJ'), ('joking', 'NN'), ('wif', 'NN'), ('u', 'JJ'), ('oni', 'NN')]

Sentence 3:
[('free', 'JJ'), ('entry', 'NN'), ('2', 'CD'), ('wkly', 'JJ'), ('comp', 'NN'), ('win', 'VBP'), ('fa', 'JJ'), ('cup', 'NN'), ('final', 'JJ'), ('tkts', 'NN'), ('21st', 'CD'), ('may', 'MD'), ('2005', 'CD'), ('text', 'NN'), ('fa', 'NN'), ('87121', 'CD'), ('receive', 'JJ'), ('entry', 'NN'), ('question', 'NN'), ('std', 'VBD'), ('txt', 'JJ'), ('rate', 'NN'), ('c', 'NNS'), ('apply', 'VBP'), ('08452810075over18', 'CD')]

Sentence 4:
[('u', 'JJ'), ('dun', 'NNS'), ('say', 'VBP'), ('early', 'JJ'), ('hor', 'NN'), ('u', 'JJ'), ('c', 'NN'), ('already', 'RB'), ('say', 'VB')]

Sentence 5:
[('nah', 'JJ'), ('nt', 'NN'), 