In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix
from sklearn.model_selection import train_test_split


# Load SMS spam dataset
sms_data = pd.read_csv('SMSSpamCollection.csv', encoding='ISO-8859-1')
sms_data.columns = ['label', 'text']

# Get the vocabulary and its length
vocabulary = vectorizer.vocabulary_
total_symbols = len(vocabulary)

print("Total number of symbols in vocabulary:", total_symbols)

# Split the data into ham and spam dataframes
ham_data = sms_data[sms_data['label'] == 'ham']
spam_data = sms_data[sms_data['label'] == 'spam']

# Print the first few rows of each dataframe to verify
print("HAM MESSAGES:\n", ham_data.head())
print("\nSPAM MESSAGES:\n", spam_data.head())

# Split the ham_data into training and testing datasets
train_data, test_data = train_test_split(ham_data, test_size=0.4, random_state=42)

# Split the spam_data into training and testing datasets
train_data, test_data = train_test_split(spam_data, test_size=0.4, random_state=42)

# Combine the ham and spam training datasets
train_data = pd.concat([ham_data, spam_data], axis=0)

# Combine the ham and spam testing datasets
test_data = pd.concat([ham_data, spam_data], axis=0)


# Preprocess text data
sms_data['text'] = sms_data['text'].str.lower()
sms_data['text'] = sms_data['text'].str.replace('[^a-zA-Z0-9\s]', '')
sms_data['text'] = sms_data['text'].str.strip()

# Extract character n-gram features from text data
vectorizer = CountVectorizer(ngram_range=(2,4), analyzer='char')
X_train_features = vectorizer.fit_transform(train_data['text'])
X_test_features = vectorizer.transform(test_data['text'])

# Train a Naive Bayes classifier on the training set
clf = MultinomialNB()
clf.fit(X_train_features, train_data['label'])

# Extract features from test data using the same CountVectorizer object
test_counts = vectorizer.transform(test_data['text'])

# Predict labels on the test set
y_pred = clf.predict(test_counts)

# Compute the confusion matrix
confusion = confusion_matrix(test_data['label'], y_pred)

# Evaluate the performance of the classifier
accuracy = accuracy_score(test_data['label'], y_pred)
precision = precision_score(test_data['label'], y_pred, pos_label='spam')
recall = recall_score(test_data['label'], y_pred, pos_label='spam')
f1 = f1_score(test_data['label'], y_pred, pos_label='spam')

print('Confusion matrix:\n', confusion)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)


Total number of symbols in vocabulary: 59940
HAM MESSAGES:
   label                                               text
0   ham                      Ok lar... Joking wif u oni...
2   ham  U dun say so early hor... U c already then say...
3   ham  Nah I don't think he goes to usf, he lives aro...
5   ham  Even my brother is not like to speak with me. ...
6   ham  As per your request 'Melle Melle (Oru Minnamin...

SPAM MESSAGES:
    label                                               text
1   spam  Free entry in 2 a wkly comp to win FA Cup fina...
4   spam  FreeMsg Hey there darling it's been 3 week's n...
7   spam  WINNER!! As a valued network customer you have...
8   spam  Had your mobile 11 months or more? U R entitle...
10  spam  SIX chances to win CASH! From 100 to 20,000 po...


  sms_data['text'] = sms_data['text'].str.replace('[^a-zA-Z0-9\s]', '')


Confusion matrix:
 [[4805   19]
 [  28  719]]
Accuracy: 0.9915634535989948
Precision: 0.9742547425474255
Recall: 0.9625167336010709
F1-score: 0.9683501683501684
