In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Load the SMS spam dataset
sms_data = pd.read_csv('SMSSpamcollection.csv', encoding='latin-1')
sms_data.columns = ['label', 'text']

# Split the data into ham and spam dataframes
ham_data = sms_data[sms_data['label'] == 'ham']
spam_data = sms_data[sms_data['label'] == 'spam']


# Print the first few rows of each dataframe to verify
print("HAM MESSAGES:\n", ham_data.head())
print("\nSPAM MESSAGES:\n", spam_data.head())

# Split the ham_data into training and testing datasets
train_data, test_data = train_test_split(ham_data, test_size=0.4, random_state=42)

# Split the ham_data into training and testing datasets
train_data, test_data = train_test_split(spam_data, test_size=0.4, random_state=42)

# Combine the ham and spam training datasets
train_data = pd.concat([ham_data, spam_data], axis=0)

# Combine the ham and spam testing datasets
test_data = pd.concat([ham_data, spam_data], axis=0)

# Print the number of rows in each dataset
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)


# Create character n-grams using CountVectorizer
cv = CountVectorizer(ngram_range=(1, 3), analyzer='char')
train_counts = cv.fit_transform(train_data['text'])
test_counts = cv.transform(test_data['text'])

# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(train_counts, train_data['label'])

# Evaluate the classifier on the testing set
predictions = clf.predict(test_counts)
accuracy = accuracy_score(test_data['label'], predictions)
confusion = confusion_matrix(test_data['label'], predictions)

print("Accuracy:", accuracy)
print("Confusion matrix:")
print(confusion)

# Load the predicted labels and true labels
y_pred = ['spam', 'ham', 'spam', 'spam', 'ham']
y_true = ['spam', 'spam', 'ham', 'ham', 'ham']

# Calculate the accuracy score
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy score:", accuracy)

# Calculate the precision score
precision = precision_score(y_true, y_pred, pos_label='spam')
print("Precision score:", precision)

# Calculate the recall score
recall = recall_score(y_true, y_pred, pos_label='spam')
print("Recall score:", recall)

# Calculate the F1 score
f1 = f1_score(y_true, y_pred, pos_label='spam')
print("F1 score:", f1)


HAM MESSAGES:
   label                                               text
0   ham                      Ok lar... Joking wif u oni...
2   ham  U dun say so early hor... U c already then say...
3   ham  Nah I don't think he goes to usf, he lives aro...
5   ham  Even my brother is not like to speak with me. ...
6   ham  As per your request 'Melle Melle (Oru Minnamin...

SPAM MESSAGES:
    label                                               text
1   spam  Free entry in 2 a wkly comp to win FA Cup fina...
4   spam  FreeMsg Hey there darling it's been 3 week's n...
7   spam  WINNER!! As a valued network customer you have...
8   spam  Had your mobile 11 months or more? U R entitle...
10  spam  SIX chances to win CASH! From 100 to 20,000 po...
Training data shape: (5571, 2)
Testing data shape: (5571, 2)
Accuracy: 0.9876144318793754
Confusion matrix:
[[4795   29]
 [  40  707]]
Accuracy score: 0.4
Precision score: 0.3333333333333333
Recall score: 0.5
F1 score: 0.4
