There are a lot of imports scattered around, but I tried to use markdown cells so no one would get lost.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# Load the spam dataset into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
data = pd.read_csv('spamData.csv')

Here's where I used clustering: KMeans on synthetic data generated with make_blobs (after a quick look at the spam data).

In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of `data`.
data.info()

In [None]:
# Show pandas' descriptive statistics for `data`.
data.describe()

In [None]:
# Generate 3000 synthetic points grouped around 5 centers.
# random_state is pinned so the blobs -- and every clustering/classification
# result derived from them in later cells -- are identical after a kernel
# restart. Without it, each run produces a different dataset.
X, y = make_blobs(n_samples=3000, centers=5, random_state=0)

# Plot the raw points using the first two feature columns.
plt.scatter(X[:, 0], X[:, 1])
plt.show()

In [None]:
# Fit k-means on the blob features and keep the cluster index of each sample.
# random_state pins the centroid initialisation so the labels (and the
# silhouette score computed from them) are reproducible across runs.
# NOTE(review): the blobs are generated with centers=5 but only 2 clusters are
# requested here -- confirm this is intentional.
kmeans = KMeans(n_clusters=2, n_init='auto', random_state=0)
y_pred = kmeans.fit_predict(X)

In [None]:
# Display the cluster index that k-means assigned to each sample.
y_pred

In [None]:
# Silhouette score of the k-means labelling over the blob features.
silhouette_score(X, labels=y_pred)

In [None]:
# Scatter the first two feature columns, coloured by k-means cluster index.
plt.scatter(
    x=X[:, 0],
    y=X[:, 1],
    c=y_pred,
)
plt.show()

Here's where I used a KNeighbors classifier.

In [None]:
# Hold out 35% of the blob samples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=0,
)

In [None]:
# k-nearest-neighbours classifier that votes over the 4 closest samples.
classifier = KNeighborsClassifier(n_neighbors=4)

In [None]:
# Train on the training split, then label the held-out samples.
# fit() returns the estimator itself, so the two calls can be chained.
predictions = classifier.fit(X_train, y_train).predict(X_test)

In [None]:
# Print true vs. predicted labels, each under its own heading line.
print('Actual labels of the test set:', y_test, sep='\n')
print('Predicted labels of the test set:', predictions, sep='\n')

In [None]:
# Confusion matrix of the KNN predictions (rows: true class, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print('Confusion matrix:', cm, sep='\n')

In [None]:
# average=None returns one score per class instead of a single aggregate.
print('Precision:', precision_score(y_test, predictions, average=None), sep='\n')
print('Recall:', recall_score(y_test, predictions, average=None), sep='\n')

In [None]:
# Render the KNN confusion matrix as a heat-map.
# The plot object is deliberately NOT called `display`: that name is the
# IPython display() helper available in every notebook, and rebinding it would
# shadow the helper for all later cells.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()
plt.show()

Here's where I used a decision tree classifier.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, confusion_matrix

In [None]:
# Decision tree trained on the blob training split.
# random_state is pinned so the fitted tree, and therefore the scores reported
# below, are the same on every run.
DTC = DecisionTreeClassifier(random_state=0)
DTC.fit(X_train, y_train)

In [None]:
# Predict the held-out samples with the decision tree and show its accuracy.
# NOTE(review): this rebinds `y_pred`, which earlier held the k-means cluster
# labels; re-running the clustering cells after this one would use tree output.
y_pred = DTC.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the decision-tree predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Report decision-tree accuracy and macro-averaged recall to two decimals.
print("Accuracy score : {:.2f}".format(accuracy_score(y_test, y_pred)))
# zero_division is given as the numeric value 1 rather than the bool True:
# a class with no true samples then contributes a recall of 1 instead of
# triggering a warning.
print(
    "Recall score : {:.2f}".format(
        recall_score(y_test, y_pred, average='macro', zero_division=1)
    )
)

Here's where the SVM and count vectorization trial went.

In [None]:
# Read spamData.csv again under its own name for the SVM section (the same file
# is also loaded into `data` near the top of the notebook).
svm = pd.read_csv('spamData.csv')

In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of `svm`.
svm.info()

In [None]:
# (rows, columns) of the spam frame.
svm.shape

In [None]:
# Preview the first rows of the spam frame.
svm.head()

In [None]:
# Count missing values per column.
svm.isna().sum()

In [None]:
# Pie chart of how many messages fall into each Category value.
# An explicit figure/axes pair is used and the cell ends with plt.show(), like
# the other plotting cells, so the cell no longer prints the Text(...) repr
# returned by the title call.
fig, ax = plt.subplots(figsize=(6, 6))
svm['Category'].value_counts().plot(kind='pie', autopct='%1.0f%%', ax=ax)
ax.set_title('Pie chart')
plt.show()

In [None]:
# Features are the values of the Message column, targets those of Category.
# NOTE(review): `y` previously held the make_blobs labels and is rebound here.
x, y = svm['Message'].values, svm['Category'].values

In [None]:
# Hold out 20% of the messages for testing; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
cv = CountVectorizer() 
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

In [None]:
# Support-vector classifier trained on the count-vectorised training split.
# SVC is imported in the notebook's top import cell rather than mid-notebook.
model = SVC(random_state=0)
model.fit(x_train, y_train)

In [None]:
# Score the SVM on the vectorised test split.
model.score(X=x_test, y=y_test)

Here I used a MultinomialNB (Naive Bayes) model and added a bit of my own testing.

In [None]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [None]:
# Load spamData.csv into its own frame for the Naive Bayes section and preview
# the first rows.
nbm = pd.read_csv("spamData.csv")
nbm.head()

In [None]:
# Descriptive statistics of the remaining columns, per Category value.
nbm.groupby(by="Category").describe()

In [None]:
# Binary target: 1 where Category equals "spam", 0 for every other value.
# A vectorised comparison replaces the row-by-row apply/lambda; the resulting
# 0/1 integers are the same.
nbm['spam'] = (nbm['Category'] == "spam").astype(int)
nbm.head()

In [None]:
# Drop the text label now that the numeric `spam` column exists.
# errors="ignore" keeps this cell re-runnable: `nbm` is rebound to its own
# output, so a second execution would otherwise raise KeyError because the
# Category column is already gone.
nbm = nbm.drop(columns=["Category"], errors="ignore")
nbm.head()

In [None]:
# Hold out 25% of the messages for testing.
# random_state is pinned, as in the other two splits in this notebook, so the
# Naive Bayes score below is reproducible instead of changing on every run.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which earlier
# held the blob split used by the KNN and decision-tree cells.
X_train, X_test, y_train, y_test = train_test_split(
    nbm.Message, nbm.spam, test_size=0.25, random_state=0
)

In [None]:
# Learn the vocabulary on the training messages and encode them as a
# document-term count matrix.
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# Preview the first three rows. The sparse matrix is sliced BEFORE converting
# to dense, so only three rows are materialised instead of the whole matrix.
X_train_count[:3].toarray()

In [None]:
# Multinomial Naive Bayes trained on the word-count features.
# NOTE(review): `model` previously referred to the SVC and is rebound here.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

In [None]:
# Three hand-written messages used as an informal check of the trained model.
email = [
    "Hiiiiiiii OMG lulz R U 2 Free 2nite?!;>",
    "Are you sure? He seems okay to me.",
    "WOW U can win 2 4 the price of 1!!!!"
]
# Encode with the already-fitted vectorizer (transform only, no re-fitting) so
# the columns line up with the vocabulary the model was trained on.
email_count = v.transform(email)
# Predicted label per message; 1 means spam under the encoding of the `spam`
# column created above.
model.predict(email_count)

In [None]:
# Encode the held-out messages with the fitted vocabulary, then score the
# Naive Bayes model on them.
X_test_count = v.transform(X_test)
model.score(X=X_test_count, y=y_test)