In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('cleaned_spam_dataset.csv')

# Convert "label" column to bool type
# NOTE(review): astype(bool) is only meaningful for numeric 0/1 labels; every
# non-empty string (e.g. 'ham' and 'spam' alike) would become True -- confirm
# how the column is encoded in the CSV.
df['label'] = df['label'].astype(bool)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Unfitted building blocks of the text-classification pipeline
tfidf_vectorizer = TfidfVectorizer()
knn_classifier = KNeighborsClassifier()

# Chain vectorizer and classifier so that, inside cross-validation, TF-IDF is
# fitted on each training fold only. Vectorizing all of X_train up front would
# let the vocabulary and IDF weights see the validation folds and bias the
# scores used to choose 'k'.
text_knn_pipeline = Pipeline([('tfidf', tfidf_vectorizer), ('knn', knn_classifier)])

# Define a parameter grid to search for the best 'k' value
# (the 'knn__' prefix routes the parameter to the pipeline's 'knn' step)
param_grid = {'knn__n_neighbors': [3, 5, 7, 9, 11]}

# Create a GridSearchCV object
grid_search = GridSearchCV(text_knn_pipeline, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the raw training text
grid_search.fit(X_train, y_train)

# Get the best 'k' value
best_k = grid_search.best_params_['knn__n_neighbors']

# With the default refit=True, GridSearchCV has already retrained the best
# pipeline on the full training set, so reuse it instead of fitting again.
best_pipeline = grid_search.best_estimator_
tfidf_vectorizer = best_pipeline.named_steps['tfidf']    # now fitted on X_train
best_knn_classifier = best_pipeline.named_steps['knn']   # fitted with the best 'k'

# TF-IDF features from the fitted vectorizer (kept available for later cells)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Make predictions on the test set
y_pred = best_knn_classifier.predict(X_test_tfidf)

# Print the accuracy of the tuned model
accuracy = accuracy_score(y_test, y_pred)

print(f"Best 'k' value: {best_k}")
print(f"Accuracy on the test set: {accuracy}")


Best 'k' value: 3
Accuracy on the test set: 0.8145627559624187
