In [48]:
import os
import tarfile
import urllib.request
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

download_root = "http://spamassassin.apache.org/old/publiccorpus/"
ham_url = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
spam_url = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
spam_path = os.path.join("datasets", "spam")
if not os.path.isdir(spam_path):
    os.makedirs(spam_path)
for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
    path = os.path.join(spam_path, filename)
    if not os.path.isfile(path):
        urllib.request.urlretrieve(url, path)
    tar_bz2_file = tarfile.open(path)
    tar_bz2_file.extractall(path=spam_path)
    tar_bz2_file.close()
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [3]:
len(ham_filenames), len(spam_filenames)

(2500, 500)

In [6]:
import email
import email.policy

def load_email(is_spam, filename):
    global spam_path
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [7]:
ham_email = [load_email(False, i) for i in ham_filenames]
spam_email = [load_email(True, i) for i in spam_filenames]

In [26]:
words = set()
with open("google-10000-english-usa(1).txt", "r") as f:
    word = f.readline()
    while word:
        words.add(word.strip())
        word = f.readline()
len(words)

9989

In [35]:
ham_email_v = np.array([[1 if i in mail else 0 for i in words] for mail in ham_email])
spam_email_v = np.array([[1 if i in mail else 0 for i in words] for mail in spam_email])
ham_email_v.shape, spam_email_v.shape

((2500, 9989), (500, 9989))

In [47]:
X_train, y_train, X_test, y_test = np.concatenate((ham_email_v[:2000], spam_email_v[:400])), np.concatenate((np.array([0]*2000), np.array([1]*400))), np.concatenate((ham_email_v[2000:], spam_email_v[400:])), np.concatenate((np.array([0]*500), np.array([1]*100)))
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2400, 9989), (2400,), (600, 9989), (600,))

In [50]:
cv = GridSearchCV(KNeighborsClassifier(), 
                  verbose=3, 
                  param_grid={"weights":["uniform", "distance"], "n_neighbors": list(range(2, 10))},
)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ....n_neighbors=2, weights=uniform;, score=0.806 total time=   0.4s
[CV 2/5] END ....n_neighbors=2, weights=uniform;, score=0.833 total time=   0.5s
[CV 3/5] END ....n_neighbors=2, weights=uniform;, score=0.833 total time=   0.4s
[CV 4/5] END ....n_neighbors=2, weights=uniform;, score=0.844 total time=   0.5s
[CV 5/5] END ....n_neighbors=2, weights=uniform;, score=0.838 total time=   0.5s
[CV 1/5] END ...n_neighbors=2, weights=distance;, score=0.806 total time=   0.4s
[CV 2/5] END ...n_neighbors=2, weights=distance;, score=0.833 total time=   0.4s
[CV 3/5] END ...n_neighbors=2, weights=distance;, score=0.833 total time=   0.4s
[CV 4/5] END ...n_neighbors=2, weights=distance;, score=0.844 total time=   0.4s
[CV 5/5] END ...n_neighbors=2, weights=distance;, score=0.838 total time=   0.4s
[CV 1/5] END ....n_neighbors=3, weights=uniform;, score=0.806 total time=   0.4s
[CV 2/5] END ....n_neighbors=3, weights=uniform;

In [51]:
cv.best_params_, cv.best_score_

({'n_neighbors': 3, 'weights': 'uniform'}, 0.8320833333333333)