# K-Nearest Neighbors (K-NN)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset
* 從三個資料夾中讀取資料

In [2]:
# Collect every "Subject:" line found in the three corpus folders together
# with its class label (1 = spam, 0 = ham).
all_data = []
work_path = r'./D20_KNN 實作_spam_data/'
paths = [os.path.join(work_path, folder) for folder in [r'spam', r'easy_ham', r'hard_ham']]
for path in paths:
    # Derive the label from the corpus folder name only. Testing the whole file
    # path for "ham" would label every sample as ham as soon as a parent
    # directory (e.g. a user name) happens to contain that substring.
    label = 0 if "ham" in os.path.basename(os.path.normpath(path)) else 1
    # glob returns matches in arbitrary order; sorting keeps the sample order
    # (and therefore the seeded train/test split) the same on every machine.
    for fn in sorted(glob.glob(os.path.join(path, "*"))):
        # codecs.open with errors='ignore' drops bytes that are not valid UTF-8
        # instead of raising a decoding error.
        with codecs.open(fn, encoding='utf-8', errors='ignore') as file:
            for line in file:
                # Keep lines that start with "Subject:".
                # NOTE(review): every such line in the file is collected, so a
                # message whose body also has lines starting with "Subject:"
                # (e.g. quoted mail) yields more than one row - confirm intended.
                if line.startswith("Subject:"):
                    subject = re.sub(r"^Subject:", "", line).strip()
                    all_data.append([subject, label])
# np.array stores the mixed [str, int] rows as strings, so labels become '0'/'1'.
all_data = np.array(all_data)
print(all_data)

[['Life Insurance - Why Pay More?' '1']
 ['[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206' '1']
 ['Guaranteed to lose 10-12 lbs in 30 days                          11.150'
  '1']
 ...
 ['NEWS: GNU/DEVELOPMENT... intl orgns take a close look at GNU/Linux'
  '0']
 ['Attn programmers: support offered [FLOSS-Sarai Initiative]' '0']
 ['(SPAM? 08.00) lists.sourceforge.net mailing list memberships reminder'
  '0']]


### 取出訓練內文與標註

In [3]:
# Column 0 of all_data is the subject text, column 1 the label (stored as a
# string, hence the cast to an unsigned 8-bit integer).
X, Y = all_data[:, 0], all_data[:, 1].astype('uint8')
# Preview the first five samples and their labels.
print('Training Data Examples : \n{}'.format(X[0:5]))
print('Labeling Data Examples : \n{}'.format(Y[0:5]))

Training Data Examples : 
['Life Insurance - Why Pay More?'
 '[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206'
 'Guaranteed to lose 10-12 lbs in 30 days                          11.150'
 'Re: Fw: User Name & Password to Membership To 5 Sites zzzz@example.com pviqg'
 '[ILUG-Social] re: Guaranteed to lose 10-12 lbs in 30 days 10.148']
Labeling Data Examples : 
[1 1 1 1 1]


### 文字預處理
* 細節可以參考前面章節

In [4]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

import nltk
# Download every NLTK resource this notebook uses, not only the stop-word list:
# nltk.word_tokenize needs the Punkt models, nltk.pos_tag needs the perceptron
# tagger and WordNetLemmatizer needs WordNet. On a fresh environment the missing
# resources would otherwise raise LookupError when clean_content runs.
# NOTE(review): 'punkt_tab', 'averaged_perceptron_tagger_eng' and 'omw-1.4' are
# the ids that some NLTK releases look up instead of / in addition to the
# classic ones - confirm against the installed NLTK version and trim if unneeded.
for nltk_resource in ('stopwords',
                      'punkt', 'punkt_tab',
                      'wordnet', 'omw-1.4',
                      'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(nltk_resource)

# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

## Create the lemmatizer (shared by clean_content below)
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    """Map the pos_tag result for `word` to the POS constant the lemmatizer expects.

    The word is tagged on its own (as a one-token sentence); only the first
    letter of the returned tag is used. Tags starting with J/N/V/R map to
    adjective/noun/verb/adverb, anything else falls back to noun.
    """
    pos_by_initial = {"J": wordnet.ADJ,
                      "N": wordnet.NOUN,
                      "V": wordnet.VERB,
                      "R": wordnet.ADV}
    # pos_tag returns a list of (token, tag) pairs; take the tag of the only token.
    _token, tag_name = nltk.pos_tag([word])[0]
    initial = tag_name[0].upper()
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN


def clean_content(X):
    """Normalise a collection of raw texts for bag-of-words vectorisation.

    For each text: replace every non-letter character with a space, lower-case
    it, tokenize it, drop English stop words and lemmatize the remaining tokens
    using the POS returned by get_wordnet_pos.

    Args:
        X: iterable of strings.

    Returns:
        A list with one space-joined string of lemmas per input text.
    """
    english_stop_words = set(stopwords.words('english'))
    X_output = []
    for raw_text in X:
        # keep letters only, then lower-case
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()
        # tokenize, filter stop words and lemmatize in one pass
        lemmas = [
            lemmatizer.lemmatize(token, get_wordnet_pos(token))
            for token in nltk.word_tokenize(letters_only)
            if token not in english_stop_words
        ]
        X_output.append(' '.join(lemmas))
    return X_output

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Normalise every subject line with clean_content (letters only, lower-case,
# stop words removed, lemmatized). Note that this rebinds X from the raw
# subjects to the cleaned strings.
X = clean_content(X)

### Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# max_features is the number of columns (vocabulary terms) to build; terms are
# selected by how frequently they occur. 1500 has no special meaning - try
# other values or remove the limit.
cv = CountVectorizer(max_features=1500)
# NOTE(review): the vocabulary is fitted on all samples, i.e. before the
# train/test split performed later in the notebook.
X_counts = cv.fit_transform(X)
# Convert the fitted count matrix to a dense array; X is rebound from the
# cleaned strings to the count features.
X = X_counts.toarray()
# In the recorded run this printed (3423, 1500): 3423 samples, each
# represented by 1500 dimensions.
print(X.shape)

(3423, 1500)


## 將資料拆成 train/test set

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set. random_state is fixed so that
# every student gets the same result; it can be removed in normal use.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

## Training the K-NN model on the Training set

In [8]:
from sklearn.neighbors import KNeighborsClassifier
# K-NN classifier created with its default hyper-parameters and fitted on the
# training split only.
classifier = KNeighborsClassifier()
classifier.fit(X_train, Y_train)

KNeighborsClassifier()

## 測試 train set / test set 的 Accuracy

In [9]:
# Report the mean accuracy of the fitted classifier on both splits.
# The splits are listed explicitly instead of being looked up by name through
# globals(): a constructed-name lookup breaks silently-at-runtime if a variable
# is renamed or the code is moved into a function, and hides the dependency on
# X_train/Y_train/X_test/Y_test from readers and linters.
for split_name, split_X, split_Y in (('train', X_train, Y_train),
                                     ('test', X_test, Y_test)):
    print(f'{split_name.capitalize()}set Accuracy: {classifier.score(split_X, split_Y)}')

Trainset Accuracy: 0.9086924762600438
Testset Accuracy: 0.8759124087591241


## 獲得 testset 上的結果

In [10]:
# Predicted labels for the held-out test split, used for the confusion matrix below.
Y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the test-set predictions. Per scikit-learn's convention
# rows are the true labels and columns the predicted labels; in this notebook
# label 0 is ham and label 1 is spam.
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
# Last expression of the cell, so the overall test accuracy is displayed as the
# cell output.
accuracy_score(Y_test, Y_pred)

[[579   8]
 [ 77  21]]


0.8759124087591241

### 運用K-fold尋找適合K值

In [12]:
# Applying k-Fold Cross Validation

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = list(range(3, 38, 2))  ## odd K values from 3 to 37; feel free to try others

for k in n_neighbors:
    # Use a dedicated name for the candidate model: rebinding `classifier` here
    # would replace the model fitted earlier in the notebook with an unfitted
    # one, so re-running the evaluation cells afterwards would fail.
    knn_candidate = KNeighborsClassifier(n_neighbors = k, metric = 'minkowski', p = 2)
    # cv = 10 splits the data into 10 folds; n_jobs=-1 uses all CPU cores.
    # NOTE(review): X / Y are the full dataset, so the rows held out in X_test
    # also take part in choosing K - consider X_train / Y_train instead.
    accuracies = cross_val_score(estimator = knn_candidate, X = X, y = Y, cv = 10, n_jobs=-1)

    print('設置K值:{}'.format(k))
    print('Average Accuracy: {}'.format(accuracies.mean()))
    print('Accuracy STD: {}'.format(accuracies.std()))

設置K值:3
Average Accuracy: 0.44366954801970915
Accuracy STD: 0.14897718005000501
設置K值:5
Average Accuracy: 0.8700066492762518
Accuracy STD: 0.014046083212433221
設置K值:7
Average Accuracy: 0.8670835251393791
Accuracy STD: 0.013823120027190177
設置K值:9
Average Accuracy: 0.8659147869674186
Accuracy STD: 0.011011851451384402
設置K值:11
Average Accuracy: 0.8659088196682181
Accuracy STD: 0.0052519830617875
設置K值:13
Average Accuracy: 0.8626932978705266
Accuracy STD: 0.005064855493576022
設置K值:15
Average Accuracy: 0.8612321620377473
Accuracy STD: 0.003993216319404961
設置K值:17
Average Accuracy: 0.859187083354645
Accuracy STD: 0.0022263371686399537
設置K值:19
Average Accuracy: 0.858312447786132
Accuracy STD: 0.003707730467409879
設置K值:21
Average Accuracy: 0.8562699265169726
Accuracy STD: 0.004169142892853119
設置K值:23
Average Accuracy: 0.8556842787240209
Accuracy STD: 0.0034123660055233166
設置K值:25
Average Accuracy: 0.8545146880807459
Accuracy STD: 0.0021027758039962948
設置K值:27
Average Accuracy: 0.8539307452304229
